From 001f8c8f55a2a9a4c14c522ff12fcf27ae04c1e1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 7 Jan 2019 08:39:47 -0800 Subject: [PATCH] Rename LookaheadSet -> TokenSet Also, replace non-standard `with` method with a `FromIterator` implementation. --- cli/src/build_tables/build_lex_table.rs | 32 ++-- cli/src/build_tables/build_parse_table.rs | 16 +- cli/src/build_tables/item.rs | 39 +++-- cli/src/build_tables/item_set_builder.rs | 34 ++-- cli/src/build_tables/minimize_parse_table.rs | 6 +- cli/src/build_tables/mod.rs | 170 ++++++++++--------- cli/src/build_tables/token_conflicts.rs | 18 +- 7 files changed, 165 insertions(+), 150 deletions(-) diff --git a/cli/src/build_tables/build_lex_table.rs b/cli/src/build_tables/build_lex_table.rs index bcc1bf3d..9fc8edc6 100644 --- a/cli/src/build_tables/build_lex_table.rs +++ b/cli/src/build_tables/build_lex_table.rs @@ -1,4 +1,4 @@ -use super::item::LookaheadSet; +use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; @@ -11,7 +11,7 @@ pub(crate) fn build_lex_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - keywords: &LookaheadSet, + keywords: &TokenSet, minimize: bool, ) -> (LexTable, LexTable) { let keyword_lex_table; @@ -25,19 +25,23 @@ pub(crate) fn build_lex_table( let mut builder = LexTableBuilder::new(lexical_grammar); for state in parse_table.states.iter_mut() { - let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { - if token.is_terminal() { - if keywords.contains(&token) { - syntax_grammar.word_token - } else { + let tokens = state + .terminal_entries + .keys() + .filter_map(|token| { + if token.is_terminal() { + if keywords.contains(&token) { + syntax_grammar.word_token + } else { + Some(*token) + } + } else if token.is_eof() { Some(*token) + } else { + None } - } else if token.is_eof() { - Some(*token) - } else { - None - } - })); + }) + .collect(); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -75,7 +79,7 @@ impl<'a> LexTableBuilder<'a> { } } - fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { + fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize { let mut eof_valid = false; let nfa_states = tokens .iter() diff --git a/cli/src/build_tables/build_parse_table.rs b/cli/src/build_tables/build_parse_table.rs index cda1d7ea..27baf146 100644 --- a/cli/src/build_tables/build_parse_table.rs +++ b/cli/src/build_tables/build_parse_table.rs @@ -1,4 +1,4 @@ -use super::item::{LookaheadSet, ParseItem, ParseItemSet}; +use super::item::{ParseItem, ParseItemSet, TokenSet}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; @@ -38,12 +38,12 @@ struct ParseTableBuilder<'a> { item_sets_by_state_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, - following_tokens: Vec, + following_tokens: Vec, state_ids_to_log: Vec, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result<(ParseTable, Vec)> { + fn build(mut self) -> Result<(ParseTable, Vec)> { // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); @@ -57,7 +57,7 @@ impl<'a> ParseTableBuilder<'a> { ParseItemSet::with( [( ParseItem::start(), - LookaheadSet::with([Symbol::end()].iter().cloned()), + [Symbol::end()].iter().cloned().collect(), )] .iter() .cloned(), @@ -174,7 +174,7 @@ impl<'a> ParseTableBuilder<'a> { .or_insert_with(|| ParseItemSet::default()) .entries .entry(successor) - .or_insert_with(|| LookaheadSet::new()) + .or_insert_with(|| TokenSet::new()) .insert_all(lookaheads); } else { terminal_successors @@ -182,7 +182,7 @@ impl<'a> ParseTableBuilder<'a> { .or_insert_with(|| ParseItemSet::default()) .entries .entry(successor) - .or_insert_with(|| LookaheadSet::new()) + .or_insert_with(|| TokenSet::new()) .insert_all(lookaheads); } } else { @@ -714,7 +714,7 @@ pub(crate) fn build_parse_table( lexical_grammar: &LexicalGrammar, inlines: &InlinedProductionMap, state_ids_to_log: Vec, -) -> Result<(ParseTable, Vec)> { +) -> Result<(ParseTable, Vec)> { ParseTableBuilder { syntax_grammar, lexical_grammar, @@ -729,7 +729,7 @@ pub(crate) fn build_parse_table( alias_sequences: Vec::new(), max_aliased_production_length: 0, }, - following_tokens: vec![LookaheadSet::new(); lexical_grammar.variables.len()], + following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()], } .build() } diff --git a/cli/src/build_tables/item.rs b/cli/src/build_tables/item.rs index bbd5bbfa..5d6edc2f 100644 --- a/cli/src/build_tables/item.rs +++ b/cli/src/build_tables/item.rs @@ -6,6 +6,7 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; +use std::iter::FromIterator; use std::u32; lazy_static! { @@ -24,7 +25,7 @@ lazy_static! { } #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub(crate) struct LookaheadSet { +pub(crate) struct TokenSet { terminal_bits: SmallBitVec, external_bits: SmallBitVec, eof: bool, @@ -39,7 +40,7 @@ pub(crate) struct ParseItem<'a> { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseItemSet<'a> { - pub entries: BTreeMap, LookaheadSet>, + pub entries: BTreeMap, TokenSet>, } pub(crate) struct ParseItemDisplay<'a>( @@ -48,7 +49,7 @@ pub(crate) struct ParseItemDisplay<'a>( pub &'a LexicalGrammar, ); -pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar); +pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar); #[allow(dead_code)] pub(crate) struct ParseItemSetDisplay<'a>( @@ -57,7 +58,7 @@ pub(crate) struct ParseItemSetDisplay<'a>( pub &'a LexicalGrammar, ); -impl LookaheadSet { +impl TokenSet { pub fn new() -> Self { Self { terminal_bits: SmallBitVec::new(), @@ -92,17 +93,9 @@ impl LookaheadSet { .chain(if self.eof { Some(Symbol::end()) } else { None }) } - pub fn with(symbols: impl IntoIterator) -> Self { - let mut result = Self::new(); - for symbol in symbols { - result.insert(symbol); - } - result - } - pub fn contains(&self, symbol: &Symbol) -> bool { match symbol.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), SymbolType::End => self.eof, @@ -111,7 +104,7 @@ impl LookaheadSet { pub fn insert(&mut self, other: Symbol) { let vec = match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), SymbolType::Terminal => &mut self.terminal_bits, SymbolType::External => &mut self.external_bits, SymbolType::End => { @@ -125,7 +118,7 @@ impl LookaheadSet { vec.set(other.index, true); } - pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + pub fn insert_all(&mut self, other: &TokenSet) -> bool { let mut result = false; if other.terminal_bits.len() > self.terminal_bits.len() { self.terminal_bits.resize(other.terminal_bits.len(), false); @@ -153,6 +146,16 @@ impl LookaheadSet { } } +impl FromIterator for TokenSet { + fn from_iter>(iter: T) -> Self { + let mut result = Self::new(); + for symbol in iter { + result.insert(symbol); + } + result + } +} + impl<'a> ParseItem<'a> { pub fn start() -> Self { ParseItem { @@ -204,7 +207,7 @@ impl<'a> ParseItem<'a> { } impl<'a> ParseItemSet<'a> { - pub fn with(elements: impl IntoIterator, LookaheadSet)>) -> Self { + pub fn with(elements: impl IntoIterator, TokenSet)>) -> Self { let mut result = Self::default(); for (item, lookaheads) in elements { result.entries.insert(item, lookaheads); @@ -296,7 +299,7 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } } -impl<'a> fmt::Display for LookaheadSetDisplay<'a> { +impl<'a> fmt::Display for TokenSetDisplay<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "[")?; for (i, symbol) in self.0.iter().enumerate() { @@ -328,7 +331,7 @@ impl<'a> fmt::Display for ParseItemSetDisplay<'a> { f, "{}\t{}", ParseItemDisplay(item, self.1, self.2), - LookaheadSetDisplay(lookaheads, self.1, self.2) + TokenSetDisplay(lookaheads, self.1, self.2) )?; } Ok(()) diff --git a/cli/src/build_tables/item_set_builder.rs b/cli/src/build_tables/item_set_builder.rs index 939d700c..fea3b4d1 100644 --- a/cli/src/build_tables/item_set_builder.rs +++ b/cli/src/build_tables/item_set_builder.rs @@ -1,4 +1,4 @@ -use super::item::{LookaheadSet, ParseItem, ParseItemDisplay, ParseItemSet}; +use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; use hashbrown::{HashMap, HashSet}; @@ -12,15 +12,15 @@ struct TransitiveClosureAddition<'a> { #[derive(Clone, Debug, PartialEq, Eq)] struct FollowSetInfo { - lookaheads: LookaheadSet, + lookaheads: TokenSet, propagates_lookaheads: bool, } pub(crate) struct ParseItemSetBuilder<'a> { syntax_grammar: &'a SyntaxGrammar, lexical_grammar: &'a LexicalGrammar, - first_sets: HashMap, - last_sets: HashMap, + first_sets: HashMap, + last_sets: HashMap, inlines: &'a InlinedProductionMap, transitive_closure_additions: Vec>>, } @@ -54,7 +54,7 @@ impl<'a> ParseItemSetBuilder<'a> { // terminal itself. for i in 0..lexical_grammar.variables.len() { let symbol = Symbol::terminal(i); - let mut set = LookaheadSet::new(); + let mut set = TokenSet::new(); set.insert(symbol); result.first_sets.insert(symbol, set.clone()); result.last_sets.insert(symbol, set); @@ -62,7 +62,7 @@ impl<'a> ParseItemSetBuilder<'a> { for i in 0..syntax_grammar.external_tokens.len() { let symbol = Symbol::external(i); - let mut set = LookaheadSet::new(); + let mut set = TokenSet::new(); set.insert(symbol); result.first_sets.insert(symbol, set.clone()); result.last_sets.insert(symbol, set); @@ -80,10 +80,7 @@ impl<'a> ParseItemSetBuilder<'a> { for i in 0..syntax_grammar.variables.len() { let symbol = Symbol::non_terminal(i); - let first_set = &mut result - .first_sets - .entry(symbol) - .or_insert(LookaheadSet::new()); + let first_set = &mut result.first_sets.entry(symbol).or_insert(TokenSet::new()); processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push(symbol); @@ -103,10 +100,7 @@ impl<'a> ParseItemSetBuilder<'a> { } // The LAST set is defined in a similar way to the FIRST set. - let last_set = &mut result - .last_sets - .entry(symbol) - .or_insert(LookaheadSet::new()); + let last_set = &mut result.last_sets.entry(symbol).or_insert(TokenSet::new()); processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push(symbol); @@ -148,7 +142,7 @@ impl<'a> ParseItemSetBuilder<'a> { // Again, rather than computing these additions recursively, we use an explicit // stack called `entries_to_process`. for i in 0..syntax_grammar.variables.len() { - let empty_lookaheads = LookaheadSet::new(); + let empty_lookaheads = TokenSet::new(); let mut entries_to_process = vec![(i, &empty_lookaheads, true)]; // First, build up a map whose keys are all of the non-terminals that can @@ -160,7 +154,7 @@ impl<'a> ParseItemSetBuilder<'a> { let existing_info = follow_set_info_by_non_terminal .entry(variable_index) .or_insert_with(|| FollowSetInfo { - lookaheads: LookaheadSet::new(), + lookaheads: TokenSet::new(), propagates_lookaheads: false, }); @@ -269,15 +263,15 @@ impl<'a> ParseItemSetBuilder<'a> { result } - pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet { + pub fn first_set(&self, symbol: &Symbol) -> &TokenSet { &self.first_sets[symbol] } - pub fn last_set(&self, symbol: &Symbol) -> &LookaheadSet { + pub fn last_set(&self, symbol: &Symbol) -> &TokenSet { &self.first_sets[symbol] } - fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) { + fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { let next_step = item.successor().step(); @@ -294,7 +288,7 @@ impl<'a> ParseItemSetBuilder<'a> { let lookaheads = set .entries .entry(addition.item) - .or_insert_with(|| LookaheadSet::new()); + .or_insert_with(|| TokenSet::new()); lookaheads.insert_all(&addition.info.lookaheads); if addition.info.propagates_lookaheads { lookaheads.insert_all(following_tokens); diff --git a/cli/src/build_tables/minimize_parse_table.rs b/cli/src/build_tables/minimize_parse_table.rs index 573bf974..d83e117f 100644 --- a/cli/src/build_tables/minimize_parse_table.rs +++ b/cli/src/build_tables/minimize_parse_table.rs @@ -1,4 +1,4 @@ -use super::item::LookaheadSet; +use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; @@ -10,7 +10,7 @@ pub(crate) fn minimize_parse_table( syntax_grammar: &SyntaxGrammar, simple_aliases: &AliasMap, token_conflict_map: &TokenConflictMap, - keywords: &LookaheadSet, + keywords: &TokenSet, ) { let mut minimizer = Minimizer { parse_table, @@ -28,7 +28,7 @@ struct Minimizer<'a> { parse_table: &'a mut ParseTable, syntax_grammar: &'a SyntaxGrammar, token_conflict_map: &'a TokenConflictMap<'a>, - keywords: &'a LookaheadSet, + keywords: &'a TokenSet, simple_aliases: &'a AliasMap, } diff --git a/cli/src/build_tables/mod.rs b/cli/src/build_tables/mod.rs index 04b750e3..c632aa7b 100644 --- a/cli/src/build_tables/mod.rs +++ b/cli/src/build_tables/mod.rs @@ -9,7 +9,7 @@ mod token_conflicts; use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; -use self::item::LookaheadSet; +use self::item::TokenSet; use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; @@ -44,11 +44,7 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); - mark_fragile_tokens( - &mut parse_table, - lexical_grammar, - &token_conflict_map, - ); + mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); if minimize { minimize_parse_table( &mut parse_table, @@ -85,22 +81,25 @@ fn populate_error_state( // First identify the *conflict-free tokens*: tokens that do not overlap with // any other token in any way. - let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| { - let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { - j != i - && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) - && token_conflict_map.does_conflict(i, j) - }); - if conflicts_with_other_tokens { - None - } else { - info!( - "error recovery - token {} has no conflicts", - lexical_grammar.variables[i].name - ); - Some(Symbol::terminal(i)) - } - })); + let conflict_free_tokens: TokenSet = (0..n) + .into_iter() + .filter_map(|i| { + let conflicts_with_other_tokens = (0..n).into_iter().any(|j| { + j != i + && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j)) + && token_conflict_map.does_conflict(i, j) + }); + if conflicts_with_other_tokens { + None + } else { + info!( + "error recovery - token {} has no conflicts", + lexical_grammar.variables[i].name + ); + Some(Symbol::terminal(i)) + } + }) + .collect(); let recover_entry = ParseTableEntry { reusable: false, @@ -153,9 +152,9 @@ fn identify_keywords( word_token: Option, token_conflict_map: &TokenConflictMap, coincident_token_index: &CoincidentTokenIndex, -) -> LookaheadSet { +) -> TokenSet { if word_token.is_none() { - return LookaheadSet::new(); + return TokenSet::new(); } let word_token = word_token.unwrap(); @@ -163,8 +162,11 @@ fn identify_keywords( // First find all of the candidate keyword tokens: tokens that start with // letters or underscore and can match the same string as a word token. - let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map( - |(i, variable)| { + let keywords: TokenSet = lexical_grammar + .variables + .iter() + .enumerate() + .filter_map(|(i, variable)| { cursor.reset(vec![variable.start_state]); if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) @@ -177,69 +179,75 @@ fn identify_keywords( } else { None } - }, - )); + }) + .collect(); // Exclude keyword candidates that shadow another keyword candidate. - let keywords = LookaheadSet::with(keywords.iter().filter(|token| { - for other_token in keywords.iter() { - if other_token != *token - && token_conflict_map.does_match_same_string(token.index, other_token.index) - { - info!( - "Keywords - exclude {} because it matches the same string as {}", - lexical_grammar.variables[token.index].name, - lexical_grammar.variables[other_token.index].name - ); - return false; + let keywords: TokenSet = keywords + .iter() + .filter(|token| { + for other_token in keywords.iter() { + if other_token != *token + && token_conflict_map.does_match_same_string(token.index, other_token.index) + { + info!( + "Keywords - exclude {} because it matches the same string as {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_token.index].name + ); + return false; + } } - } - true - })); + true + }) + .collect(); // Exclude keyword candidates for which substituting the keyword capture // token would introduce new lexical conflicts with other tokens. - let keywords = LookaheadSet::with(keywords.iter().filter(|token| { - for other_index in 0..lexical_grammar.variables.len() { - if keywords.contains(&Symbol::terminal(other_index)) { - continue; + let keywords = keywords + .iter() + .filter(|token| { + for other_index in 0..lexical_grammar.variables.len() { + if keywords.contains(&Symbol::terminal(other_index)) { + continue; + } + + // If the word token was already valid in every state containing + // this keyword candidate, then substituting the word token won't + // introduce any new lexical conflicts. + if coincident_token_index + .states_with(*token, Symbol::terminal(other_index)) + .iter() + .all(|state_id| { + parse_table.states[*state_id] + .terminal_entries + .contains_key(&word_token) + }) + { + continue; + } + + if !token_conflict_map.has_same_conflict_status( + token.index, + word_token.index, + other_index, + ) { + info!( + "Keywords - exclude {} because of conflict with {}", + lexical_grammar.variables[token.index].name, + lexical_grammar.variables[other_index].name + ); + return false; + } } - // If the word token was already valid in every state containing - // this keyword candidate, then substituting the word token won't - // introduce any new lexical conflicts. - if coincident_token_index - .states_with(*token, Symbol::terminal(other_index)) - .iter() - .all(|state_id| { - parse_table.states[*state_id] - .terminal_entries - .contains_key(&word_token) - }) - { - continue; - } - - if !token_conflict_map.has_same_conflict_status( - token.index, - word_token.index, - other_index, - ) { - info!( - "Keywords - exclude {} because of conflict with {}", - lexical_grammar.variables[token.index].name, - lexical_grammar.variables[other_index].name - ); - return false; - } - } - - info!( - "Keywords - include {}", - lexical_grammar.variables[token.index].name, - ); - true - })); + info!( + "Keywords - include {}", + lexical_grammar.variables[token.index].name, + ); + true + }) + .collect(); keywords } diff --git a/cli/src/build_tables/token_conflicts.rs b/cli/src/build_tables/token_conflicts.rs index cb2b6efe..7bb443a5 100644 --- a/cli/src/build_tables/token_conflicts.rs +++ b/cli/src/build_tables/token_conflicts.rs @@ -1,4 +1,4 @@ -use crate::build_tables::item::LookaheadSet; +use crate::build_tables::item::TokenSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor, NfaTransition}; use hashbrown::HashSet; @@ -22,7 +22,7 @@ pub(crate) struct TokenConflictMap<'a> { } impl<'a> TokenConflictMap<'a> { - pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { + pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec) -> Self { let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new()); let starting_chars = get_starting_chars(&mut cursor, grammar); let following_chars = get_following_chars(&starting_chars, following_tokens); @@ -141,7 +141,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec, - following_tokens: Vec, + following_tokens: Vec, ) -> Vec { following_tokens .into_iter() @@ -352,9 +352,15 @@ mod tests { let token_map = TokenConflictMap::new( &grammar, vec![ - LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), - LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()), - LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()), + [Symbol::terminal(var("identifier"))] + .iter() + .cloned() + .collect(), + [Symbol::terminal(var("in"))].iter().cloned().collect(), + [Symbol::terminal(var("identifier"))] + .iter() + .cloned() + .collect(), ], );