diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index e5f78921..21594253 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -1,10 +1,9 @@ use super::coincident_tokens::CoincidentTokenIndex; -use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::generate::dedup::split_state_id_groups; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor}; -use crate::generate::rules::Symbol; +use crate::generate::rules::{Symbol, TokenSet}; use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use log::info; use std::collections::hash_map::Entry; diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 4c242a74..30c714a2 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -1,11 +1,11 @@ -use super::item::{ParseItem, ParseItemSet, ParseItemSetCore, TokenSet}; +use super::item::{ParseItem, ParseItemSet, ParseItemSetCore}; use super::item_set_builder::ParseItemSetBuilder; use crate::error::{Error, Result}; use crate::generate::grammars::{ InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType, }; use crate::generate::node_types::VariableInfo; -use crate::generate::rules::{Associativity, Symbol, SymbolType}; +use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet}; use crate::generate::tables::{ FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, ProductionInfo, ProductionInfoId, diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index 5ee0144b..df712402 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -1,12 +1,9 @@ use 
crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; -use crate::generate::rules::Associativity; -use crate::generate::rules::{Symbol, SymbolType}; +use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet}; use lazy_static::lazy_static; -use smallbitvec::SmallBitVec; use std::cmp::Ordering; use std::fmt; use std::hash::{Hash, Hasher}; -use std::iter::FromIterator; use std::u32; lazy_static! { @@ -25,17 +22,6 @@ lazy_static! { }; } -// Because tokens are represented as small (~400 max) unsigned integers, -// sets of tokens can be efficiently represented as bit vectors with each -// index correspoding to a token, and each value representing whether or not -// the token is present in the set. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub(crate) struct TokenSet { - terminal_bits: SmallBitVec, - external_bits: SmallBitVec, - eof: bool, -} - #[derive(Clone, Copy, Debug)] pub(crate) struct ParseItem<'a> { pub variable_index: u32, @@ -72,148 +58,6 @@ pub(crate) struct ParseItemSetDisplay<'a>( pub &'a LexicalGrammar, ); -impl TokenSet { - pub fn new() -> Self { - Self { - terminal_bits: SmallBitVec::new(), - external_bits: SmallBitVec::new(), - eof: false, - } - } - - pub fn iter<'a>(&'a self) -> impl Iterator + 'a { - self.terminal_bits - .iter() - .enumerate() - .filter_map(|(i, value)| { - if value { - Some(Symbol::terminal(i)) - } else { - None - } - }) - .chain( - self.external_bits - .iter() - .enumerate() - .filter_map(|(i, value)| { - if value { - Some(Symbol::external(i)) - } else { - None - } - }), - ) - .chain(if self.eof { Some(Symbol::end()) } else { None }) - } - - pub fn terminals<'a>(&'a self) -> impl Iterator + 'a { - self.terminal_bits - .iter() - .enumerate() - .filter_map(|(i, value)| { - if value { - Some(Symbol::terminal(i)) - } else { - None - } - }) - } - - pub fn contains(&self, symbol: &Symbol) -> bool { - match symbol.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in 
a TokenSet"), - SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), - SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), - SymbolType::End => self.eof, - } - } - - pub fn contains_terminal(&self, index: usize) -> bool { - self.terminal_bits.get(index).unwrap_or(false) - } - - pub fn insert(&mut self, other: Symbol) { - let vec = match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), - SymbolType::Terminal => &mut self.terminal_bits, - SymbolType::External => &mut self.external_bits, - SymbolType::End => { - self.eof = true; - return; - } - }; - if other.index >= vec.len() { - vec.resize(other.index + 1, false); - } - vec.set(other.index, true); - } - - pub fn remove(&mut self, other: &Symbol) { - let vec = match other.kind { - SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), - SymbolType::Terminal => &mut self.terminal_bits, - SymbolType::External => &mut self.external_bits, - SymbolType::End => { - self.eof = false; - return; - } - }; - if other.index < vec.len() { - vec.set(other.index, false); - } - } - - pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool { - let mut result = false; - if other.terminal_bits.len() > self.terminal_bits.len() { - self.terminal_bits.resize(other.terminal_bits.len(), false); - } - for (i, element) in other.terminal_bits.iter().enumerate() { - if element { - result |= !self.terminal_bits[i]; - self.terminal_bits.set(i, element); - } - } - result - } - - fn insert_all_externals(&mut self, other: &TokenSet) -> bool { - let mut result = false; - if other.external_bits.len() > self.external_bits.len() { - self.external_bits.resize(other.external_bits.len(), false); - } - for (i, element) in other.external_bits.iter().enumerate() { - if element { - result |= !self.external_bits[i]; - self.external_bits.set(i, element); - } - } - result - } - - pub fn insert_all(&mut self, other: &TokenSet) -> bool 
{ - let mut result = false; - if other.eof { - result |= !self.eof; - self.eof = true; - } - result |= self.insert_all_terminals(other); - result |= self.insert_all_externals(other); - result - } -} - -impl FromIterator for TokenSet { - fn from_iter>(iter: T) -> Self { - let mut result = Self::new(); - for symbol in iter { - result.insert(symbol); - } - result - } -} - impl<'a> ParseItem<'a> { pub fn start() -> Self { ParseItem { diff --git a/cli/src/generate/build_tables/item_set_builder.rs b/cli/src/generate/build_tables/item_set_builder.rs index 3bed0492..ec8f368b 100644 --- a/cli/src/generate/build_tables/item_set_builder.rs +++ b/cli/src/generate/build_tables/item_set_builder.rs @@ -1,6 +1,6 @@ -use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet, TokenSetDisplay}; +use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay}; use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::generate::rules::{Symbol, SymbolType}; +use crate::generate::rules::{Symbol, SymbolType, TokenSet}; use std::collections::{HashMap, HashSet}; use std::fmt; diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index e0129514..db7e3961 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -1,8 +1,7 @@ -use super::item::TokenSet; use super::token_conflicts::TokenConflictMap; use crate::generate::dedup::split_state_id_groups; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; -use crate::generate::rules::{AliasMap, Symbol}; +use crate::generate::rules::{AliasMap, Symbol, TokenSet}; use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry}; use log::info; use std::collections::{HashMap, HashSet}; diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index 
78c26045..2c3f47fb 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -9,14 +9,13 @@ mod token_conflicts; use self::build_lex_table::build_lex_table; use self::build_parse_table::build_parse_table; use self::coincident_tokens::CoincidentTokenIndex; -use self::item::TokenSet; use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; use crate::error::Result; use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor}; use crate::generate::node_types::VariableInfo; -use crate::generate::rules::{AliasMap, Symbol, SymbolType}; +use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet}; use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; use log::info; diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 71a569e6..edb92108 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -1,6 +1,7 @@ -use crate::generate::build_tables::item::{TokenSet, TokenSetDisplay}; +use crate::generate::build_tables::item::{TokenSetDisplay}; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition}; +use crate::generate::rules::TokenSet; use std::collections::HashSet; use std::cmp::Ordering; use std::fmt; diff --git a/cli/src/generate/rules.rs b/cli/src/generate/rules.rs index 82655d26..f3e39ebf 100644 --- a/cli/src/generate/rules.rs +++ b/cli/src/generate/rules.rs @@ -1,4 +1,6 @@ +use smallbitvec::SmallBitVec; use std::collections::HashMap; +use std::iter::FromIterator; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { @@ -57,6 +59,17 @@ pub(crate) enum Rule { Seq(Vec), } +// Because tokens are represented as small (~400 max) unsigned 
integers, +// sets of tokens can be efficiently represented as bit vectors with each +// index corresponding to a token, and each value representing whether or not +// the token is present in the set. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct TokenSet { + terminal_bits: SmallBitVec, + external_bits: SmallBitVec, + eof: bool, +} + impl Rule { pub fn field(name: String, content: Rule) -> Self { add_metadata(content, move |params| { @@ -205,6 +218,152 @@ impl From for Rule { } } +impl TokenSet { + pub fn new() -> Self { + Self { + terminal_bits: SmallBitVec::new(), + external_bits: SmallBitVec::new(), + eof: false, + } + } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + .chain( + self.external_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::external(i)) + } else { + None + } + }), + ) + .chain(if self.eof { Some(Symbol::end()) } else { None }) + } + + pub fn terminals<'a>(&'a self) -> impl Iterator + 'a { + self.terminal_bits + .iter() + .enumerate() + .filter_map(|(i, value)| { + if value { + Some(Symbol::terminal(i)) + } else { + None + } + }) + } + + pub fn contains(&self, symbol: &Symbol) -> bool { + match symbol.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), + SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false), + SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false), + SymbolType::End => self.eof, + } + } + + pub fn contains_terminal(&self, index: usize) -> bool { + self.terminal_bits.get(index).unwrap_or(false) + } + + pub fn insert(&mut self, other: Symbol) { + let vec = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut 
self.external_bits, + SymbolType::End => { + self.eof = true; + return; + } + }; + if other.index >= vec.len() { + vec.resize(other.index + 1, false); + } + vec.set(other.index, true); + } + + pub fn remove(&mut self, other: &Symbol) { + let vec = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut self.external_bits, + SymbolType::End => { + self.eof = false; + return; + } + }; + if other.index < vec.len() { + vec.set(other.index, false); + } + } + + pub fn is_empty(&self) -> bool { + !self.eof && !self.terminal_bits.iter().any(|a| a) && !self.external_bits.iter().any(|a| a) + } + + pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool { + let mut result = false; + if other.terminal_bits.len() > self.terminal_bits.len() { + self.terminal_bits.resize(other.terminal_bits.len(), false); + } + for (i, element) in other.terminal_bits.iter().enumerate() { + if element { + result |= !self.terminal_bits[i]; + self.terminal_bits.set(i, element); + } + } + result + } + + fn insert_all_externals(&mut self, other: &TokenSet) -> bool { + let mut result = false; + if other.external_bits.len() > self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } + for (i, element) in other.external_bits.iter().enumerate() { + if element { + result |= !self.external_bits[i]; + self.external_bits.set(i, element); + } + } + result + } + + pub fn insert_all(&mut self, other: &TokenSet) -> bool { + let mut result = false; + if other.eof { + result |= !self.eof; + self.eof = true; + } + result |= self.insert_all_terminals(other); + result |= self.insert_all_externals(other); + result + } +} + +impl FromIterator for TokenSet { + fn from_iter>(iter: T) -> Self { + let mut result = Self::new(); + for symbol in iter { + result.insert(symbol); + } + result + } +} + fn add_metadata(input: Rule, f: T) -> Rule { match input { 
Rule::Metadata { rule, mut params } => {