diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs
index a7911689..2fe6fd8d 100644
--- a/src/build_tables/build_parse_table.rs
+++ b/src/build_tables/build_parse_table.rs
@@ -7,7 +7,8 @@ use crate::tables::{
     AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
 };
 use core::ops::Range;
-use std::collections::hash_map::Entry;
+use std::hash::Hasher;
+use std::collections::hash_map::{Entry, DefaultHasher};
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::fmt::Write;
@@ -44,14 +45,13 @@ impl<'a> ParseTableBuilder<'a> {
         self.parse_table.alias_sequences.push(Vec::new());

         // Ensure that the error state has index 0.
-        let error_state_id =
-            self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
+        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
         self.add_parse_state(
             &Vec::new(),
             &Vec::new(),
             ParseItemSet::with(
-                [(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]
+                [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
                     .iter()
                     .cloned(),
             ),
         );
@@ -78,6 +78,10 @@ impl<'a> ParseTableBuilder<'a> {
             }
         }

+        let mut hasher = DefaultHasher::new();
+        item_set.hash_unfinished_items(&mut hasher);
+        let unfinished_item_signature = hasher.finish();
+
         match self.state_ids_by_item_set.entry(item_set) {
             Entry::Occupied(o) => *o.get(),
             Entry::Vacant(v) => {
@@ -87,6 +91,7 @@
                     lex_state_id: 0,
                     terminal_entries: HashMap::new(),
                     nonterminal_entries: HashMap::new(),
+                    unfinished_item_signature,
                 });
                 self.parse_state_queue.push_back(ParseStateQueueEntry {
                     state_id,
diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs
new file mode 100644
index 00000000..10707489
--- /dev/null
+++ b/src/build_tables/coincident_tokens.rs
@@ -0,0 +1,36 @@
+use crate::rules::Symbol;
+use crate::tables::{ParseStateId, ParseTable};
+use std::collections::{HashMap, HashSet};
+
+pub(crate) struct CoincidentTokenIndex {
+    entries: HashMap<(Symbol, Symbol), HashSet<ParseStateId>>,
+    empty: HashSet<ParseStateId>,
+}
+
+impl CoincidentTokenIndex {
+    pub fn new(table: &ParseTable) -> Self {
+        let mut entries = HashMap::new();
+        for (i, state) in table.states.iter().enumerate() {
+            for symbol in state.terminal_entries.keys() {
+                for other_symbol in state.terminal_entries.keys() {
+                    entries
+                        .entry((*symbol, *other_symbol))
+                        .or_insert(HashSet::new())
+                        .insert(i);
+                }
+            }
+        }
+        Self {
+            entries,
+            empty: HashSet::new(),
+        }
+    }
+
+    pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet<ParseStateId> {
+        self.entries.get(&(a, b)).unwrap_or(&self.empty)
+    }
+
+    pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
+        self.entries.contains_key(&(a, b))
+    }
+}
diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs
index 28723d24..4cd2f643 100644
--- a/src/build_tables/item.rs
+++ b/src/build_tables/item.rs
@@ -2,11 +2,11 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}
 use crate::rules::Associativity;
 use crate::rules::{Symbol, SymbolType};
 use smallbitvec::SmallBitVec;
+use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::fmt;
 use std::hash::{Hash, Hasher};
 use std::u32;
-use std::cmp::Ordering;

 lazy_static! {
     static ref START_PRODUCTION: Production = Production {
@@ -85,10 +85,10 @@ impl LookaheadSet {
             .chain(if self.eof { Some(Symbol::end()) } else { None })
     }

-    pub fn with<'a>(symbols: impl IntoIterator<Item = &'a Symbol>) -> Self {
+    pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
         let mut result = Self::new();
         for symbol in symbols {
-            result.insert(*symbol);
+            result.insert(symbol);
         }
         result
     }
@@ -219,6 +219,21 @@ impl<'a> ParseItemSet<'a> {
         result
     }

+    pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
+        let mut previous_variable_index = u32::MAX;
+        let mut previous_step_index = u32::MAX;
+        for item in self.entries.keys() {
+            if item.step().is_some()
+                && (item.variable_index != previous_variable_index
+                    || item.step_index != previous_step_index) {
+                h.write_u32(item.variable_index);
+                h.write_u32(item.step_index);
+                previous_variable_index = item.variable_index;
+                previous_step_index = item.step_index;
+            }
+        }
+    }
+
     pub fn display_with(
         &'a self,
         syntax_grammar: &'a SyntaxGrammar,
@@ -369,11 +384,18 @@ impl<'a> Ord for ParseItem<'a> {
         if o != Ordering::Equal {
             return o;
         }
-        let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence);
+        let o = self
+            .production
+            .dynamic_precedence
+            .cmp(&other.production.dynamic_precedence);
         if o != Ordering::Equal {
             return o;
         }
-        let o = self.production.steps.len().cmp(&other.production.steps.len());
+        let o = self
+            .production
+            .steps
+            .len()
+            .cmp(&other.production.steps.len());
         if o != Ordering::Equal {
             return o;
         }
diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs
index d1983068..665c56a0 100644
--- a/src/build_tables/mod.rs
+++ b/src/build_tables/mod.rs
@@ -1,18 +1,20 @@
-use crate::error::Result;
-use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use crate::rules::{AliasMap, Symbol};
-use crate::tables::{LexTable, ParseTable};
-
 mod build_parse_table;
+mod coincident_tokens;
 mod item;
 mod item_set_builder;
 mod lex_table_builder;
 mod shrink_parse_table;
-mod token_conflict_map;

 use self::build_parse_table::build_parse_table;
+use self::coincident_tokens::CoincidentTokenIndex;
+use self::item::LookaheadSet;
 use self::shrink_parse_table::shrink_parse_table;
-use self::token_conflict_map::TokenConflictMap;
+mod token_conflicts;
+use self::token_conflicts::TokenConflictMap;
+use crate::error::Result;
+use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
+use crate::rules::{AliasMap, Symbol};
+use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};

 pub(crate) fn build_tables(
     syntax_grammar: &SyntaxGrammar,
@@ -23,6 +25,76 @@
     let (mut parse_table, following_tokens) =
         build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
     let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
-    shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases);
+    let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
+    populate_error_state(
+        &mut parse_table,
+        syntax_grammar,
+        lexical_grammar,
+        &coincident_token_index,
+        &token_conflict_map,
+    );
+    shrink_parse_table(
+        &mut parse_table,
+        syntax_grammar,
+        simple_aliases,
+        &token_conflict_map,
+    );
     Ok((parse_table, LexTable::default(), LexTable::default(), None))
 }
+
+fn populate_error_state(
+    parse_table: &mut ParseTable,
+    syntax_grammar: &SyntaxGrammar,
+    lexical_grammar: &LexicalGrammar,
+    coincident_token_index: &CoincidentTokenIndex,
+    token_conflict_map: &TokenConflictMap,
+) {
+    let state = &mut parse_table.states[0];
+    let n = lexical_grammar.variables.len();
+    let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
+        let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
+            j != i
+                && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
+                && token_conflict_map.does_conflict(i, j)
+        });
+        if conflicts_with_other_tokens {
+            None
+        } else {
+            Some(Symbol::terminal(i))
+        }
+    }));
+
+    let recover_entry = ParseTableEntry {
+        reusable: false,
+        actions: vec![ParseAction::Recover],
+    };
+
+    for i in 0..n {
+        let symbol = Symbol::terminal(i);
+        let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
+            || conflict_free_tokens.iter().all(|t| {
+                coincident_token_index.contains(symbol, t)
+                    || !token_conflict_map.does_conflict(i, t.index)
+            });
+        if can_be_used_for_recovery {
+            eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
+            state
+                .terminal_entries
+                .entry(symbol)
+                .or_insert_with(|| recover_entry.clone());
+        } else {
+            eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
+        }
+    }
+
+    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
+        if external_token.corresponding_internal_token.is_none() {
+            state
+                .terminal_entries
+                .entry(Symbol::external(i))
+                .or_insert_with(|| recover_entry.clone());
+        }
+    }
+
+    state.terminal_entries.insert(Symbol::end(), recover_entry);
+}
diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs
index 8e826f5c..026c3058 100644
--- a/src/build_tables/shrink_parse_table.rs
+++ b/src/build_tables/shrink_parse_table.rs
@@ -1,14 +1,17 @@
+use super::token_conflicts::TokenConflictMap;
 use crate::grammars::{SyntaxGrammar, VariableType};
-use crate::rules::AliasMap;
-use crate::tables::{ParseAction, ParseTable};
+use crate::rules::{AliasMap, Symbol};
+use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
 use std::collections::{HashMap, HashSet};

 pub(crate) fn shrink_parse_table(
     parse_table: &mut ParseTable,
     syntax_grammar: &SyntaxGrammar,
     simple_aliases: &AliasMap,
+    token_conflict_map: &TokenConflictMap,
 ) {
     remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
+    merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
     remove_unused_states(parse_table);
 }

@@ -86,6 +89,157 @@ fn remove_unit_reductions(
     }
 }

+fn merge_compatible_states(
+    parse_table: &mut ParseTable,
+    syntax_grammar: &SyntaxGrammar,
+    token_conflict_map: &TokenConflictMap,
+) {
+    let mut state_ids_by_signature = HashMap::new();
+    for (i, state) in parse_table.states.iter().enumerate() {
+        state_ids_by_signature
+            .entry(state.unfinished_item_signature)
+            .or_insert(Vec::new())
+            .push(i);
+    }
+
+    let mut deleted_states = HashSet::new();
+    loop {
+        let mut state_replacements = HashMap::new();
+        for (_, state_ids) in &state_ids_by_signature {
+            for i in state_ids {
+                for j in state_ids {
+                    if j == i {
+                        break;
+                    }
+                    if deleted_states.contains(j) || deleted_states.contains(i) {
+                        continue;
+                    }
+                    if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
+                        deleted_states.insert(*i);
+                        state_replacements.insert(*i, *j);
+                    }
+                }
+            }
+        }
+
+        if state_replacements.is_empty() {
+            break;
+        }
+
+        for state in parse_table.states.iter_mut() {
+            state.update_referenced_states(|other_state_id, _| {
+                *state_replacements
+                    .get(&other_state_id)
+                    .unwrap_or(&other_state_id)
+            });
+        }
+    }
+}
+
+fn merge_parse_state(
+    syntax_grammar: &SyntaxGrammar,
+    token_conflict_map: &TokenConflictMap,
+    parse_table: &mut ParseTable,
+    left: usize,
+    right: usize,
+) -> bool {
+    let left_state = &parse_table.states[left];
+    let right_state = &parse_table.states[right];
+
+    if left_state.nonterminal_entries != right_state.nonterminal_entries {
+        return false;
+    }
+
+    for (symbol, left_entry) in &left_state.terminal_entries {
+        if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
+            if right_entry.actions != left_entry.actions {
+                return false;
+            }
+        } else if !can_add_entry_to_state(
+            syntax_grammar,
+            token_conflict_map,
+            right_state,
+            *symbol,
+            left_entry,
+        ) {
+            return false;
+        }
+    }
+
+    eprintln!("maybe merge {} {}", left, right);
+
+    let mut symbols_to_add = Vec::new();
+    for (symbol, right_entry) in &right_state.terminal_entries {
+        if !left_state.terminal_entries.contains_key(&symbol) {
+            if !can_add_entry_to_state(
+                syntax_grammar,
+                token_conflict_map,
+                left_state,
+                *symbol,
+                right_entry,
+            ) {
+                return false;
+            }
+            symbols_to_add.push(*symbol);
+        }
+    }
+
+    for symbol in symbols_to_add {
+        let entry = parse_table.states[right].terminal_entries[&symbol].clone();
+        parse_table.states[left]
+            .terminal_entries
+            .insert(symbol, entry);
+    }
+
+    true
+}
+
+fn can_add_entry_to_state(
+    syntax_grammar: &SyntaxGrammar,
+    token_conflict_map: &TokenConflictMap,
+    state: &ParseState,
+    token: Symbol,
+    entry: &ParseTableEntry,
+) -> bool {
+    // Do not add external tokens; they could conflict lexically with any of the state's
+    // existing lookahead tokens.
+    if token.is_external() {
+        return false;
+    }
+
+    // Only merge parse states by allowing existing reductions to happen
+    // with additional lookahead tokens. Do not alter parse states in ways
+    // that allow entirely new types of actions to happen.
+    if state.terminal_entries.iter().all(|(_, e)| e != entry) {
+        return false;
+    }
+    match entry.actions.last() {
+        Some(ParseAction::Reduce { .. }) => {}
+        _ => return false,
+    }
+
+    // Do not add tokens which are both internal and external. Their validity could
+    // influence the behavior of the external scanner.
+    if syntax_grammar
+        .external_tokens
+        .iter()
+        .any(|t| t.corresponding_internal_token == Some(token))
+    {
+        return false;
+    }
+
+    // Do not add a token if it conflicts with an existing token.
+    if token.is_terminal() {
+        for existing_token in state.terminal_entries.keys() {
+            if token_conflict_map.does_conflict(token.index, existing_token.index) {
+                return false;
+            }
+        }
+    }
+
+    true
+}
+
 fn remove_unused_states(parse_table: &mut ParseTable) {
     let mut state_usage_map = vec![false; parse_table.states.len()];
     for state in &parse_table.states {
diff --git a/src/build_tables/token_conflict_map.rs b/src/build_tables/token_conflicts.rs
similarity index 92%
rename from src/build_tables/token_conflict_map.rs
rename to src/build_tables/token_conflicts.rs
index 52c68cc7..09d5e97c 100644
--- a/src/build_tables/token_conflict_map.rs
+++ b/src/build_tables/token_conflicts.rs
@@ -8,6 +8,7 @@
 struct TokenConflictStatus {
     does_overlap: bool,
     does_match_valid_continuation: bool,
+    does_match_separators: bool,
     matches_same_string: bool,
 }

@@ -46,8 +47,9 @@ impl TokenConflictMap {
         self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
     }

-    pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool {
-        self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation
+    pub fn does_conflict(&self, i: usize, j: usize) -> bool {
+        let entry = &self.status_matrix[matrix_index(self.n, i, j)];
+        entry.does_match_valid_continuation || entry.does_match_separators
     }

     pub fn does_overlap(&self, i: usize, j: usize) -> bool {
@@ -207,10 +209,15 @@
         if chars.does_intersect(&following_chars[j]) {
             result.0.does_match_valid_continuation = true;
         }
+        if cursor.in_separator() {
+            result.0.does_match_separators = true;
+        }
     } else {
         result.1.does_overlap = true;
         if chars.does_intersect(&following_chars[i]) {
             result.1.does_match_valid_continuation = true;
+        } else {
+            result.1.does_match_separators = true;
         }
     }
 }
@@ -326,9 +333,9 @@ mod tests {
         let token_map = TokenConflictMap::new(
             &grammar,
             vec![
-                LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
-                LookaheadSet::with(&[Symbol::terminal(var("in"))]),
-                LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
+                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
+                LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
+                LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
             ],
         );

@@ -338,12 +345,12 @@

         // Depending on what character follows, the string "in" may be treated as part of an
         // `identifier` token.
-        assert!(token_map.does_match_valid_continuation(var("identifier"), var("in")));
+        assert!(token_map.does_conflict(var("identifier"), var("in")));

         // Depending on what character follows, the string "instanceof" may be treated as part of
         // an `identifier` token.
-        assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof")));
-        assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in")));
+        assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
+        assert!(token_map.does_conflict(var("instanceof"), var("in")));
     }

     fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
diff --git a/src/nfa.rs b/src/nfa.rs
index 738d1b40..ee39d178 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -86,15 +86,34 @@ impl CharacterSet {
     }

     pub fn add(self, other: &CharacterSet) -> Self {
-        if let CharacterSet::Include(other_chars) = other {
-            if let CharacterSet::Include(mut chars) = self {
-                chars.extend(other_chars);
-                chars.sort_unstable();
-                chars.dedup();
-                return CharacterSet::Include(chars);
-            }
+        match self {
+            CharacterSet::Include(mut chars) => match other {
+                CharacterSet::Include(other_chars) => {
+                    chars.extend(other_chars);
+                    chars.sort_unstable();
+                    chars.dedup();
+                    CharacterSet::Include(chars)
+                }
+                CharacterSet::Exclude(other_chars) => {
+                    let excluded_chars = other_chars
+                        .iter()
+                        .cloned()
+                        .filter(|c| !chars.contains(&c))
+                        .collect();
+                    CharacterSet::Exclude(excluded_chars)
+                }
+            },
+            CharacterSet::Exclude(mut chars) => match other {
+                CharacterSet::Include(other_chars) => {
+                    chars.retain(|c| !other_chars.contains(&c));
+                    CharacterSet::Exclude(chars)
+                }
+                CharacterSet::Exclude(other_chars) => {
+                    chars.retain(|c| other_chars.contains(&c));
+                    CharacterSet::Exclude(chars)
+                }
+            },
         }
-        panic!("Called add with a negated character set");
     }

     pub fn does_intersect(&self, other: &CharacterSet) -> bool {
@@ -458,6 +477,9 @@ mod tests {
                     (CharacterSet::empty().add_char('f'), 0, 4),
                 ],
                 vec![
+                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
+                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
+                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
                     (
                         CharacterSet::empty()
                             .add_range('a', 'c')
@@ -467,9 +489,6 @@ mod tests {
                         0,
                         vec![1],
                     ),
-                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
-                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
-                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
                 ],
             ),
         ];
diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs
index 2b7e7b4d..4ef17b27 100644
--- a/src/prepare_grammar/expand_tokens.rs
+++ b/src/prepare_grammar/expand_tokens.rs
@@ -164,12 +164,20 @@ impl NfaBuilder {
                 Err(Error::regex("Unicode character classes are not supported"))
             }
             Class::Perl(class) => {
-                self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
+                let mut chars = self.expand_perl_character_class(&class.kind);
+                if class.negated {
+                    chars = chars.negate();
+                }
+                self.push_advance(chars, next_state_id);
                 Ok(true)
             }
             Class::Bracketed(class) => match &class.kind {
                 ClassSet::Item(item) => {
-                    self.push_advance(self.expand_character_class(&item)?, next_state_id);
+                    let mut chars = self.expand_character_class(&item)?;
+                    if class.negated {
+                        chars = chars.negate();
+                    }
+                    self.push_advance(chars, next_state_id);
                     Ok(true)
                 }
                 ClassSet::BinaryOp(_) => Err(Error::regex(
diff --git a/src/tables.rs b/src/tables.rs
index 0815aac8..344c4816 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -37,6 +37,7 @@ pub(crate) struct ParseState {
     pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
     pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
     pub lex_state_id: usize,
+    pub unfinished_item_signature: u64,
 }

 #[derive(Debug, PartialEq, Eq)]
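Note on the CharacterSet::add change in src/nfa.rs above: the rewrite implements set union for both representations, following the identities Include(a) ∪ Include(b) = Include(a ∪ b), Include(a) ∪ Exclude(b) = Exclude(b \ a), Exclude(a) ∪ Include(b) = Exclude(a \ b), and Exclude(a) ∪ Exclude(b) = Exclude(a ∩ b). The standalone sketch below is a reduced model of that algebra; the `Set` enum is an illustrative stand-in, not the actual CharacterSet type from this diff.

    // Reduced model of the Include/Exclude union algebra used by
    // CharacterSet::add in src/nfa.rs. `Set` is an illustrative stand-in.
    #[derive(Debug, PartialEq)]
    enum Set {
        Include(Vec<char>), // exactly these characters
        Exclude(Vec<char>), // every character except these
    }

    impl Set {
        // Union of self and other, covering all four representation cases.
        fn add(self, other: &Set) -> Set {
            match self {
                Set::Include(mut chars) => match other {
                    // Include(a) ∪ Include(b) = Include(a ∪ b)
                    Set::Include(other_chars) => {
                        chars.extend(other_chars);
                        chars.sort_unstable();
                        chars.dedup();
                        Set::Include(chars)
                    }
                    // Include(a) ∪ Exclude(b) = Exclude(b \ a)
                    Set::Exclude(other_chars) => Set::Exclude(
                        other_chars
                            .iter()
                            .copied()
                            .filter(|c| !chars.contains(c))
                            .collect(),
                    ),
                },
                Set::Exclude(mut chars) => match other {
                    // Exclude(a) ∪ Include(b) = Exclude(a \ b)
                    Set::Include(other_chars) => {
                        chars.retain(|c| !other_chars.contains(c));
                        Set::Exclude(chars)
                    }
                    // Exclude(a) ∪ Exclude(b) = Exclude(a ∩ b)
                    Set::Exclude(other_chars) => {
                        chars.retain(|c| other_chars.contains(c));
                        Set::Exclude(chars)
                    }
                },
            }
        }
    }

    fn main() {
        // {a, b} ∪ (everything but {b, c}) leaves only 'c' excluded.
        assert_eq!(
            Set::Include(vec!['a', 'b']).add(&Set::Exclude(vec!['b', 'c'])),
            Set::Exclude(vec!['c'])
        );
        // Two complements union to the complement of the intersection.
        assert_eq!(
            Set::Exclude(vec!['a', 'b']).add(&Set::Exclude(vec!['b', 'c'])),
            Set::Exclude(vec!['b'])
        );
    }

The Exclude-biased results reflect that a union involving any complement set is itself a complement set, which is why the old panic on negated sets could be dropped.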