Implement parse state merging

Max Brunsfeld 2019-01-01 13:47:29 -08:00
parent c6b9e97c58
commit a46b8fcb46
9 changed files with 364 additions and 40 deletions
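Parse states now record a 64-bit hash of their unfinished items when they are created; shrink_parse_table uses that signature to bucket candidate states and merge the compatible ones. The error state is populated with Recover entries for tokens that cannot be lexically confused with their neighbors, and negated character classes are handled correctly in the NFA builder. In outline (a sketch of the new build_tables body, assembled from the mod.rs hunk below, error handling elided):

let (mut parse_table, following_tokens) =
    build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
populate_error_state(&mut parse_table, syntax_grammar, lexical_grammar,
                     &coincident_token_index, &token_conflict_map);
shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases, &token_conflict_map);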

View file

@@ -7,7 +7,8 @@ use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::collections::hash_map::Entry;
use std::hash::Hasher;
use std::collections::hash_map::{Entry, DefaultHasher};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
@@ -44,14 +45,13 @@ impl<'a> ParseTableBuilder<'a> {
self.parse_table.alias_sequences.push(Vec::new());
// Ensure that the error state has index 0.
let error_state_id =
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(
[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]
[(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
.iter()
.cloned(),
),
@@ -78,6 +78,10 @@ impl<'a> ParseTableBuilder<'a> {
}
}
let mut hasher = DefaultHasher::new();
item_set.hash_unfinished_items(&mut hasher);
let unfinished_item_signature = hasher.finish();
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
@@ -87,6 +91,7 @@ impl<'a> ParseTableBuilder<'a> {
lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
unfinished_item_signature,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
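
Each state created here now carries unfinished_item_signature, computed once from its item set. The point is to make merge candidates cheap to find later: shrink_parse_table only compares states whose signatures collide, instead of all state pairs. A minimal self-contained model of that bucketing (names illustrative):

use std::collections::HashMap;

// Group state ids by their precomputed signature; only states sharing a
// bucket are ever considered for merging.
fn bucket_by_signature(signatures: &[u64]) -> HashMap<u64, Vec<usize>> {
    let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
    for (state_id, &signature) in signatures.iter().enumerate() {
        buckets.entry(signature).or_insert_with(Vec::new).push(state_id);
    }
    buckets
}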

View file

@@ -0,0 +1,36 @@
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::collections::{HashMap, HashSet};
pub(crate) struct CoincidentTokenIndex {
entries: HashMap<(Symbol, Symbol), HashSet<ParseStateId>>,
empty: HashSet<ParseStateId>,
}
impl CoincidentTokenIndex {
pub fn new(table: &ParseTable) -> Self {
let mut entries = HashMap::new();
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
for other_symbol in state.terminal_entries.keys() {
entries
.entry((*symbol, *other_symbol))
.or_insert(HashSet::new())
.insert(i);
}
}
}
Self {
entries,
empty: HashSet::new(),
}
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet<ParseStateId> {
self.entries.get(&(a, b)).unwrap_or(&self.empty)
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
self.entries.contains_key(&(a, b))
}
}
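
CoincidentTokenIndex answers one question: do two terminals ever both appear as valid lookaheads in the same parse state? If so, the lexer already has to distinguish them there, so a lexical conflict between them can be tolerated during merging and error-state construction. A usage sketch (assumes a ParseTable value named table):

let index = CoincidentTokenIndex::new(&table);
let a = Symbol::terminal(0);
let b = Symbol::terminal(1);
if index.contains(a, b) {
    // states_with lists every state in which both tokens have entries.
    println!("tokens co-occur in {} states", index.states_with(a, b).len());
}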

View file

@@ -2,11 +2,11 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}
use crate::rules::Associativity;
use crate::rules::{Symbol, SymbolType};
use smallbitvec::SmallBitVec;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::u32;
use std::cmp::Ordering;
lazy_static! {
static ref START_PRODUCTION: Production = Production {
@@ -85,10 +85,10 @@ impl LookaheadSet {
.chain(if self.eof { Some(Symbol::end()) } else { None })
}
pub fn with<'a>(symbols: impl IntoIterator<Item = &'a Symbol>) -> Self {
pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
let mut result = Self::new();
for symbol in symbols {
result.insert(*symbol);
result.insert(symbol);
}
result
}
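
LookaheadSet::with now takes owned Symbols instead of references, which is why call sites throughout this commit change from slices to .iter().cloned(). For example (sketch):

// Before: LookaheadSet::with(&[Symbol::end()])
// After:
let lookaheads = LookaheadSet::with([Symbol::end()].iter().cloned());
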
@@ -219,6 +219,21 @@ impl<'a> ParseItemSet<'a> {
result
}
pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
let mut previous_variable_index = u32::MAX;
let mut previous_step_index = u32::MAX;
for item in self.entries.keys() {
if item.step().is_some()
&& (item.variable_index != previous_variable_index
|| item.step_index != previous_step_index)
{
h.write_u32(item.variable_index);
h.write_u32(item.step_index);
previous_variable_index = item.variable_index;
previous_step_index = item.step_index;
}
}
}
pub fn display_with(
&'a self,
syntax_grammar: &'a SyntaxGrammar,
@@ -369,11 +384,18 @@ impl<'a> Ord for ParseItem<'a> {
if o != Ordering::Equal {
return o;
}
let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence);
let o = self
.production
.dynamic_precedence
.cmp(&other.production.dynamic_precedence);
if o != Ordering::Equal {
return o;
}
let o = self.production.steps.len().cmp(&other.production.steps.len());
let o = self
.production
.steps
.len()
.cmp(&other.production.steps.len());
if o != Ordering::Equal {
return o;
}
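
Only unfinished items (those with a step remaining) contribute to the signature: finished items produce reductions, which merging is allowed to extend with extra lookaheads, while unfinished items determine the shift and goto structure that must match exactly. A self-contained model of the hash loop above, assuming entries are sorted so equal (variable_index, step_index) pairs are adjacent:

use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;

// Each tuple is (variable_index, step_index, has_remaining_step).
fn unfinished_signature(items: &[(u32, u32, bool)]) -> u64 {
    let mut h = DefaultHasher::new();
    let (mut prev_var, mut prev_step) = (u32::MAX, u32::MAX);
    for &(var, step, unfinished) in items {
        if unfinished && (var != prev_var || step != prev_step) {
            h.write_u32(var);
            h.write_u32(step);
            prev_var = var;
            prev_step = step;
        }
    }
    h.finish()
}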

View file

@@ -1,18 +1,20 @@
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseTable};
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod lex_table_builder;
mod shrink_parse_table;
mod token_conflict_map;
mod token_conflicts;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
use self::shrink_parse_table::shrink_parse_table;
use self::token_conflict_map::TokenConflictMap;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
@@ -23,6 +25,76 @@ pub(crate) fn build_tables(
let (mut parse_table, following_tokens) =
build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
populate_error_state(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&coincident_token_index,
&token_conflict_map,
);
shrink_parse_table(
&mut parse_table,
syntax_grammar,
simple_aliases,
&token_conflict_map,
);
Ok((parse_table, LexTable::default(), LexTable::default(), None))
}
fn populate_error_state(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_conflict(i, j)
});
if conflicts_with_other_tokens {
None
} else {
Some(Symbol::terminal(i))
}
}));
let recover_entry = ParseTableEntry {
reusable: false,
actions: vec![ParseAction::Recover],
};
for i in 0..n {
let symbol = Symbol::terminal(i);
let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
|| conflict_free_tokens.iter().all(|t| {
coincident_token_index.contains(symbol, t)
|| !token_conflict_map.does_conflict(i, t.index)
});
if can_be_used_for_recovery {
eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
} else {
eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
}
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
if external_token.corresponding_internal_token.is_none() {
state
.terminal_entries
.entry(Symbol::external(i))
.or_insert_with(|| recover_entry.clone());
}
}
state.terminal_entries.insert(Symbol::end(), recover_entry);
}
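
The recovery entries are restricted to tokens that cannot be lexically confused with the tokens around them, plus purely external tokens and end-of-input. The conflict-free criterion can be read as a standalone predicate (illustrative signatures):

// Token i is conflict-free if every other token j either already co-occurs
// with it in some parse state (so the lexer must distinguish them anyway)
// or does not conflict with it at all.
fn is_conflict_free(
    i: usize,
    n: usize,
    coincident: impl Fn(usize, usize) -> bool,
    conflicts: impl Fn(usize, usize) -> bool,
) -> bool {
    (0..n).all(|j| j == i || coincident(i, j) || !conflicts(i, j))
}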

View file

@@ -1,14 +1,17 @@
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::AliasMap;
use crate::tables::{ParseAction, ParseTable};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
use std::collections::{HashMap, HashSet};
pub(crate) fn shrink_parse_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
) {
remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
remove_unused_states(parse_table);
}
@@ -86,6 +89,157 @@ fn remove_unit_reductions(
}
}
fn merge_compatible_states(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
}
}
if state_replacements.is_empty() {
break;
}
for state in parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
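
The merge loop runs to a fixed point: each pass may delete states and redirect references to them, and redirecting can make two previously distinct states identical, enabling further merges on the next pass. A toy model of the redirect step (types simplified):

use std::collections::HashMap;

// Rewrite every reference to a deleted state so it points at the state it
// was merged into.
fn redirect(state_refs: &mut [usize], replacements: &HashMap<usize, usize>) {
    for state_ref in state_refs.iter_mut() {
        if let Some(target) = replacements.get(state_ref) {
            *state_ref = *target;
        }
    }
}
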
fn merge_parse_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
parse_table: &mut ParseTable,
left: usize,
right: usize,
) -> bool {
let left_state = &parse_table.states[left];
let right_state = &parse_table.states[right];
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
right_state,
*symbol,
left_entry,
) {
return false;
}
}
eprintln!("maybe merge {} {}", left, right);
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(symbol) {
if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
left_state,
*symbol,
right_entry,
) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = parse_table.states[right].terminal_entries[&symbol].clone();
parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
fn can_add_entry_to_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if token_conflict_map.does_conflict(token.index, existing_token.index) {
return false;
}
}
}
true
}
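
Note the shape of the restriction: an entry may only be copied into a merged state if an identical action list already exists there and the entry ends in a Reduce. Giving an existing reduction extra lookahead tokens merely lets the parser reduce once more before detecting an error; admitting a new shift would change the language accepted. A sketch of the final-action check using the crate's types:

use crate::tables::{ParseAction, ParseTableEntry};

// Only "reduce" entries are candidates for being copied between states.
fn ends_in_reduce(entry: &ParseTableEntry) -> bool {
    match entry.actions.last() {
        Some(ParseAction::Reduce { .. }) => true,
        _ => false,
    }
}
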
fn remove_unused_states(parse_table: &mut ParseTable) {
let mut state_usage_map = vec![false; parse_table.states.len()];
for state in &parse_table.states {

View file

@@ -8,6 +8,7 @@ use std::fmt;
struct TokenConflictStatus {
does_overlap: bool,
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
}
@@ -46,8 +47,9 @@ impl TokenConflictMap {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation
pub fn does_conflict(&self, i: usize, j: usize) -> bool {
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
entry.does_match_valid_continuation || entry.does_match_separators
}
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
@@ -207,10 +209,15 @@ fn compute_conflict_status(
if chars.does_intersect(&following_chars[j]) {
result.0.does_match_valid_continuation = true;
}
if cursor.in_separator() {
result.0.does_match_separators = true;
}
} else {
result.1.does_overlap = true;
if chars.does_intersect(&following_chars[i]) {
result.1.does_match_valid_continuation = true;
} else {
result.1.does_match_separators = true;
}
}
}
@@ -326,9 +333,9 @@ mod tests {
let token_map = TokenConflictMap::new(
&grammar,
vec![
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
LookaheadSet::with(&[Symbol::terminal(var("in"))]),
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
],
);
@@ -338,12 +345,12 @@
// Depending on what character follows, the string "in" may be treated as part of an
// `identifier` token.
assert!(token_map.does_match_valid_continuation(var("identifier"), var("in")));
assert!(token_map.does_conflict(var("identifier"), var("in")));
// Depending on what character follows, the string "instanceof" may be treated as part of
// an `identifier` token.
assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof")));
assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in")));
assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
assert!(token_map.does_conflict(var("instanceof"), var("in")));
}
fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {

View file

@@ -86,15 +86,34 @@ impl CharacterSet {
}
pub fn add(self, other: &CharacterSet) -> Self {
if let CharacterSet::Include(other_chars) = other {
if let CharacterSet::Include(mut chars) = self {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
return CharacterSet::Include(chars);
}
match self {
CharacterSet::Include(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
CharacterSet::Include(chars)
}
CharacterSet::Exclude(other_chars) => {
let excluded_chars = other_chars
.iter()
.cloned()
.filter(|c| !chars.contains(c))
.collect();
CharacterSet::Exclude(excluded_chars)
}
},
CharacterSet::Exclude(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.retain(|c| !other_chars.contains(c));
CharacterSet::Exclude(chars)
}
CharacterSet::Exclude(other_chars) => {
chars.retain(|c| other_chars.contains(c));
CharacterSet::Exclude(chars)
},
},
}
panic!("Called add with a negated character set");
}
pub fn does_intersect(&self, other: &CharacterSet) -> bool {
@@ -458,6 +477,9 @@ mod tests {
(CharacterSet::empty().add_char('f'), 0, 4),
],
vec![
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
(
CharacterSet::empty()
.add_range('a', 'c')
@@ -467,9 +489,6 @@
0,
vec![1],
),
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
],
),
];
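
Worked examples of the new add cases, as a sketch (assumes CharacterSet derives Clone, Debug, and PartialEq, that add_char builds an Include set as in the tests above, and that negate turns Include into the corresponding Exclude):

let ab = CharacterSet::empty().add_char('a').add_char('b'); // Include{a,b}
let bc = CharacterSet::empty().add_char('b').add_char('c'); // Include{b,c}

// Include + Include: plain union.
assert_eq!(ab.clone().add(&bc), CharacterSet::Include(vec!['a', 'b', 'c']));

// Include{a,b} + Exclude{b,c}: 'b' becomes matchable, so only 'c' remains
// excluded.
assert_eq!(ab.add(&bc.negate()), CharacterSet::Exclude(vec!['c']));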

View file

@@ -164,12 +164,20 @@ impl NfaBuilder {
Err(Error::regex("Unicode character classes are not supported"))
}
Class::Perl(class) => {
self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
let mut chars = self.expand_perl_character_class(&class.kind);
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
self.push_advance(self.expand_character_class(&item)?, next_state_id);
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
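
Previously the negated flag on a character class was ignored, so [^a] compiled like [a] and \S like \s. With the fix, the expanded set is complemented before the advance transition is pushed. Illustrative, reusing CharacterSet from nfa.rs:

let positive = CharacterSet::empty().add_char('a'); // what `[a]` matches
let negated = positive.negate();                    // what `[^a]` matches now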

View file

@@ -37,6 +37,7 @@ pub(crate) struct ParseState {
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
pub lex_state_id: usize,
pub unfinished_item_signature: u64,
}
#[derive(Debug, PartialEq, Eq)]