From 223a656fc81647e2595c3da26577594f59e8b9a7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 21 Jun 2019 13:12:09 -0700 Subject: [PATCH] Fix another bug in lex state merging Reuse more logic for lex and parse state merging algorithms --- .../generate/build_tables/build_lex_table.rs | 137 ++++++++++-------- .../build_tables/build_parse_table.rs | 5 +- .../build_tables/minimize_parse_table.rs | 87 +---------- cli/src/generate/build_tables/mod.rs | 64 ++++++++ .../generate/build_tables/token_conflicts.rs | 8 +- cli/src/generate/tables.rs | 38 +---- 6 files changed, 155 insertions(+), 184 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 0e3da4ff..f1d1acf1 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -1,5 +1,6 @@ use super::coincident_tokens::CoincidentTokenIndex; use super::item::TokenSet; +use super::split_state_id_groups; use super::token_conflicts::TokenConflictMap; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::generate::nfa::{CharacterSet, NfaCursor}; @@ -7,7 +8,7 @@ use crate::generate::rules::Symbol; use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; use log::info; use std::collections::hash_map::Entry; -use std::collections::{BTreeMap, HashMap, VecDeque}; +use std::collections::{HashMap, VecDeque}; use std::mem; pub(crate) fn build_lex_table( @@ -251,13 +252,15 @@ fn merge_token_set( }; for existing_token in set_without_terminal.terminals() { - if token_conflict_map.does_conflict(i, existing_token.index) || - token_conflict_map.does_match_prefix(i, existing_token.index) { + if token_conflict_map.does_conflict(i, existing_token.index) + || token_conflict_map.does_match_prefix(i, existing_token.index) + { return false; } if !coincident_token_index.contains(symbol, existing_token) { - if token_conflict_map.does_overlap(existing_token.index, i) || - token_conflict_map.does_overlap(i, existing_token.index) { + if token_conflict_map.does_overlap(existing_token.index, i) + || token_conflict_map.does_overlap(i, existing_token.index) + { return false; } } @@ -269,76 +272,86 @@ fn merge_token_set( } fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { - let mut state_replacements = BTreeMap::new(); - let mut done = false; - while !done { - done = true; - for (i, state_i) in table.states.iter().enumerate() { - if state_replacements.contains_key(&i) { - continue; - } - for (j, state_j) in table.states.iter().enumerate() { - if j == i { - break; - } - if state_replacements.contains_key(&j) { - continue; - } - if state_i.equals(state_j, i, j) { - info!("replace state {} with state {}", i, j); - state_replacements.insert(i, j); - done = false; - break; - } - } - } - for state in table.states.iter_mut() { - for (_, advance_action) in state.advance_actions.iter_mut() { - advance_action.state = state_replacements - .get(&advance_action.state) - .cloned() - .unwrap_or(advance_action.state); - } + // Initially group the states by their accept action and their + // valid lookahead characters. + let mut state_ids_by_signature = HashMap::new(); + for (i, state) in table.states.iter().enumerate() { + let signature = ( + i == 0, + state.accept_action, + state + .advance_actions + .iter() + .map(|(characters, action)| (characters.clone(), action.in_main_token)) + .collect::>(), + ); + state_ids_by_signature + .entry(signature) + .or_insert(Vec::new()) + .push(i); + } + let mut state_ids_by_group_id = state_ids_by_signature + .into_iter() + .map(|e| e.1) + .collect::>(); + let error_group_index = state_ids_by_group_id + .iter() + .position(|g| g.contains(&0)) + .unwrap(); + state_ids_by_group_id.swap(error_group_index, 0); + + let mut group_ids_by_state_id = vec![0; table.states.len()]; + for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() { + for state_id in state_ids { + group_ids_by_state_id[*state_id] = group_id; } } - let final_state_replacements = (0..table.states.len()) - .into_iter() - .map(|state_id| { - let replacement = state_replacements - .get(&state_id) - .cloned() - .unwrap_or(state_id); - let prior_removed = state_replacements - .iter() - .take_while(|i| *i.0 < replacement) - .count(); - replacement - prior_removed - }) - .collect::>(); + while split_state_id_groups( + &table.states, + &mut state_ids_by_group_id, + &mut group_ids_by_state_id, + 1, + lex_states_differ, + ) { + continue; + } + + let mut new_states = Vec::with_capacity(state_ids_by_group_id.len()); + for state_ids in &state_ids_by_group_id { + let mut new_state = LexState::default(); + mem::swap(&mut new_state, &mut table.states[state_ids[0]]); + + for (_, advance_action) in new_state.advance_actions.iter_mut() { + advance_action.state = group_ids_by_state_id[advance_action.state]; + } + new_states.push(new_state); + } for state in parse_table.states.iter_mut() { - state.lex_state_id = final_state_replacements[state.lex_state_id]; + state.lex_state_id = group_ids_by_state_id[state.lex_state_id]; } - for state in table.states.iter_mut() { - for (_, advance_action) in state.advance_actions.iter_mut() { - advance_action.state = final_state_replacements[advance_action.state]; - } - } + table.states = new_states; +} - let mut i = 0; - table.states.retain(|_| { - let result = !state_replacements.contains_key(&i); - i += 1; - result - }); +fn lex_states_differ( + left: &LexState, + right: &LexState, + group_ids_by_state_id: &Vec, +) -> bool { + left.advance_actions + .iter() + .zip(right.advance_actions.iter()) + .any(|(left, right)| { + group_ids_by_state_id[left.1.state] != group_ids_by_state_id[right.1.state] + }) } fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) { // Get a mapping of old state index -> new_state_index let mut old_ids_by_new_id = (0..table.states.len()).collect::>(); - &old_ids_by_new_id[1..].sort_unstable_by_key(|id| &table.states[*id]); + &old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]); // Get the inverse mapping let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()]; diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index c3372091..f7c3b42a 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -745,7 +745,10 @@ fn populate_following_tokens( .iter() .flat_map(|v| &v.productions) .chain(&inlines.productions); - let all_tokens = (0..result.len()).into_iter().map(Symbol::terminal).collect::(); + let all_tokens = (0..result.len()) + .into_iter() + .map(Symbol::terminal) + .collect::(); for production in productions { for i in 1..production.steps.len() { let left_tokens = builder.last_set(&production.steps[i - 1].symbol); diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 5ab8eaf6..d1823fc7 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -1,4 +1,5 @@ use super::item::TokenSet; +use super::split_state_id_groups; use super::token_conflicts::TokenConflictMap; use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; use crate::generate::rules::{AliasMap, Symbol}; @@ -126,15 +127,19 @@ impl<'a> Minimizer<'a> { group_ids_by_state_id.push(state.core_id); } - self.split_state_id_groups_by( + split_state_id_groups( + &self.parse_table.states, &mut state_ids_by_group_id, &mut group_ids_by_state_id, + 0, |left, right, groups| self.states_conflict(left, right, groups), ); - while self.split_state_id_groups_by( + while split_state_id_groups( + &self.parse_table.states, &mut state_ids_by_group_id, &mut group_ids_by_state_id, + 0, |left, right, groups| self.state_successors_differ(left, right, groups), ) { continue; @@ -183,84 +188,6 @@ impl<'a> Minimizer<'a> { self.parse_table.states = new_states; } - fn split_state_id_groups_by( - &self, - state_ids_by_group_id: &mut Vec>, - group_ids_by_state_id: &mut Vec, - mut f: impl FnMut(&ParseState, &ParseState, &Vec) -> bool, - ) -> bool { - let mut result = false; - - // Examine each group of states, and split them up if necessary. For - // each group of states, find a subgroup where all the states are mutually - // compatible. Leave that subgroup in place, and split off all of the - // other states in the group into a new group. Those states are not - // necessarily mutually compatible, but they will be split up in later - // iterations. - let mut group_id = 0; - while group_id < state_ids_by_group_id.len() { - let state_ids = &state_ids_by_group_id[group_id]; - let mut split_state_ids = Vec::new(); - - let mut i = 0; - while i < state_ids.len() { - let left_state_id = state_ids[i]; - if split_state_ids.contains(&left_state_id) { - i += 1; - continue; - } - - let left_state = &self.parse_table.states[left_state_id]; - - // Identify all of the other states in the group that are incompatible with - // this state. - let mut j = i + 1; - while j < state_ids.len() { - let right_state_id = state_ids[j]; - if split_state_ids.contains(&right_state_id) { - j += 1; - continue; - } - let right_state = &self.parse_table.states[right_state_id]; - - if f(left_state, right_state, &group_ids_by_state_id) { - split_state_ids.push(right_state_id); - } - - j += 1; - } - - i += 1; - } - - // If any states were removed from the group, add them all as a new group. - if split_state_ids.len() > 0 { - result = true; - state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i)); - - info!( - "split state groups {:?} {:?}", - state_ids_by_group_id[group_id], split_state_ids, - ); - - let new_group_id = state_ids_by_group_id.len(); - for id in &split_state_ids { - group_ids_by_state_id[*id] = new_group_id; - } - - state_ids_by_group_id.push(Vec::new()); - mem::swap( - &mut split_state_ids, - state_ids_by_group_id.last_mut().unwrap(), - ); - } - - group_id += 1; - } - - result - } - fn states_conflict( &self, left_state: &ParseState, diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index de28cda3..7b33d4c3 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -355,3 +355,67 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { } }) } + +fn split_state_id_groups( + states: &Vec, + state_ids_by_group_id: &mut Vec>, + group_ids_by_state_id: &mut Vec, + start_group_id: usize, + mut f: impl FnMut(&S, &S, &Vec) -> bool, +) -> bool { + let mut result = false; + + let mut group_id = start_group_id; + while group_id < state_ids_by_group_id.len() { + let state_ids = &state_ids_by_group_id[group_id]; + let mut split_state_ids = Vec::new(); + + let mut i = 0; + while i < state_ids.len() { + let left_state_id = state_ids[i]; + if split_state_ids.contains(&left_state_id) { + i += 1; + continue; + } + + let left_state = &states[left_state_id]; + + // Identify all of the other states in the group that are incompatible with + // this state. + let mut j = i + 1; + while j < state_ids.len() { + let right_state_id = state_ids[j]; + if split_state_ids.contains(&right_state_id) { + j += 1; + continue; + } + let right_state = &states[right_state_id]; + + if f(left_state, right_state, &group_ids_by_state_id) { + split_state_ids.push(right_state_id); + } + + j += 1; + } + + i += 1; + } + + // If any states were removed from the group, add them all as a new group. + if split_state_ids.len() > 0 { + result = true; + state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i)); + + let new_group_id = state_ids_by_group_id.len(); + for id in &split_state_ids { + group_ids_by_state_id[*id] = new_group_id; + } + + state_ids_by_group_id.push(split_state_ids); + } + + group_id += 1; + } + + result +} diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs index 303e79fb..a7737eb3 100644 --- a/cli/src/generate/build_tables/token_conflicts.rs +++ b/cli/src/generate/build_tables/token_conflicts.rs @@ -79,10 +79,10 @@ impl<'a> TokenConflictMap<'a> { pub fn does_overlap(&self, i: usize, j: usize) -> bool { let status = &self.status_matrix[matrix_index(self.n, i, j)]; - status.does_match_separators || - status.matches_prefix || - status.matches_same_string || - status.does_match_continuation + status.does_match_separators + || status.matches_prefix + || status.matches_same_string + || status.does_match_continuation } pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool { diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index f7d96556..75fb7710 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -69,8 +69,8 @@ pub(crate) struct AdvanceAction { #[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)] pub(crate) struct LexState { - pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, pub accept_action: Option, + pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, } #[derive(Debug, PartialEq, Eq)] @@ -152,39 +152,3 @@ impl ParseAction { } } } - -impl LexState { - pub fn equals(&self, other: &LexState, left_state: usize, right_state: usize) -> bool { - if self.accept_action != other.accept_action { - return false; - } - - if self.advance_actions.len() != other.advance_actions.len() { - return false; - } - - for (left, right) in self - .advance_actions - .iter() - .zip(other.advance_actions.iter()) - { - if left.0 != right.0 || left.1.in_main_token != right.1.in_main_token { - return false; - } - - let left_successor = left.1.state; - let right_successor = right.1.state; - - // Two states can be equal if they have different successors but the successor - // states are equal. - if left_successor != right_successor - && (left_successor != left_state || right_successor != right_state) - && (left_successor != right_state || right_successor != left_state) - { - return false; - } - } - - true - } -}