Fix another bug in lex state merging

Reuse more logic for lex and parse state merging algorithms
This commit is contained in:
Max Brunsfeld 2019-06-21 13:12:09 -07:00
parent fcd8913233
commit 223a656fc8
6 changed files with 155 additions and 184 deletions

View file

@ -1,5 +1,6 @@
use super::coincident_tokens::CoincidentTokenIndex;
use super::item::TokenSet;
use super::split_state_id_groups;
use super::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor};
@ -7,7 +8,7 @@ use crate::generate::rules::Symbol;
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use log::info;
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::collections::{HashMap, VecDeque};
use std::mem;
pub(crate) fn build_lex_table(
@ -251,13 +252,15 @@ fn merge_token_set(
};
for existing_token in set_without_terminal.terminals() {
if token_conflict_map.does_conflict(i, existing_token.index) ||
token_conflict_map.does_match_prefix(i, existing_token.index) {
if token_conflict_map.does_conflict(i, existing_token.index)
|| token_conflict_map.does_match_prefix(i, existing_token.index)
{
return false;
}
if !coincident_token_index.contains(symbol, existing_token) {
if token_conflict_map.does_overlap(existing_token.index, i) ||
token_conflict_map.does_overlap(i, existing_token.index) {
if token_conflict_map.does_overlap(existing_token.index, i)
|| token_conflict_map.does_overlap(i, existing_token.index)
{
return false;
}
}
@ -269,76 +272,86 @@ fn merge_token_set(
}
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
let mut state_replacements = BTreeMap::new();
let mut done = false;
while !done {
done = true;
for (i, state_i) in table.states.iter().enumerate() {
if state_replacements.contains_key(&i) {
continue;
}
for (j, state_j) in table.states.iter().enumerate() {
if j == i {
break;
}
if state_replacements.contains_key(&j) {
continue;
}
if state_i.equals(state_j, i, j) {
info!("replace state {} with state {}", i, j);
state_replacements.insert(i, j);
done = false;
break;
}
}
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = state_replacements
.get(&advance_action.state)
.cloned()
.unwrap_or(advance_action.state);
}
// Initially group the states by their accept action and their
// valid lookahead characters.
let mut state_ids_by_signature = HashMap::new();
for (i, state) in table.states.iter().enumerate() {
let signature = (
i == 0,
state.accept_action,
state
.advance_actions
.iter()
.map(|(characters, action)| (characters.clone(), action.in_main_token))
.collect::<Vec<_>>(),
);
state_ids_by_signature
.entry(signature)
.or_insert(Vec::new())
.push(i);
}
let mut state_ids_by_group_id = state_ids_by_signature
.into_iter()
.map(|e| e.1)
.collect::<Vec<_>>();
let error_group_index = state_ids_by_group_id
.iter()
.position(|g| g.contains(&0))
.unwrap();
state_ids_by_group_id.swap(error_group_index, 0);
let mut group_ids_by_state_id = vec![0; table.states.len()];
for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
for state_id in state_ids {
group_ids_by_state_id[*state_id] = group_id;
}
}
let final_state_replacements = (0..table.states.len())
.into_iter()
.map(|state_id| {
let replacement = state_replacements
.get(&state_id)
.cloned()
.unwrap_or(state_id);
let prior_removed = state_replacements
.iter()
.take_while(|i| *i.0 < replacement)
.count();
replacement - prior_removed
})
.collect::<Vec<_>>();
while split_state_id_groups(
&table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
1,
lex_states_differ,
) {
continue;
}
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id {
let mut new_state = LexState::default();
mem::swap(&mut new_state, &mut table.states[state_ids[0]]);
for (_, advance_action) in new_state.advance_actions.iter_mut() {
advance_action.state = group_ids_by_state_id[advance_action.state];
}
new_states.push(new_state);
}
for state in parse_table.states.iter_mut() {
state.lex_state_id = final_state_replacements[state.lex_state_id];
state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = final_state_replacements[advance_action.state];
}
}
table.states = new_states;
}
let mut i = 0;
table.states.retain(|_| {
let result = !state_replacements.contains_key(&i);
i += 1;
result
});
/// Returns true if two lex states (already known to share the same signature:
/// accept action, lookahead character sets, and `in_main_token` flags) lead to
/// successor states that currently belong to different groups.
fn lex_states_differ(
    left: &LexState,
    right: &LexState,
    group_ids_by_state_id: &Vec<usize>,
) -> bool {
    // Walk the two advance-action lists in lockstep and report a difference
    // as soon as a pair of successors falls into distinct groups.
    for (left_action, right_action) in left
        .advance_actions
        .iter()
        .zip(right.advance_actions.iter())
    {
        let left_group = group_ids_by_state_id[left_action.1.state];
        let right_group = group_ids_by_state_id[right_action.1.state];
        if left_group != right_group {
            return true;
        }
    }
    false
}
fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
// Get a mapping of old state index -> new_state_index
let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
&old_ids_by_new_id[1..].sort_unstable_by_key(|id| &table.states[*id]);
&old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);
// Get the inverse mapping
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];

View file

@ -745,7 +745,10 @@ fn populate_following_tokens(
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len()).into_iter().map(Symbol::terminal).collect::<TokenSet>();
let all_tokens = (0..result.len())
.into_iter()
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);

View file

@ -1,4 +1,5 @@
use super::item::TokenSet;
use super::split_state_id_groups;
use super::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol};
@ -126,15 +127,19 @@ impl<'a> Minimizer<'a> {
group_ids_by_state_id.push(state.core_id);
}
self.split_state_id_groups_by(
split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.states_conflict(left, right, groups),
);
while self.split_state_id_groups_by(
while split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.state_successors_differ(left, right, groups),
) {
continue;
@ -183,84 +188,6 @@ impl<'a> Minimizer<'a> {
self.parse_table.states = new_states;
}
/// Splits apart any group of parse states whose members are mutually
/// incompatible according to the predicate `f`. Returns true if any group
/// was split, so the caller can iterate until a fixed point is reached.
///
/// `state_ids_by_group_id` maps each group id to the state ids it contains;
/// `group_ids_by_state_id` is the inverse mapping. Both are updated in place.
/// `f(left, right, group_ids_by_state_id)` returns true when the two states
/// must not share a group.
fn split_state_id_groups_by(
    &self,
    state_ids_by_group_id: &mut Vec<Vec<ParseStateId>>,
    group_ids_by_state_id: &mut Vec<ParseStateId>,
    mut f: impl FnMut(&ParseState, &ParseState, &Vec<ParseStateId>) -> bool,
) -> bool {
    let mut result = false;

    // Examine each group of states, and split them up if necessary. For
    // each group of states, find a subgroup where all the states are mutually
    // compatible. Leave that subgroup in place, and split off all of the
    // other states in the group into a new group. Those states are not
    // necessarily mutually compatible, but they will be split up in later
    // iterations.
    let mut group_id = 0;
    while group_id < state_ids_by_group_id.len() {
        let state_ids = &state_ids_by_group_id[group_id];
        let mut split_state_ids = Vec::new();

        let mut i = 0;
        while i < state_ids.len() {
            let left_state_id = state_ids[i];
            // States already scheduled for removal can't anchor a comparison.
            if split_state_ids.contains(&left_state_id) {
                i += 1;
                continue;
            }
            let left_state = &self.parse_table.states[left_state_id];

            // Identify all of the other states in the group that are incompatible with
            // this state.
            let mut j = i + 1;
            while j < state_ids.len() {
                let right_state_id = state_ids[j];
                if split_state_ids.contains(&right_state_id) {
                    j += 1;
                    continue;
                }
                let right_state = &self.parse_table.states[right_state_id];
                if f(left_state, right_state, &group_ids_by_state_id) {
                    split_state_ids.push(right_state_id);
                }
                j += 1;
            }
            i += 1;
        }

        // If any states were removed from the group, add them all as a new group.
        if split_state_ids.len() > 0 {
            result = true;
            state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
            info!(
                "split state groups {:?} {:?}",
                state_ids_by_group_id[group_id], split_state_ids,
            );
            // The new group is appended, so its id is the current length.
            let new_group_id = state_ids_by_group_id.len();
            for id in &split_state_ids {
                group_ids_by_state_id[*id] = new_group_id;
            }
            // Move the split-off ids into the freshly appended group without
            // copying the vector.
            state_ids_by_group_id.push(Vec::new());
            mem::swap(
                &mut split_state_ids,
                state_ids_by_group_id.last_mut().unwrap(),
            );
        }
        group_id += 1;
    }
    result
}
fn states_conflict(
&self,
left_state: &ParseState,

View file

@ -355,3 +355,67 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
}
})
}
/// Generic partition-refinement step shared by the lex-table and parse-table
/// minimizers.
///
/// `state_ids_by_group_id` maps each group id to the ids of the states in
/// that group; `group_ids_by_state_id` is the inverse mapping. Starting at
/// `start_group_id`, each group is scanned and every state that the
/// predicate `f` reports as incompatible with the group's first surviving
/// member is split off into a new group appended at the end. Both mappings
/// are updated in place.
///
/// Returns true if any group was split; callers loop on this function until
/// it returns false, i.e. until the partition stabilizes.
fn split_state_id_groups<S>(
    states: &Vec<S>,
    state_ids_by_group_id: &mut Vec<Vec<usize>>,
    group_ids_by_state_id: &mut Vec<usize>,
    start_group_id: usize,
    mut f: impl FnMut(&S, &S, &Vec<usize>) -> bool,
) -> bool {
    let mut result = false;
    let mut group_id = start_group_id;
    while group_id < state_ids_by_group_id.len() {
        let state_ids = &state_ids_by_group_id[group_id];
        let mut split_state_ids = Vec::new();

        let mut i = 0;
        while i < state_ids.len() {
            let left_state_id = state_ids[i];
            // States already scheduled for removal can't anchor a comparison.
            if split_state_ids.contains(&left_state_id) {
                i += 1;
                continue;
            }
            let left_state = &states[left_state_id];

            // Identify all of the other states in the group that are incompatible with
            // this state. The split-off states are not necessarily mutually
            // compatible with each other; later iterations refine them further.
            let mut j = i + 1;
            while j < state_ids.len() {
                let right_state_id = state_ids[j];
                if split_state_ids.contains(&right_state_id) {
                    j += 1;
                    continue;
                }
                let right_state = &states[right_state_id];
                if f(left_state, right_state, &group_ids_by_state_id) {
                    split_state_ids.push(right_state_id);
                }
                j += 1;
            }
            i += 1;
        }

        // If any states were removed from the group, add them all as a new group.
        if split_state_ids.len() > 0 {
            result = true;
            state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
            // The new group's id is the current group count, since it is
            // pushed onto the end; update the inverse mapping to match.
            let new_group_id = state_ids_by_group_id.len();
            for id in &split_state_ids {
                group_ids_by_state_id[*id] = new_group_id;
            }
            state_ids_by_group_id.push(split_state_ids);
        }
        group_id += 1;
    }
    result
}

View file

@ -79,10 +79,10 @@ impl<'a> TokenConflictMap<'a> {
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
let status = &self.status_matrix[matrix_index(self.n, i, j)];
status.does_match_separators ||
status.matches_prefix ||
status.matches_same_string ||
status.does_match_continuation
status.does_match_separators
|| status.matches_prefix
|| status.matches_same_string
|| status.does_match_continuation
}
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {

View file

@ -69,8 +69,8 @@ pub(crate) struct AdvanceAction {
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct LexState {
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
pub accept_action: Option<Symbol>,
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
}
#[derive(Debug, PartialEq, Eq)]
@ -152,39 +152,3 @@ impl ParseAction {
}
}
}
impl LexState {
    /// Reports whether this lex state is interchangeable with `other`, where
    /// `left_state` and `right_state` are the two states' own indices in the
    /// lex table.
    ///
    /// The states must have identical accept actions and the same number of
    /// advance actions, with pairwise-equal character sets and
    /// `in_main_token` flags. Successor states may differ only in the
    /// self-referential cases handled below: each action pointing back at
    /// its own state, or each pointing at the other state being compared.
    pub fn equals(&self, other: &LexState, left_state: usize, right_state: usize) -> bool {
        if self.accept_action != other.accept_action {
            return false;
        }
        if self.advance_actions.len() != other.advance_actions.len() {
            return false;
        }
        for (left, right) in self
            .advance_actions
            .iter()
            .zip(other.advance_actions.iter())
        {
            // Character sets and in-main-token flags must match exactly.
            if left.0 != right.0 || left.1.in_main_token != right.1.in_main_token {
                return false;
            }
            let left_successor = left.1.state;
            let right_successor = right.1.state;
            // Two states can be equal if they have different successors but the successor
            // states are equal. The two allowed mismatches are: each action
            // advances to its own state, or the two actions advance to each
            // other's states.
            if left_successor != right_successor
                && (left_successor != left_state || right_successor != right_state)
                && (left_successor != right_state || right_successor != left_state)
            {
                return false;
            }
        }
        true
    }
}