Fix another bug in lex state merging
Reuse more logic for lex and parse state merging algorithms
This commit is contained in:
parent
fcd8913233
commit
223a656fc8
6 changed files with 155 additions and 184 deletions
|
|
@ -1,5 +1,6 @@
|
|||
use super::coincident_tokens::CoincidentTokenIndex;
|
||||
use super::item::TokenSet;
|
||||
use super::split_state_id_groups;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
|
||||
use crate::generate::nfa::{CharacterSet, NfaCursor};
|
||||
|
|
@ -7,7 +8,7 @@ use crate::generate::rules::Symbol;
|
|||
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
|
||||
use log::info;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::mem;
|
||||
|
||||
pub(crate) fn build_lex_table(
|
||||
|
|
@ -251,13 +252,15 @@ fn merge_token_set(
|
|||
};
|
||||
|
||||
for existing_token in set_without_terminal.terminals() {
|
||||
if token_conflict_map.does_conflict(i, existing_token.index) ||
|
||||
token_conflict_map.does_match_prefix(i, existing_token.index) {
|
||||
if token_conflict_map.does_conflict(i, existing_token.index)
|
||||
|| token_conflict_map.does_match_prefix(i, existing_token.index)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if !coincident_token_index.contains(symbol, existing_token) {
|
||||
if token_conflict_map.does_overlap(existing_token.index, i) ||
|
||||
token_conflict_map.does_overlap(i, existing_token.index) {
|
||||
if token_conflict_map.does_overlap(existing_token.index, i)
|
||||
|| token_conflict_map.does_overlap(i, existing_token.index)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -269,76 +272,86 @@ fn merge_token_set(
|
|||
}
|
||||
|
||||
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
|
||||
let mut state_replacements = BTreeMap::new();
|
||||
let mut done = false;
|
||||
while !done {
|
||||
done = true;
|
||||
for (i, state_i) in table.states.iter().enumerate() {
|
||||
if state_replacements.contains_key(&i) {
|
||||
continue;
|
||||
}
|
||||
for (j, state_j) in table.states.iter().enumerate() {
|
||||
if j == i {
|
||||
break;
|
||||
}
|
||||
if state_replacements.contains_key(&j) {
|
||||
continue;
|
||||
}
|
||||
if state_i.equals(state_j, i, j) {
|
||||
info!("replace state {} with state {}", i, j);
|
||||
state_replacements.insert(i, j);
|
||||
done = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for state in table.states.iter_mut() {
|
||||
for (_, advance_action) in state.advance_actions.iter_mut() {
|
||||
advance_action.state = state_replacements
|
||||
.get(&advance_action.state)
|
||||
.cloned()
|
||||
.unwrap_or(advance_action.state);
|
||||
}
|
||||
// Initially group the states by their accept action and their
|
||||
// valid lookahead characters.
|
||||
let mut state_ids_by_signature = HashMap::new();
|
||||
for (i, state) in table.states.iter().enumerate() {
|
||||
let signature = (
|
||||
i == 0,
|
||||
state.accept_action,
|
||||
state
|
||||
.advance_actions
|
||||
.iter()
|
||||
.map(|(characters, action)| (characters.clone(), action.in_main_token))
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
state_ids_by_signature
|
||||
.entry(signature)
|
||||
.or_insert(Vec::new())
|
||||
.push(i);
|
||||
}
|
||||
let mut state_ids_by_group_id = state_ids_by_signature
|
||||
.into_iter()
|
||||
.map(|e| e.1)
|
||||
.collect::<Vec<_>>();
|
||||
let error_group_index = state_ids_by_group_id
|
||||
.iter()
|
||||
.position(|g| g.contains(&0))
|
||||
.unwrap();
|
||||
state_ids_by_group_id.swap(error_group_index, 0);
|
||||
|
||||
let mut group_ids_by_state_id = vec![0; table.states.len()];
|
||||
for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
|
||||
for state_id in state_ids {
|
||||
group_ids_by_state_id[*state_id] = group_id;
|
||||
}
|
||||
}
|
||||
|
||||
let final_state_replacements = (0..table.states.len())
|
||||
.into_iter()
|
||||
.map(|state_id| {
|
||||
let replacement = state_replacements
|
||||
.get(&state_id)
|
||||
.cloned()
|
||||
.unwrap_or(state_id);
|
||||
let prior_removed = state_replacements
|
||||
.iter()
|
||||
.take_while(|i| *i.0 < replacement)
|
||||
.count();
|
||||
replacement - prior_removed
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
while split_state_id_groups(
|
||||
&table.states,
|
||||
&mut state_ids_by_group_id,
|
||||
&mut group_ids_by_state_id,
|
||||
1,
|
||||
lex_states_differ,
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
|
||||
for state_ids in &state_ids_by_group_id {
|
||||
let mut new_state = LexState::default();
|
||||
mem::swap(&mut new_state, &mut table.states[state_ids[0]]);
|
||||
|
||||
for (_, advance_action) in new_state.advance_actions.iter_mut() {
|
||||
advance_action.state = group_ids_by_state_id[advance_action.state];
|
||||
}
|
||||
new_states.push(new_state);
|
||||
}
|
||||
|
||||
for state in parse_table.states.iter_mut() {
|
||||
state.lex_state_id = final_state_replacements[state.lex_state_id];
|
||||
state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
|
||||
}
|
||||
|
||||
for state in table.states.iter_mut() {
|
||||
for (_, advance_action) in state.advance_actions.iter_mut() {
|
||||
advance_action.state = final_state_replacements[advance_action.state];
|
||||
}
|
||||
}
|
||||
table.states = new_states;
|
||||
}
|
||||
|
||||
let mut i = 0;
|
||||
table.states.retain(|_| {
|
||||
let result = !state_replacements.contains_key(&i);
|
||||
i += 1;
|
||||
result
|
||||
});
|
||||
fn lex_states_differ(
|
||||
left: &LexState,
|
||||
right: &LexState,
|
||||
group_ids_by_state_id: &Vec<usize>,
|
||||
) -> bool {
|
||||
left.advance_actions
|
||||
.iter()
|
||||
.zip(right.advance_actions.iter())
|
||||
.any(|(left, right)| {
|
||||
group_ids_by_state_id[left.1.state] != group_ids_by_state_id[right.1.state]
|
||||
})
|
||||
}
|
||||
|
||||
fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
|
||||
// Get a mapping of old state index -> new_state_index
|
||||
let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
|
||||
&old_ids_by_new_id[1..].sort_unstable_by_key(|id| &table.states[*id]);
|
||||
&old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);
|
||||
|
||||
// Get the inverse mapping
|
||||
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
|
||||
|
|
|
|||
|
|
@ -745,7 +745,10 @@ fn populate_following_tokens(
|
|||
.iter()
|
||||
.flat_map(|v| &v.productions)
|
||||
.chain(&inlines.productions);
|
||||
let all_tokens = (0..result.len()).into_iter().map(Symbol::terminal).collect::<TokenSet>();
|
||||
let all_tokens = (0..result.len())
|
||||
.into_iter()
|
||||
.map(Symbol::terminal)
|
||||
.collect::<TokenSet>();
|
||||
for production in productions {
|
||||
for i in 1..production.steps.len() {
|
||||
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use super::item::TokenSet;
|
||||
use super::split_state_id_groups;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
|
||||
use crate::generate::rules::{AliasMap, Symbol};
|
||||
|
|
@ -126,15 +127,19 @@ impl<'a> Minimizer<'a> {
|
|||
group_ids_by_state_id.push(state.core_id);
|
||||
}
|
||||
|
||||
self.split_state_id_groups_by(
|
||||
split_state_id_groups(
|
||||
&self.parse_table.states,
|
||||
&mut state_ids_by_group_id,
|
||||
&mut group_ids_by_state_id,
|
||||
0,
|
||||
|left, right, groups| self.states_conflict(left, right, groups),
|
||||
);
|
||||
|
||||
while self.split_state_id_groups_by(
|
||||
while split_state_id_groups(
|
||||
&self.parse_table.states,
|
||||
&mut state_ids_by_group_id,
|
||||
&mut group_ids_by_state_id,
|
||||
0,
|
||||
|left, right, groups| self.state_successors_differ(left, right, groups),
|
||||
) {
|
||||
continue;
|
||||
|
|
@ -183,84 +188,6 @@ impl<'a> Minimizer<'a> {
|
|||
self.parse_table.states = new_states;
|
||||
}
|
||||
|
||||
fn split_state_id_groups_by(
|
||||
&self,
|
||||
state_ids_by_group_id: &mut Vec<Vec<ParseStateId>>,
|
||||
group_ids_by_state_id: &mut Vec<ParseStateId>,
|
||||
mut f: impl FnMut(&ParseState, &ParseState, &Vec<ParseStateId>) -> bool,
|
||||
) -> bool {
|
||||
let mut result = false;
|
||||
|
||||
// Examine each group of states, and split them up if necessary. For
|
||||
// each group of states, find a subgroup where all the states are mutually
|
||||
// compatible. Leave that subgroup in place, and split off all of the
|
||||
// other states in the group into a new group. Those states are not
|
||||
// necessarily mutually compatible, but they will be split up in later
|
||||
// iterations.
|
||||
let mut group_id = 0;
|
||||
while group_id < state_ids_by_group_id.len() {
|
||||
let state_ids = &state_ids_by_group_id[group_id];
|
||||
let mut split_state_ids = Vec::new();
|
||||
|
||||
let mut i = 0;
|
||||
while i < state_ids.len() {
|
||||
let left_state_id = state_ids[i];
|
||||
if split_state_ids.contains(&left_state_id) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let left_state = &self.parse_table.states[left_state_id];
|
||||
|
||||
// Identify all of the other states in the group that are incompatible with
|
||||
// this state.
|
||||
let mut j = i + 1;
|
||||
while j < state_ids.len() {
|
||||
let right_state_id = state_ids[j];
|
||||
if split_state_ids.contains(&right_state_id) {
|
||||
j += 1;
|
||||
continue;
|
||||
}
|
||||
let right_state = &self.parse_table.states[right_state_id];
|
||||
|
||||
if f(left_state, right_state, &group_ids_by_state_id) {
|
||||
split_state_ids.push(right_state_id);
|
||||
}
|
||||
|
||||
j += 1;
|
||||
}
|
||||
|
||||
i += 1;
|
||||
}
|
||||
|
||||
// If any states were removed from the group, add them all as a new group.
|
||||
if split_state_ids.len() > 0 {
|
||||
result = true;
|
||||
state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
|
||||
|
||||
info!(
|
||||
"split state groups {:?} {:?}",
|
||||
state_ids_by_group_id[group_id], split_state_ids,
|
||||
);
|
||||
|
||||
let new_group_id = state_ids_by_group_id.len();
|
||||
for id in &split_state_ids {
|
||||
group_ids_by_state_id[*id] = new_group_id;
|
||||
}
|
||||
|
||||
state_ids_by_group_id.push(Vec::new());
|
||||
mem::swap(
|
||||
&mut split_state_ids,
|
||||
state_ids_by_group_id.last_mut().unwrap(),
|
||||
);
|
||||
}
|
||||
|
||||
group_id += 1;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn states_conflict(
|
||||
&self,
|
||||
left_state: &ParseState,
|
||||
|
|
|
|||
|
|
@ -355,3 +355,67 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
|
|||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Splits groups of states wherever `f` reports two members as incompatible.
///
/// Groups are examined starting at `start_group_id`; earlier groups are left
/// untouched. Within a group, each state that has not been split off yet is
/// compared with every later state that has not been split off yet. If
/// `f(left, right, group_ids_by_state_id)` returns `true`, the right-hand
/// state is split off. The states that remain stay in their group, and all
/// split-off states are appended as one new group. Those states are not
/// necessarily mutually compatible; the new group is examined later in the
/// same call (the loop bound grows as groups are appended) and in later calls.
///
/// `state_ids_by_group_id` and `group_ids_by_state_id` are updated together so
/// that they keep describing the same partition. While a group is being
/// scanned, `f` still sees the group ids from before that group's split.
///
/// Returns `true` if any group was split, so callers can repeat the call
/// until it returns `false`.
///
/// # Panics
///
/// Panics if a state id in an examined group is out of bounds for `states`,
/// or if a split-off id is out of bounds for `group_ids_by_state_id`.
fn split_state_id_groups<S>(
    states: &Vec<S>,
    state_ids_by_group_id: &mut Vec<Vec<usize>>,
    group_ids_by_state_id: &mut Vec<usize>,
    start_group_id: usize,
    mut f: impl FnMut(&S, &S, &Vec<usize>) -> bool,
) -> bool {
    let mut result = false;

    // Per-state flag marking ids split off from the group currently being
    // examined. It replaces repeated linear `Vec::contains` scans inside the
    // pair loop and is reset after each group.
    let mut is_split = vec![false; states.len()];

    // Not a `for` over a fixed range: groups appended below must be visited too.
    let mut group_id = start_group_id;
    while group_id < state_ids_by_group_id.len() {
        let state_ids = &state_ids_by_group_id[group_id];
        let mut split_state_ids = Vec::new();

        for (i, &left_state_id) in state_ids.iter().enumerate() {
            if is_split[left_state_id] {
                continue;
            }
            let left_state = &states[left_state_id];

            // Identify all of the other states in the group that are
            // incompatible with this state.
            for &right_state_id in &state_ids[i + 1..] {
                if is_split[right_state_id] {
                    continue;
                }
                let right_state = &states[right_state_id];

                if f(left_state, right_state, &*group_ids_by_state_id) {
                    is_split[right_state_id] = true;
                    split_state_ids.push(right_state_id);
                }
            }
        }

        // If any states were removed from the group, add them all as a new group.
        if !split_state_ids.is_empty() {
            result = true;
            state_ids_by_group_id[group_id].retain(|id| !is_split[*id]);

            let new_group_id = state_ids_by_group_id.len();
            for &id in &split_state_ids {
                group_ids_by_state_id[id] = new_group_id;
                // Reset the flag so the next group starts from a clean slate.
                is_split[id] = false;
            }

            state_ids_by_group_id.push(split_state_ids);
        }

        group_id += 1;
    }

    result
}
|
||||
|
|
|
|||
|
|
@ -79,10 +79,10 @@ impl<'a> TokenConflictMap<'a> {
|
|||
|
||||
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
|
||||
let status = &self.status_matrix[matrix_index(self.n, i, j)];
|
||||
status.does_match_separators ||
|
||||
status.matches_prefix ||
|
||||
status.matches_same_string ||
|
||||
status.does_match_continuation
|
||||
status.does_match_separators
|
||||
|| status.matches_prefix
|
||||
|| status.matches_same_string
|
||||
|| status.does_match_continuation
|
||||
}
|
||||
|
||||
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
|
||||
|
|
|
|||
|
|
@ -69,8 +69,8 @@ pub(crate) struct AdvanceAction {
|
|||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub(crate) struct LexState {
|
||||
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
|
||||
pub accept_action: Option<Symbol>,
|
||||
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
|
|
@ -152,39 +152,3 @@ impl ParseAction {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LexState {
    /// Compares this state (index `left_state`) with `other` (index
    /// `right_state`) for the purpose of merging.
    ///
    /// The states are considered equal when they have the same accept action
    /// and the same number of advance actions, and every positionally paired
    /// action agrees on its character set and `in_main_token` flag and has
    /// compatible successors. Successors are compatible when they are the same
    /// state, or when they are exactly the two states being compared (each
    /// state pointing at itself, or each pointing at the other).
    pub fn equals(&self, other: &LexState, left_state: usize, right_state: usize) -> bool {
        let same_shape = self.accept_action == other.accept_action
            && self.advance_actions.len() == other.advance_actions.len();
        if !same_shape {
            return false;
        }

        self.advance_actions
            .iter()
            .zip(&other.advance_actions)
            .all(|(left, right)| {
                if left.0 != right.0 || left.1.in_main_token != right.1.in_main_token {
                    return false;
                }

                let left_successor = left.1.state;
                let right_successor = right.1.state;

                // Different successors are tolerated only when they are the
                // compared states themselves, in either arrangement.
                left_successor == right_successor
                    || (left_successor == left_state && right_successor == right_state)
                    || (left_successor == right_state && right_successor == left_state)
            })
    }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue