Fix another bug in lex state merging

Reuse more logic for lex and parse state merging algorithms
This commit is contained in:
Max Brunsfeld 2019-06-21 13:12:09 -07:00
parent fcd8913233
commit 223a656fc8
6 changed files with 155 additions and 184 deletions

View file

@ -1,5 +1,6 @@
use super::coincident_tokens::CoincidentTokenIndex;
use super::item::TokenSet;
use super::split_state_id_groups;
use super::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor};
@ -7,7 +8,7 @@ use crate::generate::rules::Symbol;
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use log::info;
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::collections::{HashMap, VecDeque};
use std::mem;
pub(crate) fn build_lex_table(
@ -251,13 +252,15 @@ fn merge_token_set(
};
for existing_token in set_without_terminal.terminals() {
if token_conflict_map.does_conflict(i, existing_token.index) ||
token_conflict_map.does_match_prefix(i, existing_token.index) {
if token_conflict_map.does_conflict(i, existing_token.index)
|| token_conflict_map.does_match_prefix(i, existing_token.index)
{
return false;
}
if !coincident_token_index.contains(symbol, existing_token) {
if token_conflict_map.does_overlap(existing_token.index, i) ||
token_conflict_map.does_overlap(i, existing_token.index) {
if token_conflict_map.does_overlap(existing_token.index, i)
|| token_conflict_map.does_overlap(i, existing_token.index)
{
return false;
}
}
@ -269,76 +272,86 @@ fn merge_token_set(
}
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
let mut state_replacements = BTreeMap::new();
let mut done = false;
while !done {
done = true;
for (i, state_i) in table.states.iter().enumerate() {
if state_replacements.contains_key(&i) {
continue;
}
for (j, state_j) in table.states.iter().enumerate() {
if j == i {
break;
}
if state_replacements.contains_key(&j) {
continue;
}
if state_i.equals(state_j, i, j) {
info!("replace state {} with state {}", i, j);
state_replacements.insert(i, j);
done = false;
break;
}
}
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = state_replacements
.get(&advance_action.state)
.cloned()
.unwrap_or(advance_action.state);
}
// Initially group the states by their accept action and their
// valid lookahead characters.
let mut state_ids_by_signature = HashMap::new();
for (i, state) in table.states.iter().enumerate() {
let signature = (
i == 0,
state.accept_action,
state
.advance_actions
.iter()
.map(|(characters, action)| (characters.clone(), action.in_main_token))
.collect::<Vec<_>>(),
);
state_ids_by_signature
.entry(signature)
.or_insert(Vec::new())
.push(i);
}
let mut state_ids_by_group_id = state_ids_by_signature
.into_iter()
.map(|e| e.1)
.collect::<Vec<_>>();
let error_group_index = state_ids_by_group_id
.iter()
.position(|g| g.contains(&0))
.unwrap();
state_ids_by_group_id.swap(error_group_index, 0);
let mut group_ids_by_state_id = vec![0; table.states.len()];
for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
for state_id in state_ids {
group_ids_by_state_id[*state_id] = group_id;
}
}
let final_state_replacements = (0..table.states.len())
.into_iter()
.map(|state_id| {
let replacement = state_replacements
.get(&state_id)
.cloned()
.unwrap_or(state_id);
let prior_removed = state_replacements
.iter()
.take_while(|i| *i.0 < replacement)
.count();
replacement - prior_removed
})
.collect::<Vec<_>>();
while split_state_id_groups(
&table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
1,
lex_states_differ,
) {
continue;
}
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id {
let mut new_state = LexState::default();
mem::swap(&mut new_state, &mut table.states[state_ids[0]]);
for (_, advance_action) in new_state.advance_actions.iter_mut() {
advance_action.state = group_ids_by_state_id[advance_action.state];
}
new_states.push(new_state);
}
for state in parse_table.states.iter_mut() {
state.lex_state_id = final_state_replacements[state.lex_state_id];
state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
}
for state in table.states.iter_mut() {
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = final_state_replacements[advance_action.state];
}
}
table.states = new_states;
}
let mut i = 0;
table.states.retain(|_| {
let result = !state_replacements.contains_key(&i);
i += 1;
result
});
/// Returns true if two lex states (already known to share the same signature:
/// accept action, lookahead character sets, and `in_main_token` flags) lead to
/// successor states that currently belong to different groups.
fn lex_states_differ(
    left: &LexState,
    right: &LexState,
    group_ids_by_state_id: &Vec<usize>,
) -> bool {
    // Walk the two advance-action lists in lockstep and report a difference
    // as soon as a pair of successors falls into distinct groups.
    for (left_action, right_action) in left
        .advance_actions
        .iter()
        .zip(right.advance_actions.iter())
    {
        let left_group = group_ids_by_state_id[left_action.1.state];
        let right_group = group_ids_by_state_id[right_action.1.state];
        if left_group != right_group {
            return true;
        }
    }
    false
}
fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
// Get a mapping of old state index -> new_state_index
let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
&old_ids_by_new_id[1..].sort_unstable_by_key(|id| &table.states[*id]);
&old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);
// Get the inverse mapping
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];

View file

@ -745,7 +745,10 @@ fn populate_following_tokens(
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len()).into_iter().map(Symbol::terminal).collect::<TokenSet>();
let all_tokens = (0..result.len())
.into_iter()
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);

View file

@ -1,4 +1,5 @@
use super::item::TokenSet;
use super::split_state_id_groups;
use super::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol};
@ -126,15 +127,19 @@ impl<'a> Minimizer<'a> {
group_ids_by_state_id.push(state.core_id);
}
self.split_state_id_groups_by(
split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.states_conflict(left, right, groups),
);
while self.split_state_id_groups_by(
while split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.state_successors_differ(left, right, groups),
) {
continue;
@ -183,84 +188,6 @@ impl<'a> Minimizer<'a> {
self.parse_table.states = new_states;
}
/// Splits apart any group of parse states whose members are mutually
/// incompatible according to the predicate `f`. Returns true if any group
/// was split, so the caller can iterate until a fixed point is reached.
///
/// `state_ids_by_group_id` maps each group id to the state ids it contains;
/// `group_ids_by_state_id` is the inverse mapping. Both are updated in place.
/// `f(left, right, group_ids_by_state_id)` returns true when the two states
/// must not share a group.
fn split_state_id_groups_by(
    &self,
    state_ids_by_group_id: &mut Vec<Vec<ParseStateId>>,
    group_ids_by_state_id: &mut Vec<ParseStateId>,
    mut f: impl FnMut(&ParseState, &ParseState, &Vec<ParseStateId>) -> bool,
) -> bool {
    let mut result = false;

    // Examine each group of states, and split them up if necessary. For
    // each group of states, find a subgroup where all the states are mutually
    // compatible. Leave that subgroup in place, and split off all of the
    // other states in the group into a new group. Those states are not
    // necessarily mutually compatible, but they will be split up in later
    // iterations.
    let mut group_id = 0;
    while group_id < state_ids_by_group_id.len() {
        let state_ids = &state_ids_by_group_id[group_id];
        let mut split_state_ids = Vec::new();

        let mut i = 0;
        while i < state_ids.len() {
            let left_state_id = state_ids[i];
            // States already scheduled for removal can't anchor a comparison.
            if split_state_ids.contains(&left_state_id) {
                i += 1;
                continue;
            }
            let left_state = &self.parse_table.states[left_state_id];

            // Identify all of the other states in the group that are incompatible with
            // this state.
            let mut j = i + 1;
            while j < state_ids.len() {
                let right_state_id = state_ids[j];
                if split_state_ids.contains(&right_state_id) {
                    j += 1;
                    continue;
                }
                let right_state = &self.parse_table.states[right_state_id];
                if f(left_state, right_state, &group_ids_by_state_id) {
                    split_state_ids.push(right_state_id);
                }
                j += 1;
            }
            i += 1;
        }

        // If any states were removed from the group, add them all as a new group.
        if split_state_ids.len() > 0 {
            result = true;
            state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
            info!(
                "split state groups {:?} {:?}",
                state_ids_by_group_id[group_id], split_state_ids,
            );
            // The new group is appended, so its id is the current length.
            let new_group_id = state_ids_by_group_id.len();
            for id in &split_state_ids {
                group_ids_by_state_id[*id] = new_group_id;
            }
            // Move the split-off ids into the freshly appended group without
            // copying the vector.
            state_ids_by_group_id.push(Vec::new());
            mem::swap(
                &mut split_state_ids,
                state_ids_by_group_id.last_mut().unwrap(),
            );
        }
        group_id += 1;
    }
    result
}
fn states_conflict(
&self,
left_state: &ParseState,

View file

@ -355,3 +355,67 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
}
})
}
/// Generic partition-refinement step shared by the lex-table and parse-table
/// minimizers.
///
/// `state_ids_by_group_id` maps each group id to the ids of the states in
/// that group; `group_ids_by_state_id` is the inverse mapping. Starting at
/// `start_group_id`, each group is scanned and every state that the
/// predicate `f` reports as incompatible with the group's first surviving
/// member is split off into a new group appended at the end. Both mappings
/// are updated in place.
///
/// Returns true if any group was split; callers loop on this function until
/// it returns false, i.e. until the partition stabilizes.
fn split_state_id_groups<S>(
    states: &Vec<S>,
    state_ids_by_group_id: &mut Vec<Vec<usize>>,
    group_ids_by_state_id: &mut Vec<usize>,
    start_group_id: usize,
    mut f: impl FnMut(&S, &S, &Vec<usize>) -> bool,
) -> bool {
    let mut result = false;
    let mut group_id = start_group_id;
    while group_id < state_ids_by_group_id.len() {
        let state_ids = &state_ids_by_group_id[group_id];
        let mut split_state_ids = Vec::new();

        let mut i = 0;
        while i < state_ids.len() {
            let left_state_id = state_ids[i];
            // States already scheduled for removal can't anchor a comparison.
            if split_state_ids.contains(&left_state_id) {
                i += 1;
                continue;
            }
            let left_state = &states[left_state_id];

            // Identify all of the other states in the group that are incompatible with
            // this state. The split-off states are not necessarily mutually
            // compatible with each other; later iterations refine them further.
            let mut j = i + 1;
            while j < state_ids.len() {
                let right_state_id = state_ids[j];
                if split_state_ids.contains(&right_state_id) {
                    j += 1;
                    continue;
                }
                let right_state = &states[right_state_id];
                if f(left_state, right_state, &group_ids_by_state_id) {
                    split_state_ids.push(right_state_id);
                }
                j += 1;
            }
            i += 1;
        }

        // If any states were removed from the group, add them all as a new group.
        if split_state_ids.len() > 0 {
            result = true;
            state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
            // The new group's id is the current group count, since it is
            // pushed onto the end; update the inverse mapping to match.
            let new_group_id = state_ids_by_group_id.len();
            for id in &split_state_ids {
                group_ids_by_state_id[*id] = new_group_id;
            }
            state_ids_by_group_id.push(split_state_ids);
        }
        group_id += 1;
    }
    result
}

View file

@ -79,10 +79,10 @@ impl<'a> TokenConflictMap<'a> {
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
let status = &self.status_matrix[matrix_index(self.n, i, j)];
status.does_match_separators ||
status.matches_prefix ||
status.matches_same_string ||
status.does_match_continuation
status.does_match_separators
|| status.matches_prefix
|| status.matches_same_string
|| status.does_match_continuation
}
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {

View file

@ -69,8 +69,8 @@ pub(crate) struct AdvanceAction {
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct LexState {
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
pub accept_action: Option<Symbol>,
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
}
#[derive(Debug, PartialEq, Eq)]
@ -152,39 +152,3 @@ impl ParseAction {
}
}
}
impl LexState {
    /// Reports whether this lex state is interchangeable with `other`, where
    /// `left_state` and `right_state` are the two states' own indices in the
    /// lex table.
    ///
    /// The states must have identical accept actions and the same number of
    /// advance actions, with pairwise-equal character sets and
    /// `in_main_token` flags. Successor states may differ only in the
    /// self-referential cases handled below: each action pointing back at
    /// its own state, or each pointing at the other state being compared.
    pub fn equals(&self, other: &LexState, left_state: usize, right_state: usize) -> bool {
        if self.accept_action != other.accept_action {
            return false;
        }
        if self.advance_actions.len() != other.advance_actions.len() {
            return false;
        }
        for (left, right) in self
            .advance_actions
            .iter()
            .zip(other.advance_actions.iter())
        {
            // Character sets and in-main-token flags must match exactly.
            if left.0 != right.0 || left.1.in_main_token != right.1.in_main_token {
                return false;
            }
            let left_successor = left.1.state;
            let right_successor = right.1.state;
            // Two states can be equal if they have different successors but the successor
            // states are equal. The two allowed mismatches are: each action
            // advances to its own state, or the two actions advance to each
            // other's states.
            if left_successor != right_successor
                && (left_successor != left_state || right_successor != right_state)
                && (left_successor != right_state || right_successor != left_state)
            {
                return false;
            }
        }
        true
    }
}