From 223a656fc81647e2595c3da26577594f59e8b9a7 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Fri, 21 Jun 2019 13:12:09 -0700
Subject: [PATCH] Fix another bug in lex state merging

Reuse more logic for lex and parse state merging algorithms
---
 .../generate/build_tables/build_lex_table.rs  | 137 ++++++++++--------
 .../build_tables/build_parse_table.rs         |   5 +-
 .../build_tables/minimize_parse_table.rs      |  87 +----------
 cli/src/generate/build_tables/mod.rs          |  64 ++++++++
 .../generate/build_tables/token_conflicts.rs  |   8 +-
 cli/src/generate/tables.rs                    |  38 +----
 6 files changed, 155 insertions(+), 184 deletions(-)

diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs
index 0e3da4ff..f1d1acf1 100644
--- a/cli/src/generate/build_tables/build_lex_table.rs
+++ b/cli/src/generate/build_tables/build_lex_table.rs
@@ -1,5 +1,6 @@
 use super::coincident_tokens::CoincidentTokenIndex;
 use super::item::TokenSet;
+use super::split_state_id_groups;
 use super::token_conflicts::TokenConflictMap;
 use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
 use crate::generate::nfa::{CharacterSet, NfaCursor};
@@ -7,7 +8,7 @@ use crate::generate::rules::Symbol;
 use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
 use log::info;
 use std::collections::hash_map::Entry;
-use std::collections::{BTreeMap, HashMap, VecDeque};
+use std::collections::{HashMap, VecDeque};
 use std::mem;
 
 pub(crate) fn build_lex_table(
@@ -251,13 +252,15 @@ fn merge_token_set(
         };
 
         for existing_token in set_without_terminal.terminals() {
-            if token_conflict_map.does_conflict(i, existing_token.index) ||
-               token_conflict_map.does_match_prefix(i, existing_token.index) {
+            if token_conflict_map.does_conflict(i, existing_token.index)
+                || token_conflict_map.does_match_prefix(i, existing_token.index)
+            {
                 return false;
             }
             if !coincident_token_index.contains(symbol, existing_token) {
-                if token_conflict_map.does_overlap(existing_token.index, i) ||
-                   token_conflict_map.does_overlap(i, existing_token.index) {
+                if token_conflict_map.does_overlap(existing_token.index, i)
+                    || token_conflict_map.does_overlap(i, existing_token.index)
+                {
                     return false;
                 }
             }
@@ -269,76 +272,86 @@ fn merge_token_set(
 }
 
 fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
-    let mut state_replacements = BTreeMap::new();
-    let mut done = false;
-    while !done {
-        done = true;
-        for (i, state_i) in table.states.iter().enumerate() {
-            if state_replacements.contains_key(&i) {
-                continue;
-            }
-            for (j, state_j) in table.states.iter().enumerate() {
-                if j == i {
-                    break;
-                }
-                if state_replacements.contains_key(&j) {
-                    continue;
-                }
-                if state_i.equals(state_j, i, j) {
-                    info!("replace state {} with state {}", i, j);
-                    state_replacements.insert(i, j);
-                    done = false;
-                    break;
-                }
-            }
-        }
-        for state in table.states.iter_mut() {
-            for (_, advance_action) in state.advance_actions.iter_mut() {
-                advance_action.state = state_replacements
-                    .get(&advance_action.state)
-                    .cloned()
-                    .unwrap_or(advance_action.state);
-            }
+    // Initially group the states by their accept action and their
+    // valid lookahead characters.
+    let mut state_ids_by_signature = HashMap::new();
+    for (i, state) in table.states.iter().enumerate() {
+        let signature = (
+            i == 0,
+            state.accept_action,
+            state
+                .advance_actions
+                .iter()
+                .map(|(characters, action)| (characters.clone(), action.in_main_token))
+                .collect::<Vec<_>>(),
+        );
+        state_ids_by_signature
+            .entry(signature)
+            .or_insert(Vec::new())
+            .push(i);
+    }
+    let mut state_ids_by_group_id = state_ids_by_signature
+        .into_iter()
+        .map(|e| e.1)
+        .collect::<Vec<_>>();
+    let error_group_index = state_ids_by_group_id
+        .iter()
+        .position(|g| g.contains(&0))
+        .unwrap();
+    state_ids_by_group_id.swap(error_group_index, 0);
+
+    let mut group_ids_by_state_id = vec![0; table.states.len()];
+    for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
+        for state_id in state_ids {
+            group_ids_by_state_id[*state_id] = group_id;
         }
     }
 
-    let final_state_replacements = (0..table.states.len())
-        .into_iter()
-        .map(|state_id| {
-            let replacement = state_replacements
-                .get(&state_id)
-                .cloned()
-                .unwrap_or(state_id);
-            let prior_removed = state_replacements
-                .iter()
-                .take_while(|i| *i.0 < replacement)
-                .count();
-            replacement - prior_removed
-        })
-        .collect::<Vec<_>>();
+    while split_state_id_groups(
+        &table.states,
+        &mut state_ids_by_group_id,
+        &mut group_ids_by_state_id,
+        1,
+        lex_states_differ,
+    ) {
+        continue;
+    }
+
+    let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
+    for state_ids in &state_ids_by_group_id {
+        let mut new_state = LexState::default();
+        mem::swap(&mut new_state, &mut table.states[state_ids[0]]);
+
+        for (_, advance_action) in new_state.advance_actions.iter_mut() {
+            advance_action.state = group_ids_by_state_id[advance_action.state];
+        }
+        new_states.push(new_state);
+    }
 
     for state in parse_table.states.iter_mut() {
-        state.lex_state_id = final_state_replacements[state.lex_state_id];
+        state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
     }
 
-    for state in table.states.iter_mut() {
-        for (_, advance_action) in state.advance_actions.iter_mut() {
-            advance_action.state = final_state_replacements[advance_action.state];
-        }
-    }
+    table.states = new_states;
+}
 
-    let mut i = 0;
-    table.states.retain(|_| {
-        let result = !state_replacements.contains_key(&i);
-        i += 1;
-        result
-    });
+fn lex_states_differ(
+    left: &LexState,
+    right: &LexState,
+    group_ids_by_state_id: &Vec<usize>,
+) -> bool {
+    left.advance_actions
+        .iter()
+        .zip(right.advance_actions.iter())
+        .any(|(left, right)| {
+            group_ids_by_state_id[left.1.state] != group_ids_by_state_id[right.1.state]
+        })
 }
 
 fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
     // Get a mapping of old state index -> new_state_index
     let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
-    &old_ids_by_new_id[1..].sort_unstable_by_key(|id| &table.states[*id]);
+    &old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);
 
     // Get the inverse mapping
     let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs
index c3372091..f7c3b42a 100644
--- a/cli/src/generate/build_tables/build_parse_table.rs
+++ b/cli/src/generate/build_tables/build_parse_table.rs
@@ -745,7 +745,10 @@ fn populate_following_tokens(
         .iter()
         .flat_map(|v| &v.productions)
         .chain(&inlines.productions);
-    let all_tokens = (0..result.len()).into_iter().map(Symbol::terminal).collect::<TokenSet>();
+    let all_tokens = (0..result.len())
+        .into_iter()
+        .map(Symbol::terminal)
+        .collect::<TokenSet>();
     for production in productions {
         for i in 1..production.steps.len() {
             let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs
index 5ab8eaf6..d1823fc7 100644
--- a/cli/src/generate/build_tables/minimize_parse_table.rs
+++ b/cli/src/generate/build_tables/minimize_parse_table.rs
@@ -1,4 +1,5 @@
 use super::item::TokenSet;
+use super::split_state_id_groups;
 use super::token_conflicts::TokenConflictMap;
 use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
 use crate::generate::rules::{AliasMap, Symbol};
@@ -126,15 +127,19 @@ impl<'a> Minimizer<'a> {
             group_ids_by_state_id.push(state.core_id);
         }
 
-        self.split_state_id_groups_by(
+        split_state_id_groups(
+            &self.parse_table.states,
             &mut state_ids_by_group_id,
             &mut group_ids_by_state_id,
+            0,
             |left, right, groups| self.states_conflict(left, right, groups),
         );
 
-        while self.split_state_id_groups_by(
+        while split_state_id_groups(
+            &self.parse_table.states,
             &mut state_ids_by_group_id,
             &mut group_ids_by_state_id,
+            0,
             |left, right, groups| self.state_successors_differ(left, right, groups),
         ) {
             continue;
@@ -183,84 +188,6 @@ impl<'a> Minimizer<'a> {
         self.parse_table.states = new_states;
     }
 
-    fn split_state_id_groups_by(
-        &self,
-        state_ids_by_group_id: &mut Vec<Vec<ParseStateId>>,
-        group_ids_by_state_id: &mut Vec<ParseStateId>,
-        mut f: impl FnMut(&ParseState, &ParseState, &Vec<ParseStateId>) -> bool,
-    ) -> bool {
-        let mut result = false;
-
-        // Examine each group of states, and split them up if necessary. For
-        // each group of states, find a subgroup where all the states are mutually
-        // compatible. Leave that subgroup in place, and split off all of the
-        // other states in the group into a new group. Those states are not
-        // necessarily mutually compatible, but they will be split up in later
-        // iterations.
-        let mut group_id = 0;
-        while group_id < state_ids_by_group_id.len() {
-            let state_ids = &state_ids_by_group_id[group_id];
-            let mut split_state_ids = Vec::new();
-
-            let mut i = 0;
-            while i < state_ids.len() {
-                let left_state_id = state_ids[i];
-                if split_state_ids.contains(&left_state_id) {
-                    i += 1;
-                    continue;
-                }
-
-                let left_state = &self.parse_table.states[left_state_id];
-
-                // Identify all of the other states in the group that are incompatible with
-                // this state.
-                let mut j = i + 1;
-                while j < state_ids.len() {
-                    let right_state_id = state_ids[j];
-                    if split_state_ids.contains(&right_state_id) {
-                        j += 1;
-                        continue;
-                    }
-                    let right_state = &self.parse_table.states[right_state_id];
-
-                    if f(left_state, right_state, &group_ids_by_state_id) {
-                        split_state_ids.push(right_state_id);
-                    }
-
-                    j += 1;
-                }
-
-                i += 1;
-            }
-
-            // If any states were removed from the group, add them all as a new group.
-            if split_state_ids.len() > 0 {
-                result = true;
-                state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
-
-                info!(
-                    "split state groups {:?} {:?}",
-                    state_ids_by_group_id[group_id], split_state_ids,
-                );
-
-                let new_group_id = state_ids_by_group_id.len();
-                for id in &split_state_ids {
-                    group_ids_by_state_id[*id] = new_group_id;
-                }
-
-                state_ids_by_group_id.push(Vec::new());
-                mem::swap(
-                    &mut split_state_ids,
-                    state_ids_by_group_id.last_mut().unwrap(),
-                );
-            }
-
-            group_id += 1;
-        }
-
-        result
-    }
-
     fn states_conflict(
         &self,
         left_state: &ParseState,
diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs
index de28cda3..7b33d4c3 100644
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@@ -355,3 +355,67 @@ fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
         }
     })
 }
+
+fn split_state_id_groups<S>(
+    states: &Vec<S>,
+    state_ids_by_group_id: &mut Vec<Vec<usize>>,
+    group_ids_by_state_id: &mut Vec<usize>,
+    start_group_id: usize,
+    mut f: impl FnMut(&S, &S, &Vec<usize>) -> bool,
+) -> bool {
+    let mut result = false;
+
+    let mut group_id = start_group_id;
+    while group_id < state_ids_by_group_id.len() {
+        let state_ids = &state_ids_by_group_id[group_id];
+        let mut split_state_ids = Vec::new();
+
+        let mut i = 0;
+        while i < state_ids.len() {
+            let left_state_id = state_ids[i];
+            if split_state_ids.contains(&left_state_id) {
+                i += 1;
+                continue;
+            }
+
+            let left_state = &states[left_state_id];
+
+            // Identify all of the other states in the group that are incompatible with
+            // this state.
+            let mut j = i + 1;
+            while j < state_ids.len() {
+                let right_state_id = state_ids[j];
+                if split_state_ids.contains(&right_state_id) {
+                    j += 1;
+                    continue;
+                }
+                let right_state = &states[right_state_id];
+
+                if f(left_state, right_state, &group_ids_by_state_id) {
+                    split_state_ids.push(right_state_id);
+                }
+
+                j += 1;
+            }
+
+            i += 1;
+        }
+
+        // If any states were removed from the group, add them all as a new group.
+        if split_state_ids.len() > 0 {
+            result = true;
+            state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
+
+            let new_group_id = state_ids_by_group_id.len();
+            for id in &split_state_ids {
+                group_ids_by_state_id[*id] = new_group_id;
+            }
+
+            state_ids_by_group_id.push(split_state_ids);
+        }
+
+        group_id += 1;
+    }
+
+    result
+}
diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs
index 303e79fb..a7737eb3 100644
--- a/cli/src/generate/build_tables/token_conflicts.rs
+++ b/cli/src/generate/build_tables/token_conflicts.rs
@@ -79,10 +79,10 @@ impl<'a> TokenConflictMap<'a> {
 
     pub fn does_overlap(&self, i: usize, j: usize) -> bool {
         let status = &self.status_matrix[matrix_index(self.n, i, j)];
-        status.does_match_separators ||
-            status.matches_prefix ||
-            status.matches_same_string ||
-            status.does_match_continuation
+        status.does_match_separators
+            || status.matches_prefix
+            || status.matches_same_string
+            || status.does_match_continuation
     }
 
     pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs
index f7d96556..75fb7710 100644
--- a/cli/src/generate/tables.rs
+++ b/cli/src/generate/tables.rs
@@ -69,8 +69,8 @@ pub(crate) struct AdvanceAction {
 
 #[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
 pub(crate) struct LexState {
-    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
     pub accept_action: Option<Symbol>,
+    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
 }
 
 #[derive(Debug, PartialEq, Eq)]
@@ -152,39 +152,3 @@ impl ParseAction {
         }
     }
 }
-
-impl LexState {
-    pub fn equals(&self, other: &LexState, left_state: usize, right_state: usize) -> bool {
-        if self.accept_action != other.accept_action {
-            return false;
-        }
-
-        if self.advance_actions.len() != other.advance_actions.len() {
-            return false;
-        }
-
-        for (left, right) in self
-            .advance_actions
-            .iter()
-            .zip(other.advance_actions.iter())
-        {
-            if left.0 != right.0 || left.1.in_main_token != right.1.in_main_token {
-                return false;
-            }
-
-            let left_successor = left.1.state;
-            let right_successor = right.1.state;
-
-            // Two states can be equal if they have different successors but the successor
-            // states are equal.
-            if left_successor != right_successor
-                && (left_successor != left_state || right_successor != right_state)
-                && (left_successor != right_state || right_successor != left_state)
-            {
-                return false;
-            }
-        }
-
-        true
-    }
-}