From 70dc79b41261848cd5302049a0ded86111be1ba6 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Thu, 20 Jun 2019 14:05:15 -0700
Subject: [PATCH] Merge lex states more liberally

---
 .../generate/build_tables/build_lex_table.rs  | 11 +++-
 .../build_tables/build_parse_table.rs         |  9 +++
 .../generate/build_tables/token_conflicts.rs  | 62 ++++++++++++-------
 3 files changed, 57 insertions(+), 25 deletions(-)
diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs
index 96e6cfea..0e3da4ff 100644
--- a/cli/src/generate/build_tables/build_lex_table.rs
+++ b/cli/src/generate/build_tables/build_lex_table.rs
@@ -251,11 +251,16 @@ fn merge_token_set(
         };
 
         for existing_token in set_without_terminal.terminals() {
-            if token_conflict_map.does_conflict(i, existing_token.index)
-                || !coincident_token_index.contains(symbol, existing_token)
-            {
+            if token_conflict_map.does_conflict(i, existing_token.index) ||
+               token_conflict_map.does_match_prefix(i, existing_token.index) {
                 return false;
             }
+            // A pair absent from the coincident index must not overlap in either direction.
+            if !coincident_token_index.contains(symbol, existing_token)
+                && (token_conflict_map.does_overlap(existing_token.index, i)
+                    || token_conflict_map.does_overlap(i, existing_token.index)) {
+                return false;
+            }
         }
     }
 
diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs
index b0ad2a40..c3372091 100644
--- a/cli/src/generate/build_tables/build_parse_table.rs
+++ b/cli/src/generate/build_tables/build_parse_table.rs
@@ -745,6 +745,7 @@ fn populate_following_tokens(
         .iter()
         .flat_map(|v| &v.productions)
         .chain(&inlines.productions);
+    let all_tokens = (0..result.len()).map(Symbol::terminal).collect::<TokenSet>();
     for production in productions {
         for i in 1..production.steps.len() {
             let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
@@ -756,6 +757,14 @@ fn populate_following_tokens(
             }
         }
     }
+    // Each terminal extra is inserted into every entry of `result`, and the
+    // extra's own entry is then replaced by a copy of `all_tokens`.
+    for extra in grammar.extra_tokens.iter().filter(|extra| extra.is_terminal()) {
+        for entry in result.iter_mut() {
+            entry.insert(*extra);
+        }
+        result[extra.index] = all_tokens.clone();
+    }
 }
 
 pub(crate) fn build_parse_table(
diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs
index 5c8b3ff5..303e79fb 100644
--- a/cli/src/generate/build_tables/token_conflicts.rs
+++ b/cli/src/generate/build_tables/token_conflicts.rs
@@ -7,7 +7,8 @@ use std::fmt;
 
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 struct TokenConflictStatus {
-    does_overlap: bool,
+    matches_prefix: bool,
+    does_match_continuation: bool,
     does_match_valid_continuation: bool,
     does_match_separators: bool,
     matches_same_string: bool,
@@ -65,6 +66,10 @@ impl<'a> TokenConflictMap<'a> {
             || entry.matches_same_string
     }
 
+    pub fn does_match_prefix(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].matches_prefix // ordered pair (i, j)
+    }
+
     pub fn does_match_shorter_or_longer(&self, i: usize, j: usize) -> bool {
         let entry = &self.status_matrix[matrix_index(self.n, i, j)];
         let reverse_entry = &self.status_matrix[matrix_index(self.n, j, i)];
@@ -73,7 +78,11 @@ impl<'a> TokenConflictMap<'a> {
     }
 
     pub fn does_overlap(&self, i: usize, j: usize) -> bool {
-        self.status_matrix[matrix_index(self.n, i, j)].does_overlap
+        let entry = &self.status_matrix[matrix_index(self.n, i, j)];
+        entry.does_match_separators
+            || entry.matches_prefix
+            || entry.matches_same_string
+            || entry.does_match_continuation
     }
 
     pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
@@ -238,6 +247,7 @@ fn compute_conflict_status(
 
         let has_sep = cursor.transition_chars().any(|(_, sep)| sep);
 
+        // Examine each possible completed token in this state.
         let mut completion = None;
         for (id, precedence) in cursor.completions() {
             if has_sep {
@@ -248,13 +258,14 @@ fn compute_conflict_status(
                 }
             }
 
+            // If the other token has already completed, then this is
+            // a same-string conflict.
             if let Some((prev_id, prev_precedence)) = completion {
                 if id == prev_id {
                     continue;
                 }
 
-                // Prefer tokens with higher precedence. For tokens with equal precedence,
-                // prefer those listed earlier in the grammar.
+                // Determine which of the two tokens is preferred.
                 let preferred_id;
                 if TokenConflictMap::prefer_token(
                     grammar,
@@ -269,32 +280,37 @@ fn compute_conflict_status(
 
                 if preferred_id == i {
                     result.0.matches_same_string = true;
-                    result.0.does_overlap = true;
                 } else {
                     result.1.matches_same_string = true;
-                    result.1.does_overlap = true;
                 }
             } else {
                 completion = Some((id, precedence));
             }
         }
 
+        // Examine each possible transition from this state to detect substring conflicts.
         for transition in cursor.transitions() {
             let mut can_advance = true;
+
+            // If there is already a completed token in this state, then determine
+            // if the next state can also match the completed token. If so, then
+            // this is *not* a conflict.
             if let Some((completed_id, completed_precedence)) = completion {
-                let mut other_id = None;
+                let mut advanced_id = None;
                 let mut successor_contains_completed_id = false;
                 for variable_id in grammar.variable_indices_for_nfa_states(&transition.states) {
                     if variable_id == completed_id {
                         successor_contains_completed_id = true;
                         break;
                     } else {
-                        other_id = Some(variable_id);
+                        advanced_id = Some(variable_id);
                     }
                 }
 
-                if let (Some(other_id), false) = (other_id, successor_contains_completed_id) {
-                    let preferred_id = if TokenConflictMap::prefer_transition(
+                // Determine which action is preferred: matching the already-completed
+                // token, or continuing on to try to match the other, longer token.
+                if let (Some(advanced_id), false) = (advanced_id, successor_contains_completed_id) {
+                    if TokenConflictMap::prefer_transition(
                         grammar,
                         &transition,
                         completed_id,
@@ -302,20 +318,22 @@ fn compute_conflict_status(
                         has_sep,
                     ) {
                         can_advance = true;
-                        other_id
-                    } else {
-                        completed_id
-                    };
-
-                    if preferred_id == i {
-                        result.0.does_overlap = true;
-                        if transition.characters.does_intersect(&following_chars[j]) {
-                            result.0.does_match_valid_continuation = true;
+                        if advanced_id == i {
+                            result.0.does_match_continuation = true;
+                            if transition.characters.does_intersect(&following_chars[j]) {
+                                result.0.does_match_valid_continuation = true;
+                            }
+                        } else {
+                            result.1.does_match_continuation = true;
+                            if transition.characters.does_intersect(&following_chars[i]) {
+                                result.1.does_match_valid_continuation = true;
+                            }
                         }
                     } else {
-                        result.1.does_overlap = true;
-                        if transition.characters.does_intersect(&following_chars[i]) {
-                            result.1.does_match_valid_continuation = true;
+                        if completed_id == i {
+                            result.0.matches_prefix = true;
+                        } else {
+                            result.1.matches_prefix = true;
                         }
                     }
                 }