From 1b5ae380ce57a4e65a9a5f7255dfe070cc3f3147 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 11 Dec 2019 17:18:15 -0800
Subject: [PATCH] Don't attempt to extract keywords that don't entirely match
 word token (#505)

---
 cli/src/generate/build_tables/mod.rs          |  1 +
 .../generate/build_tables/token_conflicts.rs  | 41 +++++++++++++++----
 2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs
index e0f84244..2e5d2f57 100644
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@@ -271,6 +271,7 @@ fn identify_keywords(
             cursor.reset(vec![variable.start_state]);
             if all_chars_are_alphabetical(&cursor)
                 && token_conflict_map.does_match_same_string(i, word_token.index)
+                && !token_conflict_map.does_match_different_string(i, word_token.index)
             {
                 info!(
                     "Keywords - add candidate {}",
diff --git a/cli/src/generate/build_tables/token_conflicts.rs b/cli/src/generate/build_tables/token_conflicts.rs
index a71dcca6..64e7564b 100644
--- a/cli/src/generate/build_tables/token_conflicts.rs
+++ b/cli/src/generate/build_tables/token_conflicts.rs
@@ -13,6 +13,7 @@ struct TokenConflictStatus {
     does_match_valid_continuation: bool,
     does_match_separators: bool,
     matches_same_string: bool,
+    matches_different_string: bool,
 }
 
 pub(crate) struct TokenConflictMap<'a> {
@@ -25,6 +26,12 @@ pub(crate) struct TokenConflictMap<'a> {
 }
 
 impl<'a> TokenConflictMap<'a> {
+    /// Create a token conflict map based on a lexical grammar, which describes the structure
+    /// of each token, and a `following_tokens` map, which indicates which tokens may appear
+    /// immediately after each other token.
+    ///
+    /// This analyzes the possible kinds of overlap between each pair of tokens and stores
+    /// them in a matrix.
     pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
         let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
         let starting_chars = get_starting_chars(&mut cursor, grammar);
@@ -50,12 +57,21 @@ impl<'a> TokenConflictMap<'a> {
         }
     }
 
+    /// Do tokens `a` and `b` have exactly the same conflict status with respect to
+    /// token `other`?
     pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
         let left = &self.status_matrix[matrix_index(self.n, a, other)];
         let right = &self.status_matrix[matrix_index(self.n, b, other)];
         left == right
     }
 
+    /// Does token `i` match any strings that token `j` does *not* match?
+    pub fn does_match_different_string(&self, i: usize, j: usize) -> bool {
+        self.status_matrix[matrix_index(self.n, i, j)].matches_different_string
+    }
+
+    /// Does token `i` match any strings that token `j` also matches, where
+    /// token `i` is preferred over token `j`?
     pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
         self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
     }
@@ -67,6 +83,7 @@ impl<'a> TokenConflictMap<'a> {
             || entry.matches_same_string
     }
 
+    /// Does token `i` match any strings that are *prefixes* of strings matched by `j`?
     pub fn does_match_prefix(&self, i: usize, j: usize) -> bool {
         self.status_matrix[matrix_index(self.n, i, j)].matches_prefix
     }
@@ -239,19 +256,29 @@ fn compute_conflict_status(
     );
 
     while let Some(state_set) = state_set_queue.pop() {
-        // Don't pursue states where there's no potential for conflict.
-        if grammar.variable_indices_for_nfa_states(&state_set).count() > 1 {
-            cursor.reset(state_set);
-        } else {
+        let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set);
+
+        // If only one of the two tokens could possibly match from this state, then
+        // there is no reason to analyze any of its successors. Just record the fact
+        // that the token matches a string that the other token does not match.
+        let first_live_variable_index = live_variable_indices.next().unwrap();
+        if live_variable_indices.count() == 0 {
+            if first_live_variable_index == i {
+                result.0.matches_different_string = true;
+            } else {
+                result.1.matches_different_string = true;
+            }
             continue;
         }
 
-        let has_sep = cursor.transition_chars().any(|(_, sep)| sep);
+        // Both tokens are still live in this state set, so examine it for conflicts.
+        cursor.reset(state_set);
+        let within_separator = cursor.transition_chars().any(|(_, sep)| sep);
 
         // Examine each possible completed token in this state.
         let mut completion = None;
         for (id, precedence) in cursor.completions() {
-            if has_sep {
+            if within_separator {
                 if id == i {
                     result.0.does_match_separators = true;
                 } else {
@@ -316,7 +343,7 @@ fn compute_conflict_status(
                         &transition,
                         completed_id,
                         completed_precedence,
-                        has_sep,
+                        within_separator,
                     ) {
                         can_advance = true;
                         if advanced_id == i {