From 0dd41f0d743231bde0801aa6f8852d45c2f69075 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Fri, 15 Jun 2018 10:26:58 -0700
Subject: [PATCH] Restore logic for restricting keyword tokens

Removing this restriction created problems for the Rust grammar, and
possibly others. The proper fix would be to ensure that the 'word
token' matches *every* possible string that a 'keyword token'
matches, as opposed to just matching *some* of the same strings.
This would require us to gather a little more information
about how tokens conflict. For now, I'm just going to put back the
hard-coded logic that we had.
---
 .../build_tables/lex_table_builder.cc         | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index eb7e6b41..0b309c7d 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -5,6 +5,7 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
+#include <cwctype>
 #include <vector>
 #include "compiler/build_tables/lex_item.h"
 #include "compiler/build_tables/lookahead_set.h"
@@ -26,6 +27,7 @@ using std::vector;
 using std::unordered_map;
 using std::unordered_set;
 using std::unique_ptr;
+using std::iswalpha;
 using rules::Rule;
 using rules::Blank;
 using rules::Choice;
@@ -102,12 +104,11 @@ class LexTableBuilderImpl : public LexTableBuilder {
     // Compute the set of characters that each token can start with and the set of non-separator
     // characters that can follow each token. Also identify all of the tokens that can be
     // considered 'keywords'.
-    LOG_START("characterizing tokens");
+    LOG("characterizing tokens");
     for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) {
       Symbol token = Symbol::terminal(i);
 
       add_starting_characters(&starting_characters_by_token[i], grammar.variables[i].rule);
-
       const auto &following_tokens = following_tokens_by_token.find(token);
       if (following_tokens != following_tokens_by_token.end()) {
         following_tokens->second.for_each([&](Symbol following_token) {
@@ -119,7 +120,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
         });
       }
     }
-    LOG_END();
 
     // For each pair of tokens, generate a lex table for just those two tokens and record what
     // conflicts arise.
@@ -140,15 +140,24 @@ class LexTableBuilderImpl : public LexTableBuilder {
     }
     LOG_END();
 
-    if (word_token != rules::NONE()) {
-      identify_keywords();
-    }
+    if (word_token != rules::NONE()) identify_keywords();
   }
 
   void identify_keywords() {
     LookaheadSet homonyms;
     for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
       Symbol other_token = Symbol::terminal(j);
+
+      // For now, only consider tokens as 'keywords' if they start with letters or underscores.
+      bool starts_with_letter = !starting_characters_by_token[j].includes_all;
+      for (auto character : starting_characters_by_token[j].included_chars) {
+        if (!iswalpha(character) && character != '_') {
+          starts_with_letter = false;
+          break;
+        }
+      }
+      if (!starts_with_letter) continue;
+
       if (get_conflict_status(word_token, other_token) == MatchesSameString) {
         homonyms.insert(other_token);
       }
@@ -177,8 +186,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
       if (word_rule_shadows_other || other_shadows_word_rule) {
         homonyms.for_each([&](Symbol homonym) {
-          bool other_shadows_homonym = get_conflict_status(homonym, other_token);
-
           bool word_rule_was_already_present = true;
           for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
             if (!parse_table->states[state_id].has_terminal_entry(word_token)) {
@@ -188,14 +195,17 @@ class LexTableBuilderImpl : public LexTableBuilder {
           }
           if (word_rule_was_already_present) return true;
 
-          if (word_rule_shadows_other) {
+          bool homonym_shadows_other = get_conflict_status(other_token, homonym);
+          bool other_shadows_homonym = get_conflict_status(homonym, other_token);
+
+          if (word_rule_shadows_other != homonym_shadows_other) {
             homonyms.remove(homonym);
             LOG(
               "remove %s because word_token would shadow %s",
               token_name(homonym).c_str(),
               token_name(other_token).c_str()
             );
-          } else if (other_shadows_word_rule && !other_shadows_homonym) {
+          } else if (other_shadows_word_rule != other_shadows_homonym) {
             homonyms.remove(homonym);
             LOG(
               "remove %s because %s would shadow word_token",