From e17cd42e47bc3a1543dd2eef5692361534ec29f2 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Wed, 13 Jun 2018 16:54:11 -0700
Subject: [PATCH] Perform keyword optimization using explicitly selected word token rather than trying to infer the word token automatically.

Co-Authored-By: Ashi Krishnan
---
 docs/section-3-creating-parsers.md           |  24 +++
 include/tree_sitter/compiler.h               |   1 +
 .../build_tables/lex_table_builder.cc        | 180 ++++++++----------
 src/compiler/grammar.h                       |   1 +
 src/compiler/parse_grammar.cc                |  14 +-
 .../prepare_grammar/expand_repeats.cc        |   1 +
 .../prepare_grammar/extract_tokens.cc        |  12 ++
 .../prepare_grammar/flatten_grammar.cc       |   2 +
 .../prepare_grammar/initial_syntax_grammar.h |   1 +
 .../prepare_grammar/intern_symbols.cc        |   2 +
 .../prepare_grammar/interned_grammar.h       |   2 +-
 src/compiler/syntax_grammar.h                |   1 +
 12 files changed, 142 insertions(+), 99 deletions(-)

diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md
index 268b034a..8be78926 100644
--- a/docs/section-3-creating-parsers.md
+++ b/docs/section-3-creating-parsers.md
@@ -217,6 +217,7 @@ In addition to the `name` and `rules` fields, grammars have a few other public f
 * `inline` - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime.
 * `conflicts` - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree rule with the highest *dynamic precedence*.
 * `externals` - an array of token names which can be returned by an *external scanner*. External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions.
+* `word` - the name of a token that will match keywords, for the purpose of [keyword optimization](#keyword-optimization).
 
 ## Adjusting existing grammars
 
@@ -359,6 +360,29 @@ You may have noticed in the above examples that some of the grammar rule name li
 
 TODO
 
+## Keyword Optimization
+
+Many languages have a set of keywords: tokens that look like identifiers but
+have special meaning. For example, in Algol-like languages `if` is a keyword.
+It could be a variable name, and in some contexts (e.g. JavaScript object
+literals like `{if: 'something'}`) it is interpreted as one, but in most
+contexts it has special meaning.
+
+You'll know whether your grammar has keywords, because they end up in the
+grammar as string literals or as regexes that match a small, finite set of
+strings.
+
+The parser generated naïvely from such a grammar can be huge and very slow to
+compile. Keyword optimization is the fix. Instead of building a lexer that
+looks for `choice('break', 'continue', 'async', ...)` wherever those keywords
+might occur, declaring `word: $ => $.identifier` instructs Tree-sitter to lex
+an `identifier` wherever it would otherwise have lexed one of those keywords,
+and then to check whether the parsed `identifier` actually matches a keyword.
+
+You don't have to specify which words are keywords. Tree-sitter identifies
+them automatically, as the set of terminals that the word token could match.
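+
+As a concrete illustration, here is a minimal sketch of a `grammar.js` that
+uses the `word` field (the language and rule names below are hypothetical,
+chosen only to show where `word` fits):
+
+```js
+module.exports = grammar({
+  name: 'example_language',
+
+  // Tell Tree-sitter which token to use for keyword optimization.
+  word: $ => $.identifier,
+
+  rules: {
+    source_file: $ => repeat($._statement),
+
+    _statement: $ => choice(
+      $.break_statement,
+      $.expression_statement
+    ),
+
+    // `'break'` is a keyword: a string that `identifier` could also match.
+    break_statement: $ => seq('break', ';'),
+
+    expression_statement: $ => seq($.identifier, ';'),
+
+    identifier: $ => /[a-z_]+/
+  }
+});
+```
+
+With the `word` field set, the generated lexer no longer needs to match
+`'break'` separately in every context where it can appear; it lexes an
+`identifier` there and then checks whether the text is actually the keyword.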
+
+
 [cst]: https://en.wikipedia.org/wiki/Parse_tree
 [non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols
 [language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification
diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h
index ca2a28f7..3db2f7ca 100644
--- a/include/tree_sitter/compiler.h
+++ b/include/tree_sitter/compiler.h
@@ -19,6 +19,7 @@ typedef enum {
   TSCompileErrorTypeEpsilonRule,
   TSCompileErrorTypeInvalidTokenContents,
   TSCompileErrorTypeInvalidRuleName,
+  TSCompileErrorTypeInvalidWordRule,
 } TSCompileErrorType;
 
 typedef struct {
diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index 178cfb75..5e4bfa85 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -109,7 +109,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
   vector conflict_matrix;
   bool conflict_detection_mode;
   LookaheadSet keyword_symbols;
-  Symbol keyword_capture_token;
+  Symbol word_rule;
   char encoding_buffer[8];
 
 public:
@@ -125,7 +125,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
       parse_table(parse_table),
      conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch),
       conflict_detection_mode(false),
-      keyword_capture_token(rules::NONE()) {
+      word_rule(syntax_grammar.word_rule) {
 
     // Compute the possible separator rules and the set of separator characters that can occur
     // immediately after any token.
@@ -182,7 +182,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
           potential_keyword_symbols.insert(token);
         }
       }
-    }
 
     LOG_END();
@@ -205,100 +204,87 @@ class LexTableBuilderImpl : public LexTableBuilder {
     }
     LOG_END();
 
-    LOG_START("finding keyword capture token");
-    for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
-      Symbol candidate = Symbol::terminal(i);
-
-      LookaheadSet homonyms;
-      potential_keyword_symbols.for_each([&](Symbol other_token) {
-        if (get_conflict_status(other_token, candidate) & MatchesShorterStringWithinSeparators) {
-          homonyms.clear();
-          return false;
-        }
-        if (get_conflict_status(candidate, other_token) == MatchesSameString) {
-          homonyms.insert(other_token);
-        }
-        return true;
-      });
-      if (homonyms.empty()) continue;
-
-      LOG_START(
-        "keyword capture token candidate: %s, homonym count: %lu",
-        token_name(candidate).c_str(),
-        homonyms.size()
-      );
-
-      homonyms.for_each([&](Symbol homonym1) {
-        homonyms.for_each([&](Symbol homonym2) {
-          if (get_conflict_status(homonym1, homonym2) & MatchesSameString) {
-            LOG(
-              "conflict between homonyms %s %s",
-              token_name(homonym1).c_str(),
-              token_name(homonym2).c_str()
-            );
-            homonyms.remove(homonym1);
-          }
-          return false;
-        });
-        return true;
-      });
-
-      for (Symbol::Index j = 0; j < n; j++) {
-        Symbol other_token = Symbol::terminal(j);
-        if (other_token == candidate || homonyms.contains(other_token)) continue;
-        bool candidate_shadows_other = get_conflict_status(other_token, candidate);
-        bool other_shadows_candidate = get_conflict_status(candidate, other_token);
-
-        if (candidate_shadows_other || other_shadows_candidate) {
-          homonyms.for_each([&](Symbol homonym) {
-            bool other_shadows_homonym = get_conflict_status(homonym, other_token);
-
-            bool candidate_was_already_present = true;
-            for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
-              if (!parse_table->states[state_id].has_terminal_entry(candidate)) {
-                candidate_was_already_present = false;
-                break;
-              }
-            }
-            if (candidate_was_already_present) return true;
-
-            if (candidate_shadows_other) {
-              homonyms.remove(homonym);
-              LOG(
-                "remove %s because candidate would shadow %s",
-                token_name(homonym).c_str(),
-                token_name(other_token).c_str()
-              );
-            } else if (other_shadows_candidate && !other_shadows_homonym) {
-              homonyms.remove(homonym);
-              LOG(
-                "remove %s because %s would shadow candidate",
-                token_name(homonym).c_str(),
-                token_name(other_token).c_str()
-              );
-            }
-            return true;
-          });
-        }
-      }
-
-      if (homonyms.size() > keyword_symbols.size()) {
-        LOG_START("found capture token. homonyms:");
-        homonyms.for_each([&](Symbol homonym) {
-          LOG("%s", token_name(homonym).c_str());
-          return true;
-        });
-        LOG_END();
-        keyword_symbols = homonyms;
-        keyword_capture_token = candidate;
-      }
-
-      LOG_END();
+    if (word_rule != rules::NONE()) {
+      identify_keywords(potential_keyword_symbols);
     }
 
     LOG_END();
   }
 
+  void identify_keywords(const LookaheadSet &potential_keyword_symbols) {
+    LookaheadSet homonyms;
+    potential_keyword_symbols.for_each([&](Symbol other_token) {
+      if (get_conflict_status(word_rule, other_token) == MatchesSameString) {
+        homonyms.insert(other_token);
+      }
+      return true;
+    });
+
+    homonyms.for_each([&](Symbol homonym1) {
+      homonyms.for_each([&](Symbol homonym2) {
+        if (get_conflict_status(homonym1, homonym2) & MatchesSameString) {
+          LOG(
+            "conflict between homonyms %s %s",
+            token_name(homonym1).c_str(),
+            token_name(homonym2).c_str()
+          );
+          homonyms.remove(homonym1);
+        }
+        return false;
+      });
+      return true;
+    });
+
+    for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
+      Symbol other_token = Symbol::terminal(j);
+      if (other_token == word_rule || homonyms.contains(other_token)) continue;
+      bool word_rule_shadows_other = get_conflict_status(other_token, word_rule);
+      bool other_shadows_word_rule = get_conflict_status(word_rule, other_token);
+
+      if (word_rule_shadows_other || other_shadows_word_rule) {
+        homonyms.for_each([&](Symbol homonym) {
+          bool other_shadows_homonym = get_conflict_status(homonym, other_token);
+
+          bool word_rule_was_already_present = true;
+          for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
+            if (!parse_table->states[state_id].has_terminal_entry(word_rule)) {
+              word_rule_was_already_present = false;
+              break;
+            }
+          }
+          if (word_rule_was_already_present) return true;
+
+          if (word_rule_shadows_other) {
+            homonyms.remove(homonym);
+            LOG(
+              "remove %s because word_rule would shadow %s",
+              token_name(homonym).c_str(),
+              token_name(other_token).c_str()
+            );
+          } else if (other_shadows_word_rule && !other_shadows_homonym) {
+            homonyms.remove(homonym);
+            LOG(
+              "remove %s because %s would shadow word_rule",
+              token_name(homonym).c_str(),
+              token_name(other_token).c_str()
+            );
+          }
+          return true;
+        });
+      }
+    }
+
+    if (!homonyms.empty()) {
+      LOG_START("found keywords:");
+      homonyms.for_each([&](Symbol homonym) {
+        LOG("%s", token_name(homonym).c_str());
+        return true;
+      });
+      LOG_END();
+      keyword_symbols = homonyms;
+    }
+  }
+
   BuildResult build() {
     clear();
     conflict_detection_mode = false;
@@ -307,8 +293,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
     for (ParseState &parse_state : parse_table->states) {
       LookaheadSet token_set;
       for (auto &entry : parse_state.terminal_entries) {
-        if (keyword_capture_token.is_terminal() && keyword_symbols.contains(entry.first)) {
-          token_set.insert(keyword_capture_token);
+        if (word_rule.is_terminal() && keyword_symbols.contains(entry.first)) {
+          token_set.insert(word_rule);
         } else {
           token_set.insert(entry.first);
         }
@@ -337,7 +323,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
     mark_fragile_tokens();
     remove_duplicate_lex_states(main_lex_table);
 
-    return {main_lex_table, keyword_lex_table, keyword_capture_token};
+    return {main_lex_table, keyword_lex_table, word_rule};
   }
 
   ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const {
@@ -411,10 +397,10 @@ class LexTableBuilderImpl : public LexTableBuilder {
         MatchesLongerStringWithValidNextChar
       )) {
         LOG(
-          "%s shadows %s followed by '%s'",
+          "%s shadows %s followed by %d",
          token_name(advance_symbol).c_str(),
          token_name(accept_action.symbol).c_str(),
-          log_char(*conflicting_following_chars.included_chars.begin())
+          *conflicting_following_chars.included_chars.begin()
         );
       }
     }
diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h
index 6c63340c..5e2212fb 100644
--- a/src/compiler/grammar.h
+++ b/src/compiler/grammar.h
@@ -32,6 +32,7 @@ struct InputGrammar {
   std::vector> expected_conflicts;
   std::vector external_tokens;
   std::unordered_set variables_to_inline;
+  rules::NamedSymbol word_rule;
 };
 
 } // namespace tree_sitter
diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc
index e233cfe0..f589d15a 100644
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@@ -229,7 +229,9 @@ ParseGrammarResult parse_grammar(const string &input) {
   string error_message;
   string name;
   InputGrammar grammar;
-  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json, inline_rules_json;
+  json_value
+    name_json, rules_json, extras_json, conflicts_json, external_tokens_json,
+    inline_rules_json, word_rule_json;
   json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
   char parse_error[json_error_max];
 
@@ -359,6 +361,16 @@ ParseGrammarResult parse_grammar(const string &input) {
     }
   }
 
+  word_rule_json = grammar_json->operator[]("word");
+  if (word_rule_json.type != json_none) {
+    if (word_rule_json.type != json_string) {
+      error_message = "Invalid word property";
+      goto error;
+    }
+
+    grammar.word_rule = NamedSymbol { word_rule_json.u.string.ptr };
+  }
+
   json_value_free(grammar_json);
   return { name, grammar, "" };
 
diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc
index 46230867..42878376 100644
--- a/src/compiler/prepare_grammar/expand_repeats.cc
+++ b/src/compiler/prepare_grammar/expand_repeats.cc
@@ -106,6 +106,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
     expander.aux_rules.end()
   );
 
+  result.word_rule = grammar.word_rule;
   return result;
 }
 
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index 93b06be2..c82b3505 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -329,6 +329,18 @@ tuple extract_tokens(
     }
   }
 
+  syntax_grammar.word_rule = symbol_replacer.replace_symbol(grammar.word_rule);
+  if (syntax_grammar.word_rule.is_non_terminal()) {
+    return make_tuple(
+      syntax_grammar,
+      lexical_grammar,
+      CompileError(
+        TSCompileErrorTypeInvalidWordRule,
+        "Word rules must be tokens"
+      )
+    );
+  }
+
   return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
 }
 
diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc
index e135ee67..ebfc3ae4 100644
--- a/src/compiler/prepare_grammar/flatten_grammar.cc
+++ b/src/compiler/prepare_grammar/flatten_grammar.cc
@@ -161,6 +161,8 @@ pair flatten_grammar(const InitialSyntaxGrammar &gr
     i++;
   }
 
+  result.word_rule = grammar.word_rule;
+
   return {result, CompileError::none()};
 }
 
diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h
index 881c6396..79cc951e 100644
--- a/src/compiler/prepare_grammar/initial_syntax_grammar.h
+++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h
@@ -17,6 +17,7 @@ struct InitialSyntaxGrammar {
   std::set> expected_conflicts;
   std::vector external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule = rules::NONE();
 };
 
 } // namespace prepare_grammar
diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc
index 4e610960..dc128779 100644
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@@ -166,6 +166,8 @@ pair intern_symbols(const InputGrammar &grammar)
     }
   }
 
+  result.word_rule = interner.intern_symbol(grammar.word_rule);
+
   return {result, CompileError::none()};
 }
 
diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h
index 83117ced..fc322522 100644
--- a/src/compiler/prepare_grammar/interned_grammar.h
+++ b/src/compiler/prepare_grammar/interned_grammar.h
@@ -15,8 +15,8 @@ struct InternedGrammar {
   std::vector extra_tokens;
   std::set> expected_conflicts;
   std::vector external_tokens;
-  std::set blank_external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule;
 };
 
 } // namespace prepare_grammar
diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h
index 2d55686b..ff056e3f 100644
--- a/src/compiler/syntax_grammar.h
+++ b/src/compiler/syntax_grammar.h
@@ -60,6 +60,7 @@ struct SyntaxGrammar {
   std::set> expected_conflicts;
   std::vector external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule = rules::NONE();
 };
 
 } // namespace tree_sitter