From cb784975a443ef4f22e01d1f8cdd499bcf962251 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 1 Aug 2018 14:00:57 -0700
Subject: [PATCH 1/3] Add IMMEDIATE_TOKEN rule type, for enforcing no preceding
 extras

---
 .../build_tables/lex_table_builder.cc         |  31 ++++--
 src/compiler/generate_code/c_code.cc          | 101 +++++++++++-------
 src/compiler/grammar-schema.json              |   2 +-
 src/compiler/lex_table.cc                     |  20 ++--
 src/compiler/lex_table.h                      |   4 +-
 src/compiler/parse_grammar.cc                 |   9 ++
 .../prepare_grammar/extract_tokens.cc         |   2 +
 src/compiler/rule.cc                          |   3 +
 src/compiler/rules/metadata.cc                |   7 ++
 src/compiler/rules/metadata.h                 |   1 +
 .../test_grammars/immediate_tokens/corpus.txt |  29 +++++
 .../immediate_tokens/grammar.json             |  61 +++++++++++
 .../test_grammars/immediate_tokens/readme.md  |   1 +
 13 files changed, 212 insertions(+), 59 deletions(-)
 create mode 100644 test/fixtures/test_grammars/immediate_tokens/corpus.txt
 create mode 100644 test/fixtures/test_grammars/immediate_tokens/grammar.json
 create mode 100644 test/fixtures/test_grammars/immediate_tokens/readme.md

diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index 0b309c7d..e577d690 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -379,9 +379,14 @@ class LexTableBuilderImpl : public LexTableBuilder {
     for (const LexItem &item : item_set.entries) {
       LexItem::CompletionStatus completion_status = item.completion_status();
       if (completion_status.is_done) {
-        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
-                                 item.lhs.is_built_in() ||
-                                 grammar.variables[item.lhs.index].is_string);
+        AcceptTokenAction action(item.lhs, completion_status.precedence.max);
+
+        if (!item.lhs.is_built_in()) {
+          const LexicalVariable &variable = grammar.variables[item.lhs.index];
+          if (variable.is_string) action.implicit_precedence += 2;
+          if (is_immediate_token(variable.rule)) action.implicit_precedence += 1;
+        }
+
         AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
         if (existing_action.is_present()) {
           if (should_replace_accept_action(existing_action, action)) {
@@ -458,8 +463,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
   void remove_duplicate_lex_states(LexTable &lex_table) {
     for (LexState &state : lex_table.states) {
-      state.accept_action.is_string = false;
       state.accept_action.precedence = 0;
+      state.accept_action.implicit_precedence = 0;
     }
 
     map<LexStateId, LexStateId> replacements;
@@ -523,12 +528,24 @@ class LexTableBuilderImpl : public LexTableBuilder {
     }
   }
 
+  bool is_immediate_token(const Rule &rule) const {
+    return rule.match(
+      [](const Metadata &metadata) {
+        return metadata.params.is_main_token;
+      },
+
+      [](auto rule) {
+        return false;
+      }
+    );
+  }
+
   LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) {
     LexItemSet result;
     terminals.for_each([&](Symbol symbol) {
       if (symbol.is_terminal()) {
         for (auto &&rule : rules_for_symbol(symbol)) {
-          if (with_separators) {
+          if (with_separators && !is_immediate_token(rule)) {
             for (const auto &separator_rule : separator_rules) {
               result.entries.insert(LexItem(
                 symbol,
@@ -598,8 +615,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
                                     const AcceptTokenAction &new_action) {
     if (new_action.precedence > old_action.precedence) return true;
     if (new_action.precedence < old_action.precedence) return false;
-    if (new_action.is_string && !old_action.is_string) return true;
-    if (old_action.is_string && !new_action.is_string) return false;
+    if (new_action.implicit_precedence > old_action.implicit_precedence) return true;
+    if (new_action.implicit_precedence < old_action.implicit_precedence) return false;
     return new_action.symbol.index < old_action.symbol.index;
   }
 
diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc
index 806adf66..1038701c 100644
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -23,6 +23,7 @@ using std::pair;
 using std::set;
 using std::string;
 using std::to_string;
+using std::unordered_set;
 using std::vector;
 using util::escape_char;
 using rules::Symbol;
@@ -76,7 +77,7 @@ class CCodeGenerator {
   Symbol keyword_capture_token;
   const SyntaxGrammar syntax_grammar;
   const LexicalGrammar lexical_grammar;
-  map<string, string> sanitized_names;
+  map<Symbol, string> symbol_ids;
   vector<pair<size_t, ParseTableEntry>> parse_table_entries;
   vector<set<Symbol::Index>> external_scanner_states;
   size_t next_parse_action_list_index;
@@ -165,6 +166,24 @@ class CCodeGenerator {
       }
     }
 
+    unordered_set<string> symbol_id_values;
+    symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end";
+
+    for (const Symbol &symbol : parse_table.symbols) {
+      if (!symbol.is_built_in()) {
+        assign_symbol_id(symbol, &symbol_id_values);
+      }
+    }
+
+    for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
+      const ExternalToken &external_token = syntax_grammar.external_tokens[i];
+      if (external_token.corresponding_internal_token == rules::NONE()) {
+        assign_symbol_id(Symbol::external(i), &symbol_id_values);
+      } else {
+        symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token];
+      }
+    }
+
     line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
     line("#define STATE_COUNT " + to_string(parse_table.states.size()));
     line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
@@ -175,6 +194,33 @@ class CCodeGenerator {
     line();
   }
 
+  void assign_symbol_id(const Symbol &symbol, unordered_set<string> *symbol_id_values) {
+    auto entry = entry_for_symbol(symbol);
+
+    string symbol_id;
+    switch (entry.second) {
+      case VariableTypeAuxiliary:
+        symbol_id = "aux_sym_" + sanitize_name(entry.first);
+        break;
+      case VariableTypeAnonymous:
+        symbol_id = "anon_sym_" + sanitize_name(entry.first);
+        break;
+      default:
+        symbol_id = "sym_" + sanitize_name(entry.first);
+        break;
+    }
+
+    unsigned suffix_number = 1;
+    string unique_symbol_id = symbol_id;
+    while (symbol_id_values->count(unique_symbol_id)) {
+      suffix_number++;
+      unique_symbol_id = symbol_id + to_string(suffix_number);
+    }
+
+    symbol_id_values->insert(unique_symbol_id);
+    symbol_ids[symbol] = unique_symbol_id;
+  }
+
   void add_symbol_enum() {
     line("enum {");
     indent([&]() {
@@ -696,20 +742,7 @@ class CCodeGenerator {
   }
 
   string symbol_id(const Symbol &symbol) {
-    if (symbol == rules::END_OF_INPUT())
-      return "ts_builtin_sym_end";
-
-    auto entry = entry_for_symbol(symbol);
-    string name = sanitize_name(entry.first);
-
-    switch (entry.second) {
-      case VariableTypeAuxiliary:
-        return "aux_sym_" + name;
-      case VariableTypeAnonymous:
-        return "anon_sym_" + name;
-      default:
-        return "sym_" + name;
-    }
+    return symbol_ids[symbol];
   }
 
   string alias_id(const Alias &alias) {
@@ -776,47 +809,35 @@ class CCodeGenerator {
     return name;
   }
 
-  string sanitize_name(string name) {
-    auto existing = sanitized_names.find(name);
-    if (existing != sanitized_names.end())
-      return existing->second;
-
-    string stripped_name;
+  string sanitize_name(const string &name) {
+    string result;
     for (char c : name) {
       if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
           ('0' <= c && c <= '9') || (c == '_')) {
-        stripped_name += c;
+        result += c;
       } else {
         auto replacement = REPLACEMENTS.find(c);
-        size_t i = stripped_name.size();
+        size_t i = result.size();
         if (replacement != REPLACEMENTS.end()) {
-          if (i > 0 && stripped_name[i - 1] != '_')
-            stripped_name += "_";
-          stripped_name += replacement->second;
+          if (i > 0 && result[i - 1] != '_')
+            result += "_";
+          result += replacement->second;
         }
       }
     }
-
-    for (size_t extra_number = 0;; extra_number++) {
-      string suffix = extra_number ? to_string(extra_number) : "";
-      string unique_name = stripped_name + suffix;
-      if (unique_name == "")
-        continue;
-      if (!has_sanitized_name(unique_name)) {
-        sanitized_names.insert({ name, unique_name });
-        return unique_name;
-      }
-    }
+    return result;
   }
 
   string _boolean(bool value) {
     return value ? "true" : "false";
   }
 
-  bool has_sanitized_name(string name) {
-    for (const auto &pair : sanitized_names)
-      if (pair.second == name)
+  bool has_sanitized_name(const Symbol &symbol, string name) {
+    for (const auto &pair : symbol_ids) {
+      if (pair.second == name) {
         return true;
+      }
+    }
     return false;
   }
 
diff --git a/src/compiler/grammar-schema.json b/src/compiler/grammar-schema.json
index 24e47abb..55388364 100644
--- a/src/compiler/grammar-schema.json
+++ b/src/compiler/grammar-schema.json
@@ -201,7 +201,7 @@
       "properties": {
         "type": {
           "type": "string",
-          "pattern": "^TOKEN$"
+          "pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
         },
         "content": {
           "$ref": "#/definitions/rule"
diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc
index daf4517a..e13d6fcb 100644
--- a/src/compiler/lex_table.cc
+++ b/src/compiler/lex_table.cc
@@ -16,9 +16,9 @@ AdvanceAction::AdvanceAction() : state_index(-1) {}
 AdvanceAction::AdvanceAction(size_t state_index,
                              PrecedenceRange precedence_range,
                              bool in_main_token)
-    : state_index(state_index),
-      precedence_range(precedence_range),
-      in_main_token(in_main_token) {}
+  : state_index(state_index),
+    precedence_range(precedence_range),
+    in_main_token(in_main_token) {}
 
 bool AdvanceAction::operator==(const AdvanceAction &other) const {
   return (state_index == other.state_index) &&
@@ -26,19 +26,21 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const {
 }
 
 AcceptTokenAction::AcceptTokenAction()
-    : symbol(rules::NONE()), precedence(0), is_string(false) {}
+  : symbol(rules::NONE()), precedence(0), implicit_precedence(0) {}
 
-AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence,
-                                     bool is_string)
-    : symbol(symbol), precedence(precedence), is_string(is_string) {}
+AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence)
+  : symbol(symbol), precedence(precedence), implicit_precedence(0) {}
 
 bool AcceptTokenAction::is_present() const {
   return symbol != rules::NONE();
 }
 
 bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
-  return (symbol == other.symbol) && (precedence == other.precedence) &&
-         (is_string == other.is_string);
+  return (
+    symbol == other.symbol &&
+    precedence == other.precedence &&
+    implicit_precedence == other.implicit_precedence
+  );
 }
 
 bool LexState::operator==(const LexState &other) const {
diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h
index 6de0792d..9419e8e2 100644
--- a/src/compiler/lex_table.h
+++ b/src/compiler/lex_table.h
@@ -25,14 +25,14 @@ struct AdvanceAction {
 
 struct AcceptTokenAction {
   AcceptTokenAction();
-  AcceptTokenAction(rules::Symbol, int, bool);
+  AcceptTokenAction(rules::Symbol, int);
   bool is_present() const;
   bool operator==(const AcceptTokenAction &other) const;
   inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); }
 
   rules::Symbol symbol;
   int precedence;
-  bool is_string;
+  int implicit_precedence;
 };
 
 struct LexState {
diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc
index 63ddb40b..7b69ed61 100644
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@@ -116,6 +116,15 @@ ParseRuleResult parse_rule(json_value *rule_json) {
       return Rule(Metadata::token(move(result.rule)));
   }
 
+  if (type == "IMMEDIATE_TOKEN") {
+    json_value content_json = rule_json->operator[]("content");
+    auto result = parse_rule(&content_json);
+    if (!result.error_message.empty()) {
+      return "Invalid token content: " + result.error_message;
+    }
+      return Rule(Metadata::immediate_token(move(result.rule)));
+  }
+
   if (type == "PATTERN") {
     json_value value_json = rule_json->operator[]("value");
     if (value_json.type == json_string) {
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index b5110693..bf01e722 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -118,6 +118,8 @@ class TokenExtractor {
           metadata.params.is_token = false;
           if (metadata.params == rules::MetadataParams{}) {
             return extract_token(*metadata.rule, VariableTypeAuxiliary);
+          } else if (metadata.rule->is<rules::String>()) {
+            return extract_token(metadata, VariableTypeAnonymous);
           } else {
             return extract_token(metadata, VariableTypeAuxiliary);
           }
diff --git a/src/compiler/rule.cc b/src/compiler/rule.cc
index 29ee1793..e7277459 100644
--- a/src/compiler/rule.cc
+++ b/src/compiler/rule.cc
@@ -135,6 +135,9 @@ bool Rule::is<Blank>() const { return type == BlankType; }
 template <>
 bool Rule::is<Symbol>() const { return type == SymbolType; }
 
+template <>
+bool Rule::is<String>() const { return type == StringType; }
+
 template <>
 bool Rule::is<Repeat>() const { return type == RepeatType; }
 
diff --git a/src/compiler/rules/metadata.cc b/src/compiler/rules/metadata.cc
index 40dcb21e..c54d29cd 100644
--- a/src/compiler/rules/metadata.cc
+++ b/src/compiler/rules/metadata.cc
@@ -75,6 +75,13 @@ Metadata Metadata::token(Rule &&rule) {
   });
 }
 
+Metadata Metadata::immediate_token(Rule &&rule) {
+  return add_metadata(move(rule), [](MetadataParams &params) {
+    params.is_token = true;
+    params.is_main_token = true;
+  });
+}
+
 Metadata Metadata::active_prec(int precedence, Rule &&rule) {
   return add_metadata(move(rule), [&](MetadataParams &params) {
     params.has_precedence = true;
diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h
index 73a4a66d..3c023b3e 100644
--- a/src/compiler/rules/metadata.h
+++ b/src/compiler/rules/metadata.h
@@ -64,6 +64,7 @@ struct Metadata {
 
   static Metadata merge(Rule &&rule, MetadataParams params);
   static Metadata token(Rule &&rule);
+  static Metadata immediate_token(Rule &&rule);
   static Metadata active_prec(int precedence, Rule &&rule);
   static Metadata prec(int precedence, Rule &&rule);
   static Metadata prec_left(int precedence, Rule &&rule);
diff --git a/test/fixtures/test_grammars/immediate_tokens/corpus.txt b/test/fixtures/test_grammars/immediate_tokens/corpus.txt
new file mode 100644
index 00000000..d5d2e0f8
--- /dev/null
+++ b/test/fixtures/test_grammars/immediate_tokens/corpus.txt
@@ -0,0 +1,29 @@
+===============================
+prefix expressions as arguments
+===============================
+
+a ::b ::c
+
+---
+
+(program
+  (call
+    (call
+      (identifier)
+      (prefix (identifier)))
+    (prefix (identifier))))
+
+===============================
+infix expressions
+===============================
+
+a::b::c
+
+---
+
+(program
+  (infix
+    (infix
+      (identifier)
+      (identifier))
+    (identifier)))
diff --git a/test/fixtures/test_grammars/immediate_tokens/grammar.json b/test/fixtures/test_grammars/immediate_tokens/grammar.json
new file mode 100644
index 00000000..1506e3a7
--- /dev/null
+++ b/test/fixtures/test_grammars/immediate_tokens/grammar.json
@@ -0,0 +1,61 @@
+{
+  "name": "immediate_tokens",
+
+  "extras": [
+    {
+      "type": "PATTERN",
+      "value": "\\s"
+    }
+  ],
+
+  "rules": {
+    "program": {"type": "SYMBOL", "name": "_expression"},
+
+    "_expression": {
+      "type": "CHOICE",
+      "members": [
+        {"type": "SYMBOL", "name": "call"},
+        {"type": "SYMBOL", "name": "infix"},
+        {"type": "SYMBOL", "name": "prefix"},
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "call": {
+      "type": "PREC_LEFT",
+      "value": -1,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "_expression"},
+          {"type": "SYMBOL", "name": "_expression"}
+        ]
+      }
+    },
+
+    "prefix": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "::"},
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "infix": {
+      "type": "SEQ",
+      "members": [
+        {"type": "SYMBOL", "name": "_expression"},
+        {
+          "type": "IMMEDIATE_TOKEN",
+          "content": {"type": "STRING", "value": "::"}
+        },
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "identifier": {
+      "type": "PATTERN",
+      "value": "[a-z]+"
+    }
+  }
+}
diff --git a/test/fixtures/test_grammars/immediate_tokens/readme.md b/test/fixtures/test_grammars/immediate_tokens/readme.md
new file mode 100644
index 00000000..39599fcb
--- /dev/null
+++ b/test/fixtures/test_grammars/immediate_tokens/readme.md
@@ -0,0 +1 @@
+This grammar demonstrates the use of the `IMMEDIATE_TOKEN` rule, which lets the parser produce a different token depending on whether any `extras` precede the token's main content. When there are *no* leading `extras`, an immediate token is preferred over a normal token that would otherwise match. For example, `a::b::c` parses as nested `infix` expressions, while `a ::b ::c` parses as nested `call` expressions whose arguments are `prefix` expressions.

From 68618f61a623769a040b441da33ea8d43568ee8e Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 1 Aug 2018 14:23:52 -0700
Subject: [PATCH 2/3] Test against immediate token branches of grammar repos

---
 script/fetch-fixtures     | 4 ++--
 script/fetch-fixtures.cmd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/script/fetch-fixtures b/script/fetch-fixtures
index 4923e62c..1d53a3b7 100755
--- a/script/fetch-fixtures
+++ b/script/fetch-fixtures
@@ -21,9 +21,9 @@ fetch_grammar() {
   )
 }
 
-fetch_grammar javascript master
+fetch_grammar javascript immediate-tokens
 fetch_grammar json       master
-fetch_grammar c          master
+fetch_grammar c          immediate-tokens
 fetch_grammar cpp        master
 fetch_grammar python     master
 fetch_grammar go         master
diff --git a/script/fetch-fixtures.cmd b/script/fetch-fixtures.cmd
index 17ff224d..fffb668e 100644
--- a/script/fetch-fixtures.cmd
+++ b/script/fetch-fixtures.cmd
@@ -1,8 +1,8 @@
 @echo off
 
-call:fetch_grammar javascript master
+call:fetch_grammar javascript immediate-tokens
 call:fetch_grammar json       master
-call:fetch_grammar c          master
+call:fetch_grammar c          immediate-tokens
 call:fetch_grammar cpp        master
 call:fetch_grammar python     master
 call:fetch_grammar go         master

From 41fe564a901a986a88c5bc647e22806fc7e76b65 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 1 Aug 2018 15:09:45 -0700
Subject: [PATCH 3/3] Update error recovery fixture

---
 test/fixtures/error_corpus/c_errors.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt
index b2931b7d..ee63debf 100644
--- a/test/fixtures/error_corpus/c_errors.txt
+++ b/test/fixtures/error_corpus/c_errors.txt
@@ -69,7 +69,7 @@ int main() {
     b();
     c();
 
-    if () d();
+    if (*) d();
   }
 }
 
@@ -81,14 +81,14 @@ int main() {
     (function_declarator (identifier) (parameter_list))
     (compound_statement
       (if_statement
-        (field_expression
+        (parenthesized_expression (field_expression
           (identifier)
-          (MISSING))
+          (MISSING)))
         (compound_statement
           (expression_statement (call_expression (identifier) (argument_list)))
           (expression_statement (call_expression (identifier) (argument_list)))
           (if_statement
-            (MISSING)
+            (parenthesized_expression (pointer_expression (MISSING)))
             (expression_statement (call_expression (identifier) (argument_list)))))))))
 
 ====================================