Merge pull request #190 from tree-sitter/immediate-tokens

Add immediate token rule for enforcing no preceding extras
2018-08-01 15:21:42 -07:00 · 2018-08-01 15:21:42 -07:00 · 1dcbd21bbe
commit 1dcbd21bbe
parent f1821bb04d 41fe564a90
16 changed files with 220 additions and 67 deletions
--- a/script/fetch-fixtures
+++ b/script/fetch-fixtures
@ -21,9 +21,9 @@ fetch_grammar() {
  )
 }

-fetch_grammar javascript master
+fetch_grammar javascript immediate-tokens
 fetch_grammar json       master
-fetch_grammar c          master
+fetch_grammar c          immediate-tokens
 fetch_grammar cpp        master
 fetch_grammar python     master
 fetch_grammar go         master
--- a/script/fetch-fixtures.cmd
+++ b/script/fetch-fixtures.cmd
@ -1,8 +1,8 @@
@echo off

-call:fetch_grammar javascript master
+call:fetch_grammar javascript immediate-tokens
 call:fetch_grammar json       master
-call:fetch_grammar c          master
+call:fetch_grammar c          immediate-tokens
 call:fetch_grammar cpp        master
 call:fetch_grammar python     master
 call:fetch_grammar go         master
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@ -379,9 +379,14 @@ class LexTableBuilderImpl : public LexTableBuilder {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
-        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
-                                 item.lhs.is_built_in() ||
-                                 grammar.variables[item.lhs.index].is_string);
+        AcceptTokenAction action(item.lhs, completion_status.precedence.max);
+
+        if (!item.lhs.is_built_in()) {
+          const LexicalVariable &variable = grammar.variables[item.lhs.index];
+          if (variable.is_string) action.implicit_precedence += 2;
+          if (is_immediate_token(variable.rule)) action.implicit_precedence += 1;
+        }
+
        AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
        if (existing_action.is_present()) {
          if (should_replace_accept_action(existing_action, action)) {
@ -458,8 +463,8 @@ class LexTableBuilderImpl : public LexTableBuilder {

  void remove_duplicate_lex_states(LexTable &lex_table) {
    for (LexState &state : lex_table.states) {
-      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
+      state.accept_action.implicit_precedence = 0;
    }

    map<LexStateId, LexStateId> replacements;
@ -523,12 +528,24 @@ class LexTableBuilderImpl : public LexTableBuilder {
    }
  }

+  bool is_immediate_token(const Rule &rule) const {
+    return rule.match(
+      [](const Metadata &metadata) {
+        return metadata.params.is_main_token;
+      },
+
+      [](auto rule) {
+        return false;
+      }
+    );
+  }
+
  LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) {
    LexItemSet result;
    terminals.for_each([&](Symbol symbol) {
      if (symbol.is_terminal()) {
        for (auto &&rule : rules_for_symbol(symbol)) {
-          if (with_separators) {
+          if (with_separators && !is_immediate_token(rule)) {
            for (const auto &separator_rule : separator_rules) {
              result.entries.insert(LexItem(
                symbol,
@ -598,8 +615,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
                                    const AcceptTokenAction &new_action) {
    if (new_action.precedence > old_action.precedence) return true;
    if (new_action.precedence < old_action.precedence) return false;
-    if (new_action.is_string && !old_action.is_string) return true;
-    if (old_action.is_string && !new_action.is_string) return false;
+    if (new_action.implicit_precedence > old_action.implicit_precedence) return true;
+    if (new_action.implicit_precedence < old_action.implicit_precedence) return false;
    return new_action.symbol.index < old_action.symbol.index;
  }

--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@ -23,6 +23,7 @@ using std::pair;
 using std::set;
 using std::string;
 using std::to_string;
+using std::unordered_set;
 using std::vector;
 using util::escape_char;
 using rules::Symbol;
@ -76,7 +77,7 @@ class CCodeGenerator {
  Symbol keyword_capture_token;
  const SyntaxGrammar syntax_grammar;
  const LexicalGrammar lexical_grammar;
-  map<string, string> sanitized_names;
+  map<Symbol, string> symbol_ids;
  vector<pair<size_t, ParseTableEntry>> parse_table_entries;
  vector<set<Symbol::Index>> external_scanner_states;
  size_t next_parse_action_list_index;
@ -165,6 +166,24 @@ class CCodeGenerator {
      }
    }

+    unordered_set<string> symbol_id_values;
+    symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end";
+
+    for (const Symbol &symbol : parse_table.symbols) {
+      if (!symbol.is_built_in()) {
+        assign_symbol_id(symbol, &symbol_id_values);
+      }
+    }
+
+    for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
+      const ExternalToken &external_token = syntax_grammar.external_tokens[i];
+      if (external_token.corresponding_internal_token == rules::NONE()) {
+        assign_symbol_id(Symbol::external(i), &symbol_id_values);
+      } else {
+        symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token];
+      }
+    }
+
    line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
    line("#define STATE_COUNT " + to_string(parse_table.states.size()));
    line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
@ -175,6 +194,33 @@ class CCodeGenerator {
    line();
  }

+  void assign_symbol_id(const Symbol &symbol, unordered_set<string> *symbol_id_values) {
+    auto entry = entry_for_symbol(symbol);
+
+    string symbol_id;
+    switch (entry.second) {
+      case VariableTypeAuxiliary:
+        symbol_id = "aux_sym_" + sanitize_name(entry.first);
+        break;
+      case VariableTypeAnonymous:
+        symbol_id = "anon_sym_" + sanitize_name(entry.first);
+        break;
+      default:
+        symbol_id = "sym_" + sanitize_name(entry.first);
+        break;
+    }
+
+    unsigned suffix_number = 1;
+    string unique_symbol_id = symbol_id;
+    while (symbol_id_values->count(unique_symbol_id)) {
+      suffix_number++;
+      unique_symbol_id = symbol_id + to_string(suffix_number);
+    }
+
+    symbol_id_values->insert(unique_symbol_id);
+    symbol_ids[symbol] = unique_symbol_id;
+  }
+
  void add_symbol_enum() {
    line("enum {");
    indent([&]() {
@ -696,20 +742,7 @@ class CCodeGenerator {
  }

  string symbol_id(const Symbol &symbol) {
-    if (symbol == rules::END_OF_INPUT())
-      return "ts_builtin_sym_end";
-
-    auto entry = entry_for_symbol(symbol);
-    string name = sanitize_name(entry.first);
-
-    switch (entry.second) {
-      case VariableTypeAuxiliary:
-        return "aux_sym_" + name;
-      case VariableTypeAnonymous:
-        return "anon_sym_" + name;
-      default:
-        return "sym_" + name;
-    }
+    return symbol_ids[symbol];
  }

  string alias_id(const Alias &alias) {
@ -776,47 +809,35 @@ class CCodeGenerator {
    return name;
  }

-  string sanitize_name(string name) {
-    auto existing = sanitized_names.find(name);
-    if (existing != sanitized_names.end())
-      return existing->second;
-
-    string stripped_name;
+  string sanitize_name(const string &name) {
+    string result;
    for (char c : name) {
      if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
          ('0' <= c && c <= '9') || (c == '_')) {
-        stripped_name += c;
+        result += c;
      } else {
        auto replacement = REPLACEMENTS.find(c);
-        size_t i = stripped_name.size();
+        size_t i = result.size();
        if (replacement != REPLACEMENTS.end()) {
-          if (i > 0 && stripped_name[i - 1] != '_')
-            stripped_name += "_";
-          stripped_name += replacement->second;
+          if (i > 0 && result[i - 1] != '_')
+            result += "_";
+          result += replacement->second;
        }
      }
    }
-
-    for (size_t extra_number = 0;; extra_number++) {
-      string suffix = extra_number ? to_string(extra_number) : "";
-      string unique_name = stripped_name + suffix;
-      if (unique_name == "")
-        continue;
-      if (!has_sanitized_name(unique_name)) {
-        sanitized_names.insert({ name, unique_name });
-        return unique_name;
-      }
-    }
+    return result;
  }

  string _boolean(bool value) {
    return value ? "true" : "false";
  }

-  bool has_sanitized_name(string name) {
-    for (const auto &pair : sanitized_names)
-      if (pair.second == name)
+  bool has_sanitized_name(const Symbol &symbol, string name) {
+    for (const auto &pair : symbol_ids) {
+      if (pair.second == name) {
        return true;
+      }
+    }
    return false;
  }

--- a/src/compiler/grammar-schema.json
+++ b/src/compiler/grammar-schema.json
@ -201,7 +201,7 @@
      "properties": {
        "type": {
          "type": "string",
-          "pattern": "^TOKEN$"
+          "pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
        },
        "content": {
          "$ref": "#/definitions/rule"
--- a/src/compiler/lex_table.cc
+++ b/src/compiler/lex_table.cc
@ -16,9 +16,9 @@ AdvanceAction::AdvanceAction() : state_index(-1) {}
 AdvanceAction::AdvanceAction(size_t state_index,
                             PrecedenceRange precedence_range,
                             bool in_main_token)
-    : state_index(state_index),
-      precedence_range(precedence_range),
-      in_main_token(in_main_token) {}
+  : state_index(state_index),
+    precedence_range(precedence_range),
+    in_main_token(in_main_token) {}

 bool AdvanceAction::operator==(const AdvanceAction &other) const {
  return (state_index == other.state_index) &&
@ -26,19 +26,21 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const {
 }

 AcceptTokenAction::AcceptTokenAction()
-    : symbol(rules::NONE()), precedence(0), is_string(false) {}
+  : symbol(rules::NONE()), precedence(0), implicit_precedence(0) {}

-AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence,
-                                     bool is_string)
-    : symbol(symbol), precedence(precedence), is_string(is_string) {}
+AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence)
+  : symbol(symbol), precedence(precedence), implicit_precedence(0) {}

 bool AcceptTokenAction::is_present() const {
  return symbol != rules::NONE();
 }

 bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
-  return (symbol == other.symbol) && (precedence == other.precedence) &&
-         (is_string == other.is_string);
+  return (
+    symbol == other.symbol &&
+    precedence == other.precedence &&
+    implicit_precedence == other.implicit_precedence
+  );
 }

 bool LexState::operator==(const LexState &other) const {
--- a/src/compiler/lex_table.h
+++ b/src/compiler/lex_table.h
@ -25,14 +25,14 @@ struct AdvanceAction {

 struct AcceptTokenAction {
  AcceptTokenAction();
-  AcceptTokenAction(rules::Symbol, int, bool);
+  AcceptTokenAction(rules::Symbol, int);
  bool is_present() const;
  bool operator==(const AcceptTokenAction &other) const;
  inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); }

  rules::Symbol symbol;
  int precedence;
-  bool is_string;
+  int implicit_precedence;
 };

 struct LexState {
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@ -116,6 +116,15 @@ ParseRuleResult parse_rule(json_value *rule_json) {
      return Rule(Metadata::token(move(result.rule)));
  }

+  if (type == "IMMEDIATE_TOKEN") {
+    json_value content_json = rule_json->operator[]("content");
+    auto result = parse_rule(&content_json);
+    if (!result.error_message.empty()) {
+      return "Invalid token content: " + result.error_message;
+    }
+      return Rule(Metadata::immediate_token(move(result.rule)));
+  }
+
  if (type == "PATTERN") {
    json_value value_json = rule_json->operator[]("value");
    if (value_json.type == json_string) {
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@ -118,6 +118,8 @@ class TokenExtractor {
          metadata.params.is_token = false;
          if (metadata.params == rules::MetadataParams{}) {
            return extract_token(*metadata.rule, VariableTypeAuxiliary);
+          } else if (metadata.rule->is<rules::String>()) {
+            return extract_token(metadata, VariableTypeAnonymous);
          } else {
            return extract_token(metadata, VariableTypeAuxiliary);
          }
--- a/src/compiler/rule.cc
+++ b/src/compiler/rule.cc
@ -135,6 +135,9 @@ bool Rule::is<Blank>() const { return type == BlankType; }
 template <>
 bool Rule::is<Symbol>() const { return type == SymbolType; }

+template <>
+bool Rule::is<String>() const { return type == StringType; }
+
 template <>
 bool Rule::is<Repeat>() const { return type == RepeatType; }

--- a/src/compiler/rules/metadata.cc
+++ b/src/compiler/rules/metadata.cc
@ -75,6 +75,13 @@ Metadata Metadata::token(Rule &&rule) {
  });
 }

+Metadata Metadata::immediate_token(Rule &&rule) {
+  return add_metadata(move(rule), [](MetadataParams &params) {
+    params.is_token = true;
+    params.is_main_token = true;
+  });
+}
+
 Metadata Metadata::active_prec(int precedence, Rule &&rule) {
  return add_metadata(move(rule), [&](MetadataParams &params) {
    params.has_precedence = true;
--- a/src/compiler/rules/metadata.h
+++ b/src/compiler/rules/metadata.h
@ -64,6 +64,7 @@ struct Metadata {

  static Metadata merge(Rule &&rule, MetadataParams params);
  static Metadata token(Rule &&rule);
+  static Metadata immediate_token(Rule &&rule);
  static Metadata active_prec(int precedence, Rule &&rule);
  static Metadata prec(int precedence, Rule &&rule);
  static Metadata prec_left(int precedence, Rule &&rule);
--- a/test/fixtures/error_corpus/c_errors.txt
+++ b/test/fixtures/error_corpus/c_errors.txt
@ -69,7 +69,7 @@ int main() {
    b();
    c();

-    if () d();
+    if (*) d();
  }
 }

@ -81,14 +81,14 @@ int main() {
    (function_declarator (identifier) (parameter_list))
    (compound_statement
      (if_statement
-        (field_expression
+        (parenthesized_expression (field_expression
          (identifier)
-          (MISSING))
+          (MISSING)))
        (compound_statement
          (expression_statement (call_expression (identifier) (argument_list)))
          (expression_statement (call_expression (identifier) (argument_list)))
          (if_statement
-            (MISSING)
+            (parenthesized_expression (pointer_expression (MISSING)))
            (expression_statement (call_expression (identifier) (argument_list)))))))))

 ====================================
--- a/test/fixtures/test_grammars/immediate_tokens/corpus.txt
+++ b/test/fixtures/test_grammars/immediate_tokens/corpus.txt
@ -0,0 +1,29 @@
+===============================
+prefix expressions as arguments
+===============================
+
+a ::b ::c
+
+---
+
+(program
+  (call
+    (call
+      (identifier)
+      (prefix (identifier)))
+    (prefix (identifier))))
+
+===============================
+infix expressions
+===============================
+
+a::b::c
+
+---
+
+(program
+  (infix
+    (infix
+      (identifier)
+      (identifier))
+    (identifier)))
--- a/test/fixtures/test_grammars/immediate_tokens/grammar.json
+++ b/test/fixtures/test_grammars/immediate_tokens/grammar.json
@ -0,0 +1,61 @@
+{
+  "name": "immediate_tokens",
+
+  "extras": [
+    {
+      "type": "PATTERN",
+      "value": "\\s"
+    }
+  ],
+
+  "rules": {
+    "program": {"type": "SYMBOL", "name": "_expression"},
+
+    "_expression": {
+      "type": "CHOICE",
+      "members": [
+        {"type": "SYMBOL", "name": "call"},
+        {"type": "SYMBOL", "name": "infix"},
+        {"type": "SYMBOL", "name": "prefix"},
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "call": {
+      "type": "PREC_LEFT",
+      "value": -1,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "_expression"},
+          {"type": "SYMBOL", "name": "_expression"}
+        ]
+      }
+    },
+
+    "prefix": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "::"},
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "infix": {
+      "type": "SEQ",
+      "members": [
+        {"type": "SYMBOL", "name": "_expression"},
+        {
+          "type": "IMMEDIATE_TOKEN",
+          "content": {"type": "STRING", "value": "::"}
+        },
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "identifier": {
+      "type": "PATTERN",
+      "value": "[a-z]+"
+    }
+  }
+}
--- a/test/fixtures/test_grammars/immediate_tokens/readme.md
+++ b/test/fixtures/test_grammars/immediate_tokens/readme.md
@ -0,0 +1 @@
+This grammar demonstrates the usage of the IMMEDIATE_TOKEN rule, which lets the parser produce a different token depending on whether any `extras` precede the token's main content. When there are *no* leading `extras`, an immediate token is preferred over a normal token that would otherwise match.
				`@ -0,0 +1 @@`
				This grammar demonstrates the usage of the IMMEDIATE_TOKEN rule. It allows the parser to produce a different token based on whether or not there are `extras` preceding the token's main content. When there are no leading `extras`, an immediate token is preferred over a normal token which would otherwise match.