Add a way to automatically inline rules

2017-07-11 21:17:27 -07:00 · 2017-07-11 21:17:27 -07:00 · 65bf1389e1
commit 65bf1389e1
parent 26a25278cd
15 changed files with 219 additions and 14 deletions
--- a/src/compiler/build_tables/parse_item.cc
+++ b/src/compiler/build_tables/parse_item.cc
@ -156,14 +156,14 @@ struct hash<ParseItem> {
    if (item.is_done()) {
      if (!item.production->empty()) {
        hash_combine(&result, item.production->back().precedence);
-        hash_combine(&result, item.production->back().associativity);
+        hash_combine<unsigned>(&result, item.production->back().associativity);
      }
    } else {
      for (size_t i = 0, n = item.production->size(); i < n; i++) {
        auto &step = item.production->at(i);
        hash_combine(&result, step.symbol);
        hash_combine(&result, step.precedence);
-        hash_combine(&result, step.associativity);
+        hash_combine<unsigned>(&result, step.associativity);
      }
    }
    return result;
--- a/src/compiler/build_tables/parse_item_set_builder.cc
+++ b/src/compiler/build_tables/parse_item_set_builder.cc
@ -1,4 +1,5 @@
 #include "compiler/build_tables/parse_item_set_builder.h"
+#include <algorithm>
 #include <cassert>
 #include <set>
 #include <unordered_map>
@ -11,8 +12,10 @@
 namespace tree_sitter {
 namespace build_tables {

+using std::move;
 using std::vector;
 using std::set;
+using std::find;
 using std::get;
 using std::pair;
 using std::tuple;
@ -21,8 +24,36 @@ using std::make_tuple;
 using rules::Symbol;
 using rules::NONE;

+// Builds the productions that result from replacing the symbol at `item`'s
+// current step with each production of that symbol's variable in `grammar`.
+//
+// Every result is made of the item's steps before the current step, followed
+// by the steps of one of the variable's productions, followed by the item's
+// steps after the current step. The last inserted step is given the values of
+// `item.precedence()` and `item.associativity()`. Results that compare equal
+// to an earlier one are dropped, so the order of first occurrence is kept.
+static vector<Production> inline_production(const ParseItem &item, const SyntaxGrammar &grammar) {
+  // These depend only on `item`, so compute them once rather than per production.
+  const auto begin = item.production->steps.begin();
+  const auto end = item.production->steps.end();
+  const auto step = begin + item.step_index;
+
+  vector<Production> result;
+  for (const Production &production_to_insert : grammar.variables[item.next_symbol().index].productions) {
+    Production production{{begin, step}, item.production->dynamic_precedence};
+    production.steps.insert(
+      production.steps.end(),
+      production_to_insert.steps.begin(),
+      production_to_insert.steps.end()
+    );
+
+    // Only retag when at least one step was inserted. With nothing inserted,
+    // `back()` would either be called on an empty production (undefined
+    // behaviour) or overwrite the values of the unrelated preceding step.
+    if (!production_to_insert.steps.empty()) {
+      production.back().precedence = item.precedence();
+      production.back().associativity = item.associativity();
+    }
+
+    production.steps.insert(
+      production.steps.end(),
+      step + 1,
+      end
+    );
+
+    if (find(result.begin(), result.end(), production) == result.end()) {
+      result.push_back(move(production));
+    }
+  }
+  return result;
+}
+
 ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
-                                         const LexicalGrammar &lexical_grammar) {
+                                         const LexicalGrammar &lexical_grammar) : grammar{grammar} {
  vector<Symbol> symbols_to_process;
  set<Symbol::Index> processed_non_terminals;

@ -145,24 +176,56 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,

    for (auto &pair : cached_lookaheads_by_non_terminal) {
      for (const Production &production : grammar.variables[pair.first].productions) {
-        component_cache[i].push_back({
-          ParseItem(Symbol::non_terminal(pair.first), production, 0),
-          pair.second.first,
-          pair.second.second
-        });
+        Symbol lhs = Symbol::non_terminal(pair.first);
+        ParseItem item(lhs, production, 0);  // item at step 0 of this production
+
+        if (grammar.variables_to_inline.count(item.next_symbol())) {  // the item's next symbol is marked for inlining
+          vector<Production> &inlined_productions = inlined_productions_by_original_production[item];  // per-item cache; operator[] creates an empty entry on first use
+          if (inlined_productions.empty()) {  // an empty entry is (re)computed
+            inlined_productions = inline_production(item, grammar);
+          }
+
+          for (const Production &inlined_production : inlined_productions) {
+            ParseItemSetComponent component{
+              ParseItem(lhs, inlined_production, 0),  // built from the cached inlined production rather than the original
+              pair.second.first,
+              pair.second.second
+            };
+
+            if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {  // linear scan: append only if not already present
+              component_cache[i].push_back(component);
+            }
+          }
+        } else if (!grammar.variables_to_inline.count(lhs)) {  // productions whose own lhs is marked for inlining get no component
+          ParseItemSetComponent component{
+            ParseItem(lhs, production, 0),
+            pair.second.first,
+            pair.second.second
+          };
+
+          if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {  // same duplicate check as in the inlined branch
+            component_cache[i].push_back(component);
+          }
+        }
      }
    }
  }
 }

 void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
-  for (const auto &pair : item_set->entries) {
-    const ParseItem &item = pair.first;
-    const LookaheadSet &lookaheads = pair.second;
-    if (item.lhs() != rules::START() && item.step_index == 0) continue;
+  for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) {
+    const ParseItem &item = iter->first;
+    const LookaheadSet &lookaheads = iter->second;
+    if (item.lhs() != rules::START() && item.step_index == 0) {
+      ++iter;
+      continue;
+    }

    const Symbol &next_symbol = item.next_symbol();
-    if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue;
+    if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) {
+      ++iter;
+      continue;
+    }

    LookaheadSet next_lookaheads;
    size_t next_step = item.step_index + 1;
@ -178,6 +241,24 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
      current_lookaheads.insert_all(component.lookaheads);
      if (component.propagates_lookaheads) current_lookaheads.insert_all(next_lookaheads);
    }
+
+    if (grammar.variables_to_inline.count(next_symbol)) {  // the item's next symbol is marked for inlining
+      vector<Production> &inlined_productions = inlined_productions_by_original_production[item];  // cached per original item; operator[] creates an empty entry on first use
+      if (inlined_productions.empty()) {  // an empty entry is (re)computed
+        inlined_productions = inline_production(item, grammar);
+      }
+
+      for (const Production &inlined_production : inlined_productions) {
+        item_set->entries.insert({  // NOTE(review): inserts while iterating; assumes `entries` keeps iterators valid on insert (e.g. std::map) -- confirm
+          ParseItem(item.lhs(), inlined_production, item.step_index),  // same lhs and step index, inlined production
+          lookaheads
+        });
+      }
+
+      iter = item_set->entries.erase(iter);  // remove the original entry only after the last use of `item`/`lookaheads`, which refer to it; continue from the iterator erase() returns
+    } else {
+      ++iter;
+    }
  }
 }

--- a/src/compiler/build_tables/parse_item_set_builder.h
+++ b/src/compiler/build_tables/parse_item_set_builder.h
@ -4,6 +4,7 @@
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/rule.h"
 #include <map>
+#include <vector>

 namespace tree_sitter {

@ -17,11 +18,19 @@ class ParseItemSetBuilder {
    ParseItem item;
    LookaheadSet lookaheads;
    bool propagates_lookaheads;
+
+    // Member-wise equality over item, lookaheads and propagates_lookaheads.
+    // Const-qualified so that const components (and const references to
+    // them) can appear on the left-hand side of a comparison as well.
+    inline bool operator==(const ParseItemSetComponent &other) const {
+      return item == other.item &&
+        lookaheads == other.lookaheads &&
+        propagates_lookaheads == other.propagates_lookaheads;
+    }
  };

+  const SyntaxGrammar &grammar;
  std::map<rules::Symbol, LookaheadSet> first_sets;
  std::map<rules::Symbol, LookaheadSet> last_sets;
  std::map<rules::Symbol::Index, std::vector<ParseItemSetComponent>> component_cache;
+  std::map<ParseItem, std::vector<Production>> inlined_productions_by_original_production;

 public:
  ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &);
--- a/src/compiler/grammar.h
+++ b/src/compiler/grammar.h
@ -31,6 +31,7 @@ struct InputGrammar {
  std::vector<rules::Rule> extra_tokens;
  std::vector<std::unordered_set<rules::NamedSymbol>> expected_conflicts;
  std::vector<Variable> external_tokens;
+  std::unordered_set<rules::NamedSymbol> variables_to_inline;
 };

 }  // namespace tree_sitter
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@ -205,7 +205,7 @@ ParseGrammarResult parse_grammar(const string &input) {
  string error_message;
  string name;
  InputGrammar grammar;
-  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
+  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json, inline_rules_json;

  json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
  char parse_error[json_error_max];
@ -299,6 +299,24 @@ ParseGrammarResult parse_grammar(const string &input) {
    }
  }

+  inline_rules_json = grammar_json->operator[]("inline");  // look up the top-level "inline" entry
+  if (inline_rules_json.type != json_none) {  // a json_none result skips the whole section, so the entry is optional
+    if (inline_rules_json.type != json_array) {
+      error_message = "Inline rules must be an array";
+      goto error;
+    }
+
+    for (size_t i = 0, length = inline_rules_json.u.array.length; i < length; i++) {
+      json_value *inline_rule_json = inline_rules_json.u.array.values[i];
+      if (inline_rule_json->type != json_string) {  // every element has to be a string
+        error_message = "Inline rules must be an array of rule names";
+        goto error;
+      }
+
+      grammar.variables_to_inline.insert(rules::NamedSymbol{string(inline_rule_json->u.string.ptr)});  // recorded by name; inserted into a set
+    }
+  }
+
  external_tokens_json = grammar_json->operator[]("externals");
  if (external_tokens_json.type != json_none) {
    if (external_tokens_json.type != json_array) {
--- a/src/compiler/parse_grammar.h
+++ b/src/compiler/parse_grammar.h
@ -2,6 +2,7 @@
 #define COMPILER_GRAMMAR_JSON_H_

 #include <string>
+#include <unordered_set>
 #include "tree_sitter/compiler.h"
 #include "compiler/grammar.h"

--- a/src/compiler/prepare_grammar/expand_repeats.cc
+++ b/src/compiler/prepare_grammar/expand_repeats.cc
@ -94,6 +94,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
  result.extra_tokens = grammar.extra_tokens;
  result.expected_conflicts = grammar.expected_conflicts;
  result.external_tokens = grammar.external_tokens;
+  result.variables_to_inline = grammar.variables_to_inline;

  ExpandRepeats expander(result.variables.size());
  for (auto &variable : result.variables) {
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@ -235,6 +235,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
    syntax_grammar.expected_conflicts.insert(new_conflict_set);
  }

+  for (const Symbol &symbol : grammar.variables_to_inline) {  // carry the inline list over to the syntax grammar
+    syntax_grammar.variables_to_inline.insert(symbol_replacer.replace_symbol(symbol));  // each symbol is passed through symbol_replacer before being stored
+  }
+
  // The grammar's extra tokens can be either token rules or symbols
  // pointing to token rules. If they are symbols, then they'll be handled by
  // the parser; add them to the syntax grammar's extra tokens. If they
--- a/src/compiler/prepare_grammar/flatten_grammar.cc
+++ b/src/compiler/prepare_grammar/flatten_grammar.cc
@ -111,6 +111,7 @@ SyntaxVariable flatten_rule(const Variable &variable) {
 pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &grammar) {
  SyntaxGrammar result;
  result.external_tokens = grammar.external_tokens;
+  result.variables_to_inline = grammar.variables_to_inline;

  for (const auto &expected_conflict : grammar.expected_conflicts) {
    result.expected_conflicts.insert({
--- a/src/compiler/prepare_grammar/initial_syntax_grammar.h
+++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h
@ -16,6 +16,7 @@ struct InitialSyntaxGrammar {
  std::set<rules::Symbol> extra_tokens;
  std::set<std::set<rules::Symbol>> expected_conflicts;
  std::vector<ExternalToken> external_tokens;
+  std::set<rules::Symbol> variables_to_inline;
 };

 }  // namespace prepare_grammar
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@ -142,6 +142,13 @@ pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar)
    result.expected_conflicts.insert(entry);
  }

+  for (auto &named_symbol : grammar.variables_to_inline) {  // convert each inline rule name into an interned symbol
+    auto symbol = interner.intern_symbol(named_symbol);
+    if (symbol != rules::NONE()) {  // NOTE(review): names that intern to NONE are dropped here without an error -- presumably undefined rule names; confirm this is intended
+      result.variables_to_inline.insert(symbol);
+    }
+  }
+
  return {result, CompileError::none()};
 }

--- a/src/compiler/prepare_grammar/interned_grammar.h
+++ b/src/compiler/prepare_grammar/interned_grammar.h
@ -15,6 +15,7 @@ struct InternedGrammar {
  std::vector<rules::Rule> extra_tokens;
  std::set<std::set<rules::Symbol>> expected_conflicts;
  std::vector<Variable> external_tokens;
+  std::set<rules::Symbol> variables_to_inline;
 };

 }  // namespace prepare_grammar
--- a/src/compiler/syntax_grammar.h
+++ b/src/compiler/syntax_grammar.h
@ -74,6 +74,7 @@ struct SyntaxGrammar {
  std::set<rules::Symbol> extra_tokens;
  std::set<ConflictSet> expected_conflicts;
  std::vector<ExternalToken> external_tokens;
+  std::set<rules::Symbol> variables_to_inline;
 };

 }  // namespace tree_sitter
--- a/test/fixtures/test_grammars/inline_rules/corpus.txt
+++ b/test/fixtures/test_grammars/inline_rules/corpus.txt
@ -0,0 +1,11 @@
+==================================
+Expressions
+==================================
+
+1 + 2 * 3;
+
+---
+
+(statement (sum
+  (number)
+  (product (number) (number))))
--- a/test/fixtures/test_grammars/inline_rules/grammar.json
+++ b/test/fixtures/test_grammars/inline_rules/grammar.json
@ -0,0 +1,68 @@
+{
+  "name": "inline_rules",
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"}
+  ],
+
+  "inline": [
+    "expression"
+  ],
+
+  "rules": {
+    "statement": {
+      "type": "SEQ",
+      "members": [
+        {"type": "SYMBOL", "name": "expression"},
+        {"type": "STRING", "value": ";"}
+      ]
+    },
+
+    "expression": {
+      "type": "CHOICE",
+      "members": [
+        {"type": "SYMBOL", "name": "sum"},
+        {"type": "SYMBOL", "name": "product"},
+        {"type": "SYMBOL", "name": "number"},
+        {"type": "SYMBOL", "name": "parenthesized_expression"}
+      ]
+    },
+
+    "parenthesized_expression": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "("},
+        {"type": "SYMBOL", "name": "expression"},
+        {"type": "STRING", "value": ")"}
+      ]
+    },
+
+    "sum": {
+      "type": "PREC_LEFT",
+      "value": 0,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "expression"},
+          {"type": "STRING", "value": "+"},
+          {"type": "SYMBOL", "name": "expression"}
+        ]
+      }
+    },
+
+    "product": {
+      "type": "PREC_LEFT",
+      "value": 2,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "expression"},
+          {"type": "STRING", "value": "*"},
+          {"type": "SYMBOL", "name": "expression"}
+        ]
+      }
+    },
+
+    "number": {"type": "PATTERN", "value": "\\d+"}
+  }
+}