In compiler, distinguish between anonymous tokens and hidden rules
parent 4b270c8604
commit 5982b77c97
46 changed files with 41131 additions and 40884 deletions
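The hunks below replace the Symbol option bit flags and the separate aux_rules vectors with a single RuleEntry record that carries an explicit entry type. A minimal C++ sketch of the data model this commit moves to, assembled from the prepared_grammar.h and rules/symbol.h hunks further down (the Rule and rule_ptr stand-ins here are placeholders, not the real tree-sitter types):

#include <memory>
#include <string>
#include <vector>

struct Rule {};                              // placeholder for tree_sitter::rules::Rule
using rule_ptr = std::shared_ptr<Rule>;

// Every rule in either grammar now records how it should surface in the output.
enum RuleEntryType {
  RuleEntryTypeNamed,      // ordinary named rule
  RuleEntryTypeAnonymous,  // anonymous token, e.g. a string literal
  RuleEntryTypeHidden,     // hidden rule, e.g. a pattern or a generated repeat helper
};

struct RuleEntry {
  std::string name;
  rule_ptr rule;
  RuleEntryType type;
};

// A symbol is now just an index plus a flag saying which grammar it points into,
// replacing the old SymbolOptionToken / SymbolOptionAuxiliary bit flags.
struct Symbol {
  int index;
  bool is_token;
};

struct SyntaxGrammar  { std::vector<RuleEntry> rules; };
struct LexicalGrammar { std::vector<RuleEntry> rules; std::vector<rule_ptr> separators; };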
@@ -10,7 +10,7 @@
 #include "compiler/build_tables/get_metadata.h"
 #include "compiler/build_tables/lex_item.h"
 #include "compiler/parse_table.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/built_in_symbols.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/metadata.h"
@@ -63,9 +63,9 @@ class LexTableBuilder {
       result.insert(
           LexItem(symbol, after_separators(CharacterSet().include(0).copy())));
-    else if (symbol.is_token())
-      result.insert(
-          LexItem(symbol, after_separators(lex_grammar.rule(symbol))));
+    else if (symbol.is_token)
+      result.insert(LexItem(
+          symbol, after_separators(lex_grammar.rules[symbol.index].rule)));
     }
     return result;
   }
@@ -11,8 +11,7 @@
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/build_tables/get_completion_status.h"
 #include "compiler/build_tables/get_metadata.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/built_in_symbols.h"

@@ -48,9 +47,8 @@ class ParseTableBuilder {
       conflict_manager(grammar) {}

   pair<ParseTable, const GrammarError *> build() {
-    auto start_symbol = grammar.rules.empty()
-                          ? make_shared<Symbol>(0, rules::SymbolOptionToken)
-                          : make_shared<Symbol>(0);
+    auto start_symbol = grammar.rules.empty() ? make_shared<Symbol>(0, true)
+                                              : make_shared<Symbol>(0);
     ParseItem start_item(rules::START(), start_symbol, {});
     add_parse_state(
       item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar));
@@ -260,10 +258,10 @@ class ParseTableBuilder {
         return "END_OF_INPUT";
       else
         return "";
-    } else if (symbol.is_token())
-      return lexical_grammar.rule_name(symbol);
+    } else if (symbol.is_token)
+      return lexical_grammar.rules[symbol.index].name;
     else
-      return grammar.rule_name(symbol);
+      return grammar.rules[symbol.index].name;
   }

   string action_description(const ParseAction &action) const {
@@ -1,7 +1,6 @@
 #include "compiler/build_tables/build_lex_table.h"
 #include "compiler/build_tables/build_parse_table.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace build_tables {
@@ -1,6 +1,6 @@
 #include "compiler/build_tables/first_symbols.h"
 #include "compiler/build_tables/rule_can_be_blank.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/metadata.h"
 #include "compiler/rules/seq.h"
@@ -28,8 +28,8 @@ class FirstSymbols : public rules::RuleFn<set<Symbol>> {
       return set<Symbol>();

     set<Symbol> result({ *rule });
-    if (!rule->is_token()) {
-      set<Symbol> &&symbols = apply(grammar->rule(*rule));
+    if (!rule->is_token) {
+      set<Symbol> &&symbols = apply(grammar->rules[rule->index].rule);
       result.insert(symbols.begin(), symbols.end());
     }

@@ -7,7 +7,7 @@
 #include "compiler/build_tables/rule_transitions.h"
 #include "compiler/build_tables/rule_can_be_blank.h"
 #include "compiler/build_tables/item.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace build_tables {
@@ -41,7 +41,7 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
     const Symbol &symbol = pair.first;
     const rule_ptr &next_rule = pair.second;

-    if (symbol.is_token() || symbol.is_built_in())
+    if (symbol.is_token || symbol.is_built_in())
       continue;

     set<Symbol> next_lookahead_symbols = first_symbols(next_rule, grammar);
@@ -49,8 +49,9 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item,
       next_lookahead_symbols.insert(lookahead_symbols.begin(),
                                     lookahead_symbols.end());

-      items_to_process.push_back({ ParseItem(symbol, grammar.rule(symbol), {}),
-                                   next_lookahead_symbols });
+      items_to_process.push_back(
+        { ParseItem(symbol, grammar.rules[symbol.index].rule, {}),
+          next_lookahead_symbols });
     }
   }
@@ -4,7 +4,7 @@
 #include "compiler/build_tables/merge_transitions.h"
 #include "compiler/build_tables/parse_item.h"
 #include "compiler/build_tables/rule_transitions.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"

 namespace tree_sitter {
@@ -2,7 +2,7 @@
 #define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_

 #include "tree_sitter/compiler.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -3,8 +3,7 @@

 #include <utility>
 #include "tree_sitter/compiler.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/build_tables/parse_item.h"

 namespace tree_sitter {
@@ -1,7 +1,7 @@
 #include "compiler/build_tables/rule_can_be_blank.h"
 #include <set>
 #include "tree_sitter/compiler.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/seq.h"
@@ -55,7 +55,7 @@ class CanBeBlankRecursive : public CanBeBlank {
   bool apply_to(const rules::Symbol *rule) {
     if (visited_symbols.find(*rule) == visited_symbols.end()) {
       visited_symbols.insert(*rule);
-      return !rule->is_token() && apply(grammar->rule(*rule));
+      return !rule->is_token && apply(grammar->rules[rule->index].rule);
     } else {
       return false;
     }
@@ -2,8 +2,7 @@
 #include "compiler/prepare_grammar/prepare_grammar.h"
 #include "compiler/build_tables/build_tables.h"
 #include "compiler/generate_code/c_code.h"
-#include "compiler/syntax_grammar.h"
-#include "compiler/lexical_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -7,8 +7,7 @@
 #include "compiler/generate_code/c_code.h"
 #include "compiler/lex_table.h"
 #include "compiler/parse_table.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/built_in_symbols.h"
 #include "compiler/util/string_helpers.h"

@@ -142,7 +141,7 @@ class CCodeGenerator {
     indent([&]() {
       for (const auto &symbol : parse_table.symbols)
         if (!symbol.is_built_in() &&
-            (symbol.is_auxiliary() || rule_name(symbol)[0] == '_'))
+            (is_auxiliary(symbol) || rule_name(symbol)[0] == '_'))
           line("[" + symbol_id(symbol) + "] = 1,");
     });
     line("};");
@@ -329,7 +328,7 @@ class CCodeGenerator {
       return "";
     } else {
       string name = sanitize_name(rule_name(symbol));
-      if (symbol.is_auxiliary())
+      if (is_auxiliary(symbol))
         return "aux_sym_" + name;
       else
         return "sym_" + name;
@@ -349,9 +348,20 @@ class CCodeGenerator {
     }
   }

+  bool is_auxiliary(const rules::Symbol &symbol) {
+    if (symbol.is_token) {
+      return lexical_grammar.rules[symbol.index].type != RuleEntryTypeNamed;
+    } else {
+      return syntax_grammar.rules[symbol.index].type != RuleEntryTypeNamed;
+    }
+  }
+
   string rule_name(const rules::Symbol &symbol) {
-    return symbol.is_token() ? lexical_grammar.rule_name(symbol)
-                             : syntax_grammar.rule_name(symbol);
+    if (symbol.is_token) {
+      return lexical_grammar.rules[symbol.index].name;
+    } else {
+      return syntax_grammar.rules[symbol.index].name;
+    }
   }

   bool reduce_action_is_fragile(const ParseAction &action) const {
@@ -394,15 +404,14 @@ class CCodeGenerator {
       if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
          ('0' <= c && c <= '9') || (c == '_')) {
         stripped_name += c;
-        continue;
-      }
-
-      auto replacement = REPLACEMENTS.find(c);
-      if (replacement != REPLACEMENTS.end()) {
-        if (stripped_name[stripped_name.size() - 1] != '_')
-          stripped_name += "_";
-        stripped_name += replacement->second;
-        continue;
+      } else {
+        auto replacement = REPLACEMENTS.find(c);
+        size_t i = stripped_name.size();
+        if (replacement != REPLACEMENTS.end()) {
+          if (i > 0 && stripped_name[i - 1] != '_')
+            stripped_name += "_";
+          stripped_name += replacement->second;
+        }
       }
     }

@@ -1,24 +0,0 @@
-#include "compiler/lexical_grammar.h"
-#include <vector>
-#include <string>
-#include <utility>
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-using std::string;
-using std::pair;
-using std::vector;
-using std::set;
-
-const rule_ptr &LexicalGrammar::rule(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].second
-                               : rules[symbol.index].second;
-}
-
-const string &LexicalGrammar::rule_name(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].first
-                               : rules[symbol.index].first;
-}
-
-} // namespace tree_sitter
@@ -1,24 +0,0 @@
-#ifndef COMPILER_LEXICAL_GRAMMAR_H_
-#define COMPILER_LEXICAL_GRAMMAR_H_
-
-#include <vector>
-#include <string>
-#include <utility>
-#include "tree_sitter/compiler.h"
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-class LexicalGrammar {
- public:
-  const std::string &rule_name(const rules::Symbol &symbol) const;
-  const rule_ptr &rule(const rules::Symbol &symbol) const;
-
-  std::vector<std::pair<std::string, rule_ptr>> rules;
-  std::vector<std::pair<std::string, rule_ptr>> aux_rules;
-  std::vector<rule_ptr> separators;
-};
-
-} // namespace tree_sitter
-
-#endif // COMPILER_LEXICAL_GRAMMAR_H_
@@ -2,7 +2,7 @@
 #include <vector>
 #include <string>
 #include <utility>
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/seq.h"
 #include "compiler/rules/symbol.h"
@@ -40,12 +40,14 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     size_t index = aux_rules.size();
     string helper_rule_name =
       rule_name + string("_repeat") + to_string(++repeat_count);
-    Symbol repeat_symbol(offset + index, rules::SymbolOptionAuxiliary);
+    Symbol repeat_symbol(offset + index);
     existing_repeats.push_back({ rule->copy(), repeat_symbol });
-    aux_rules.push_back(
-      { helper_rule_name,
-        Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
-                                                 make_shared<Blank>() }) }) });
+    aux_rules.push_back({
+      helper_rule_name,
+      Seq::build({ inner_rule, Choice::build({ repeat_symbol.copy(),
+                                               make_shared<Blank>() }) }),
+      RuleEntryTypeHidden,
+    });
     return repeat_symbol.copy();
   }

@@ -62,22 +64,21 @@ class ExpandRepeats : public rules::IdentityRuleFn {
     return apply(rule);
   }

-  vector<pair<string, rule_ptr>> aux_rules;
+  vector<RuleEntry> aux_rules;
 };

 SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) {
   SyntaxGrammar result;
-  result.aux_rules = grammar.aux_rules;
+  result.rules = grammar.rules;
   result.ubiquitous_tokens = grammar.ubiquitous_tokens;
   result.expected_conflicts = grammar.expected_conflicts;

-  ExpandRepeats expander(result.aux_rules.size());
-  for (auto &pair : grammar.rules)
-    result.rules.push_back(
-      { pair.first, expander.expand(pair.second, pair.first) });
+  ExpandRepeats expander(result.rules.size());
+  for (auto &rule_entry : result.rules)
+    rule_entry.rule = expander.expand(rule_entry.rule, rule_entry.name);

-  result.aux_rules.insert(result.aux_rules.end(), expander.aux_rules.begin(),
-                          expander.aux_rules.end());
+  result.rules.insert(result.rules.end(), expander.aux_rules.begin(),
+                      expander.aux_rules.end());
   return result;
 }

@@ -2,12 +2,14 @@
 #include <vector>
 #include <string>
 #include <utility>
-#include "compiler/lexical_grammar.h"
+#include <map>
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/pattern.h"
 #include "compiler/rules/string.h"
 #include "compiler/rules/blank.h"
 #include "compiler/rules/seq.h"
+#include "compiler/rules/metadata.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/prepare_grammar/parse_regex.h"
 #include "utf8proc.h"
@@ -17,10 +19,12 @@ namespace prepare_grammar {

 using std::string;
 using std::vector;
+using std::map;
 using std::pair;
 using std::make_shared;
 using rules::String;
 using rules::Pattern;
+using rules::Metadata;

 class ExpandTokens : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;
@@ -40,7 +44,11 @@ class ExpandTokens : public rules::IdentityRuleFn {
       elements.push_back(rules::CharacterSet().include(el).copy());
     }

-    return rules::Seq::build(elements);
+    return make_shared<rules::Metadata>(
+      rules::Seq::build(elements),
+      std::map<rules::MetadataKey, int>({
+        { rules::IS_TOKEN, 1 }, { rules::PRECEDENCE, 1 },
+      }));
   }

   rule_ptr apply_to(const Pattern *rule) {
@@ -60,18 +68,11 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
   LexicalGrammar result;
   ExpandTokens expander;

-  for (auto &pair : grammar.rules) {
-    auto rule = expander.apply(pair.second);
+  for (auto &entry : grammar.rules) {
+    auto rule = expander.apply(entry.rule);
     if (expander.error)
       return { result, expander.error };
-    result.rules.push_back({ pair.first, rule });
-  }
-
-  for (auto &pair : grammar.aux_rules) {
-    auto rule = expander.apply(pair.second);
-    if (expander.error)
-      return { result, expander.error };
-    result.aux_rules.push_back({ pair.first, rule });
+    result.rules.push_back({ entry.name, rule, entry.type });
   }

   for (auto &sep : grammar.separators) {
@@ -81,9 +82,7 @@ pair<LexicalGrammar, const GrammarError *> expand_tokens(
     result.separators.push_back(rule);
   }

-  return {
-    result, nullptr,
-  };
+  return { result, nullptr };
 }

 } // namespace prepare_grammar
@@ -4,8 +4,7 @@
 #include <set>
 #include <string>
 #include "tree_sitter/compiler.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"
 #include "compiler/rules/visitor.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/string.h"
@@ -21,27 +20,15 @@ using std::dynamic_pointer_cast;
 using std::make_shared;
 using std::make_tuple;
 using std::map;
-using std::pair;
 using std::set;
 using std::string;
 using std::tuple;
 using std::vector;
 using rules::Symbol;
-using rules::SymbolOptionToken;
-using rules::SymbolOptionAuxToken;

 class SymbolReplacer : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;

-  int new_index_for_symbol(const Symbol &symbol) {
-    int result = symbol.index;
-    for (const auto &pair : replacements)
-      if (pair.first.index < symbol.index &&
-          pair.first.is_auxiliary() == symbol.is_auxiliary())
-        result--;
-    return result;
-  }
-
   rule_ptr apply_to(const Symbol *rule) {
     return replace_symbol(*rule).copy();
   }
@@ -49,54 +36,64 @@ class SymbolReplacer : public rules::IdentityRuleFn {
  public:
  map<Symbol, Symbol> replacements;

-  Symbol replace_symbol(const Symbol &rule) {
-    if (rule.is_built_in())
-      return rule;
-    auto replacement_pair = replacements.find(rule);
+  Symbol replace_symbol(const Symbol &symbol) {
+    if (symbol.is_built_in() || symbol.is_token)
+      return symbol;
+
+    auto replacement_pair = replacements.find(symbol);
     if (replacement_pair != replacements.end())
       return replacement_pair->second;
-    else
-      return Symbol(new_index_for_symbol(rule), rule.options);
+
+    int new_index = symbol.index;
+    for (const auto &pair : replacements)
+      if (pair.first.index < symbol.index)
+        new_index--;
+    return Symbol(new_index);
  }
 };

 class TokenExtractor : public rules::IdentityRuleFn {
-  rule_ptr apply_to_token(const Rule *input) {
-    auto rule = input->copy();
+  using rules::IdentityRuleFn::apply_to;
+
+  rule_ptr apply_to_token(const Rule *input, RuleEntryType entry_type) {
     for (size_t i = 0; i < tokens.size(); i++)
-      if (tokens[i].second->operator==(*rule)) {
+      if (tokens[i].rule->operator==(*input)) {
        token_usage_counts[i]++;
-        return make_shared<Symbol>(i, SymbolOptionAuxToken);
+        return make_shared<Symbol>(i, true);
      }

+    rule_ptr rule = input->copy();
    size_t index = tokens.size();
-    tokens.push_back({ token_description(rule), rule });
+    tokens.push_back({
+      token_description(rule), rule, entry_type,
+    });
    token_usage_counts.push_back(1);
-    return make_shared<Symbol>(index, SymbolOptionAuxToken);
+    return make_shared<Symbol>(index, true);
  }

-  rule_ptr default_apply(const Rule *rule) {
-    auto result = rule->copy();
-    if (is_token(result))
-      return apply_to_token(rule);
-    else
-      return result;
+  rule_ptr apply_to(const rules::String *rule) {
+    return apply_to_token(rule, RuleEntryTypeAnonymous);
  }

+  rule_ptr apply_to(const rules::Pattern *rule) {
+    return apply_to_token(rule, RuleEntryTypeHidden);
+  }
+
  rule_ptr apply_to(const rules::Metadata *rule) {
-    if (is_token(rule->copy()))
-      return apply_to_token(rule);
+    if (rule->value_for(rules::IS_TOKEN) > 0)
+      return apply_to_token(rule->rule.get(), RuleEntryTypeHidden);
    else
      return rules::IdentityRuleFn::apply_to(rule);
  }

 public:
  vector<size_t> token_usage_counts;
-  vector<pair<string, rule_ptr>> tokens;
+  vector<RuleEntry> tokens;
 };

-static const GrammarError *ubiq_token_err(const string &msg) {
+static const GrammarError *ubiq_token_err(const string &message) {
  return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
-                          "Not a token: " + msg);
+                          "Not a token: " + message);
 }

@@ -106,51 +103,43 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
   SymbolReplacer symbol_replacer;
   TokenExtractor extractor;

-  vector<pair<string, rule_ptr>> extracted_rules;
-  for (auto &pair : grammar.rules)
-    extracted_rules.push_back({ pair.first, extractor.apply(pair.second) });
+  /*
+   * First, extract all of the grammar's tokens into the lexical grammar.
+   */
+  vector<RuleEntry> processed_rules;
+  for (const auto &pair : grammar.rules)
+    processed_rules.push_back({
+      pair.first, extractor.apply(pair.second), RuleEntryTypeNamed,
+    });
+  lexical_grammar.rules = extractor.tokens;

+  /*
+   * If a rule's entire content was extracted as a token and that token didn't
+   * appear within any other rule, then remove that rule from the syntax
+   * grammar, giving its name to the token in the lexical grammar. Any symbols
+   * that pointed to that rule will need to be updated to point to the rule in
+   * the lexical grammar. Symbols that pointed to later rules will need to have
+   * their indices decremented.
+   */
   size_t i = 0;
-  for (auto &pair : extracted_rules) {
-    auto &rule = pair.second;
-    auto symbol = dynamic_pointer_cast<const Symbol>(rule);
-    if (symbol.get() && symbol->is_auxiliary() &&
+  for (const RuleEntry &entry : processed_rules) {
+    auto symbol = dynamic_pointer_cast<const Symbol>(entry.rule);
+    if (symbol.get() && symbol->is_token && !symbol->is_built_in() &&
         extractor.token_usage_counts[symbol->index] == 1) {
-      lexical_grammar.rules.push_back(
-        { pair.first, extractor.tokens[symbol->index].second });
       extractor.token_usage_counts[symbol->index] = 0;
-      symbol_replacer.replacements.insert(
-        { Symbol(i),
-          Symbol(lexical_grammar.rules.size() - 1, SymbolOptionToken) });
+      lexical_grammar.rules[symbol->index].type = entry.type;
+      lexical_grammar.rules[symbol->index].name = entry.name;
+      symbol_replacer.replacements.insert({ Symbol(i), *symbol });
     } else {
-      syntax_grammar.rules.push_back(pair);
+      syntax_grammar.rules.push_back(entry);
     }
     i++;
   }

-  for (auto &pair : syntax_grammar.rules)
-    pair.second = symbol_replacer.apply(pair.second);
-
-  lexical_grammar.aux_rules = extractor.tokens;
-
-  for (auto &rule : grammar.ubiquitous_tokens) {
-    if (is_token(rule)) {
-      lexical_grammar.separators.push_back(rule);
-    } else {
-      auto sym = dynamic_pointer_cast<const Symbol>(extractor.apply(rule));
-      if (!sym.get())
-        return make_tuple(syntax_grammar, lexical_grammar,
-                          ubiq_token_err(rule->to_string()));
-
-      Symbol symbol = symbol_replacer.replace_symbol(*sym);
-      if (!symbol.is_token())
-        return make_tuple(
-          syntax_grammar, lexical_grammar,
-          ubiq_token_err(syntax_grammar.rules[symbol.index].first));
-
-      syntax_grammar.ubiquitous_tokens.insert(symbol);
-    }
-  }
+  /*
+   * Perform any replacements of symbols needed based on the previous step.
+   */
+  for (RuleEntry &entry : syntax_grammar.rules)
+    entry.rule = symbol_replacer.apply(entry.rule);

   for (auto &symbol_set : grammar.expected_conflicts) {
     set<Symbol> new_symbol_set;
@@ -159,6 +148,33 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
     syntax_grammar.expected_conflicts.insert(new_symbol_set);
   }

+  /*
+   * The grammar's ubiquitous tokens can be either token rules or symbols
+   * pointing to token rules. If they are symbols, then they'll be handled by
+   * the parser; add them to the syntax grammar's ubiqutous tokens. If they
+   * are anonymous rules, they can be handled by the lexer; add them to the
+   * lexical grammar's separator rules.
+   */
+  for (const rule_ptr &rule : grammar.ubiquitous_tokens) {
+    if (is_token(rule)) {
+      lexical_grammar.separators.push_back(rule);
+      continue;
+    }
+
+    auto symbol = dynamic_pointer_cast<const Symbol>(rule);
+    if (!symbol.get())
+      return make_tuple(syntax_grammar, lexical_grammar,
+                        ubiq_token_err(rule->to_string()));
+
+    Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
+    if (!new_symbol.is_token)
+      return make_tuple(
+        syntax_grammar, lexical_grammar,
+        ubiq_token_err(syntax_grammar.rules[new_symbol.index].name));
+
+    syntax_grammar.ubiquitous_tokens.insert(new_symbol);
+  }
+
   return make_tuple(syntax_grammar, lexical_grammar, nullptr);
 }
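The TokenExtractor hunk above is where the anonymous/hidden distinction is actually decided: string rules become anonymous tokens, patterns and explicit token() metadata become hidden tokens, and a rule whose whole body was extracted keeps its own name. A small self-contained sketch of that mapping (RuleKind is a hypothetical stand-in for the real visitor dispatch, not a tree-sitter type):

enum RuleEntryType { RuleEntryTypeNamed, RuleEntryTypeAnonymous, RuleEntryTypeHidden };

// Hypothetical stand-in for the rule classes the visitor dispatches on.
enum class RuleKind { String, Pattern, TokenMetadata };

RuleEntryType entry_type_for(RuleKind kind) {
  switch (kind) {
    case RuleKind::String:        return RuleEntryTypeAnonymous;  // str("+") -> anonymous token
    case RuleKind::Pattern:       return RuleEntryTypeHidden;     // pattern("...") -> hidden token
    case RuleKind::TokenMetadata: return RuleEntryTypeHidden;     // token(...) wrapper -> hidden token
  }
  return RuleEntryTypeNamed;  // a rule extracted whole keeps its name and Named type
}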
@@ -3,8 +3,7 @@
 #include "compiler/prepare_grammar/extract_tokens.h"
 #include "compiler/prepare_grammar/intern_symbols.h"
 #include "compiler/prepare_grammar/prepare_grammar.h"
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {
 namespace prepare_grammar {
@@ -2,8 +2,7 @@
 #define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_

 #include <utility>
-#include "compiler/lexical_grammar.h"
-#include "compiler/syntax_grammar.h"
+#include "compiler/prepared_grammar.h"

 namespace tree_sitter {

@@ -5,6 +5,7 @@
 #include "compiler/rules/seq.h"
 #include "compiler/rules/choice.h"
 #include "compiler/rules/string.h"
+#include "compiler/rules/repeat.h"
 #include "compiler/rules/metadata.h"
 #include "compiler/util/string_helpers.h"

@@ -15,11 +16,12 @@ using std::string;

 class TokenDescription : public rules::RuleFn<string> {
   string apply_to(const rules::Pattern *rule) {
-    return "PAT_" + util::escape_string(rule->value);
+    is_trivial = false;
+    return rule->value;
   }

   string apply_to(const rules::String *rule) {
-    return "STR_" + util::escape_string(rule->value);
+    return rule->value;
   }

   string apply_to(const rules::Metadata *rule) {
@@ -27,19 +29,41 @@ class TokenDescription : public rules::RuleFn<string> {
   }

   string apply_to(const rules::Seq *rule) {
-    return "(seq " + apply(rule->left) + " " + apply(rule->right) + ")";
+    is_trivial = false;
+    return apply(rule->left) + apply(rule->right);
   }

+  string apply_to(const rules::Repeat *rule) {
+    is_trivial = false;
+    return apply(rule->content) + "*";
+  }
+
   string apply_to(const rules::Choice *rule) {
-    string result = "(choice";
-    for (auto &element : rule->elements)
-      result += " " + apply(element);
+    is_trivial = false;
+    string result = "(";
+    bool started = false;
+    for (auto &element : rule->elements) {
+      if (started)
+        result += "|";
+      result += apply(element);
+      started = true;
+    }
     return result + ")";
   }

+ public:
+  bool is_trivial;
+
+  TokenDescription() : is_trivial(true) {}
 };

-std::string token_description(const rule_ptr &rule) {
-  return TokenDescription().apply(rule);
+string token_description(const rule_ptr &rule) {
+  TokenDescription description;
+  string result = description.apply(rule);
+  if (description.is_trivial)
+    return result;
+  else
+    return "/" + result + "/";
 }

 } // namespace prepare_grammar
src/compiler/prepared_grammar.h (new file, 39 lines)
@@ -0,0 +1,39 @@
+#ifndef COMPILER_PREPARED_GRAMMAR_H_
+#define COMPILER_PREPARED_GRAMMAR_H_
+
+#include <vector>
+#include <string>
+#include <set>
+#include "tree_sitter/compiler.h"
+#include "compiler/rules/symbol.h"
+
+namespace tree_sitter {
+
+enum RuleEntryType {
+  RuleEntryTypeNamed,
+  RuleEntryTypeAnonymous,
+  RuleEntryTypeHidden,
+};
+
+struct RuleEntry {
+  std::string name;
+  rule_ptr rule;
+  RuleEntryType type;
+};
+
+class SyntaxGrammar {
+ public:
+  std::vector<RuleEntry> rules;
+  std::set<rules::Symbol> ubiquitous_tokens;
+  std::set<std::set<rules::Symbol>> expected_conflicts;
+};
+
+class LexicalGrammar {
+ public:
+  std::vector<RuleEntry> rules;
+  std::vector<rule_ptr> separators;
+};
+
+} // namespace tree_sitter
+
+#endif // COMPILER_PREPARED_GRAMMAR_H_
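With both grammars storing plain RuleEntry vectors, the lookup pattern repeated throughout this commit becomes a straight index into whichever grammar the symbol's is_token flag selects. A brief sketch, assuming the types declared in the new header above:

// Resolve a symbol against the merged grammars (sketch only).
const RuleEntry &entry_for(const Symbol &symbol, const SyntaxGrammar &syntax,
                           const LexicalGrammar &lexical) {
  return symbol.is_token ? lexical.rules[symbol.index]
                         : syntax.rules[symbol.index];
}

// Per the c_code.cc hunks, anything that is not a named rule (anonymous token
// or hidden helper) is treated as auxiliary when generating symbol identifiers.
bool is_auxiliary(const RuleEntry &entry) {
  return entry.type != RuleEntryTypeNamed;
}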
@@ -4,11 +4,11 @@ namespace tree_sitter {
 namespace rules {

 Symbol END_OF_INPUT() {
-  return Symbol(-1, SymbolOptionToken);
+  return Symbol(-1, true);
 }

 Symbol ERROR() {
-  return Symbol(-2, SymbolOptionToken);
+  return Symbol(-2, true);
 }

 Symbol START() {
@@ -52,7 +52,7 @@ rule_ptr pattern(const string &value) {
 }

 rule_ptr str(const string &value) {
-  return token(prec(1, make_shared<rules::String>(value)));
+  return make_shared<rules::String>(value);
 }

 rule_ptr err(const rule_ptr &rule) {
@@ -10,16 +10,12 @@
 using std::to_string;
 using std::hash;

-SymbolOption SymbolOptionAuxToken =
-  SymbolOption(SymbolOptionToken | SymbolOptionAuxiliary);
+Symbol::Symbol(int index) : index(index), is_token(false) {}

-Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {}
-
-Symbol::Symbol(int index, SymbolOption options)
-    : index(index), options(options) {}
+Symbol::Symbol(int index, bool is_token) : index(index), is_token(is_token) {}

 bool Symbol::operator==(const Symbol &other) const {
-  return (other.index == index) && (other.options == options);
+  return (other.index == index) && (other.is_token == is_token);
 }

 bool Symbol::operator==(const Rule &rule) const {
@@ -28,7 +24,7 @@ bool Symbol::operator==(const Rule &rule) const {
 }

 size_t Symbol::hash_code() const {
-  return hash<int>()(index) ^ hash<int16_t>()(options);
+  return hash<int>()(index) ^ hash<bool>()(is_token);
 }

 rule_ptr Symbol::copy() const {
@@ -36,31 +32,22 @@ rule_ptr Symbol::copy() const {
 }

 string Symbol::to_string() const {
-  string name = (options & SymbolOptionAuxiliary) ? "aux_" : "";
-  name += (options & SymbolOptionToken) ? "token" : "sym";
+  string name = is_token ? "token" : "sym";
   return "(" + name + " " + std::to_string(index) + ")";
 }

 bool Symbol::operator<(const Symbol &other) const {
-  if (options < other.options)
+  if (!is_token && other.is_token)
     return true;
-  if (options > other.options)
+  if (is_token && !other.is_token)
     return false;
   return (index < other.index);
 }

-bool Symbol::is_token() const {
-  return options & SymbolOptionToken;
-}
-
 bool Symbol::is_built_in() const {
   return index < 0;
 }

-bool Symbol::is_auxiliary() const {
-  return options & SymbolOptionAuxiliary;
-}
-
 void Symbol::accept(Visitor *visitor) const {
   visitor->visit(this);
 }
@@ -7,17 +7,10 @@
 namespace tree_sitter {
 namespace rules {

-typedef enum {
-  SymbolOptionToken = 1 << 0,
-  SymbolOptionAuxiliary = 1 << 1,
-} SymbolOption;
-
-extern SymbolOption SymbolOptionAuxToken;
-
 class Symbol : public Rule {
  public:
   explicit Symbol(int index);
-  Symbol(int index, SymbolOption options);
+  Symbol(int index, bool is_token);

   bool operator==(const Symbol &other) const;
   bool operator==(const Rule &other) const;
@@ -28,12 +21,10 @@ class Symbol : public Rule {
   void accept(Visitor *visitor) const;

   bool operator<(const Symbol &other) const;
-  bool is_token() const;
   bool is_built_in() const;
-  bool is_auxiliary() const;

   int index;
-  SymbolOption options;
+  bool is_token;
 };

 } // namespace rules
@@ -1,21 +0,0 @@
-#include "compiler/syntax_grammar.h"
-#include <vector>
-#include <string>
-#include <utility>
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-using std::string;
-
-const rule_ptr &SyntaxGrammar::rule(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].second
-                               : rules[symbol.index].second;
-}
-
-const string &SyntaxGrammar::rule_name(const rules::Symbol &symbol) const {
-  return symbol.is_auxiliary() ? aux_rules[symbol.index].first
-                               : rules[symbol.index].first;
-}
-
-} // namespace tree_sitter
@@ -1,26 +0,0 @@
-#ifndef COMPILER_SYNTAX_GRAMMAR_H_
-#define COMPILER_SYNTAX_GRAMMAR_H_
-
-#include <vector>
-#include <string>
-#include <set>
-#include <utility>
-#include "tree_sitter/compiler.h"
-#include "compiler/rules/symbol.h"
-
-namespace tree_sitter {
-
-class SyntaxGrammar {
- public:
-  const std::string &rule_name(const rules::Symbol &symbol) const;
-  const rule_ptr &rule(const rules::Symbol &symbol) const;
-
-  std::vector<std::pair<std::string, rule_ptr>> rules;
-  std::vector<std::pair<std::string, rule_ptr>> aux_rules;
-  std::set<rules::Symbol> ubiquitous_tokens;
-  std::set<std::set<rules::Symbol>> expected_conflicts;
-};
-
-} // namespace tree_sitter
-
-#endif // COMPILER_SYNTAX_GRAMMAR_H_