Merge pull request #66 from tree-sitter/handle-unused-tokens

Handle unused tokens
This commit is contained in:
Max Brunsfeld 2017-03-09 21:20:13 -08:00 committed by GitHub
commit 12d2a9d93f
5 changed files with 114 additions and 29 deletions

View file

@ -126,10 +126,16 @@ class CCodeGenerator {
}
void add_stats() {
size_t token_count = 1 + lexical_grammar.variables.size();
for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
if (external_token.corresponding_internal_token == rules::NONE()) {
size_t token_count = 0;
for (const auto &entry : parse_table.symbols) {
const Symbol &symbol = entry.first;
if (symbol.is_token()) {
token_count++;
} else if (symbol.is_external()) {
const ExternalToken &external_token = syntax_grammar.external_tokens[symbol.index];
if (external_token.corresponding_internal_token == rules::NONE()) {
token_count++;
}
}
}

View file

@ -96,32 +96,30 @@ static CompileError extra_token_error(const string &message) {
}
tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
const InternedGrammar &grammar) {
const InternedGrammar &grammar
) {
InitialSyntaxGrammar syntax_grammar;
LexicalGrammar lexical_grammar;
SymbolReplacer symbol_replacer;
TokenExtractor extractor;
/*
* First, extract all of the grammar's tokens into the lexical grammar.
*/
// First, extract all of the grammar's tokens into the lexical grammar.
vector<Variable> processed_variables;
for (const Variable &variable : grammar.variables)
for (const Variable &variable : grammar.variables) {
processed_variables.push_back(Variable{
variable.name,
variable.type,
extractor.apply(variable.rule)
});
}
lexical_grammar.variables = extractor.tokens;
/*
* If a variable's entire rule was extracted as a token and that token didn't
* appear within any other rule, then remove that variable from the syntax
* grammar, giving its name to the token in the lexical grammar. Any symbols
* that pointed to that variable will need to be updated to point to the
* variable in the lexical grammar. Symbols that pointed to later variables
* will need to have their indices decremented.
*/
// If a variable's entire rule was extracted as a token and that token didn't
// appear within any other rule, then remove that variable from the syntax
// grammar, giving its name to the token in the lexical grammar. Any symbols
// that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented.
size_t i = 0;
for (const Variable &variable : processed_variables) {
auto symbol = variable.rule->as<Symbol>();
@ -135,11 +133,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
i++;
}
/*
* Perform any replacements of symbols needed based on the previous step.
*/
for (Variable &variable : syntax_grammar.variables)
// Perform any replacements of symbols needed based on the previous step.
for (Variable &variable : syntax_grammar.variables) {
variable.rule = symbol_replacer.apply(variable.rule);
}
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
@ -149,13 +146,11 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
/*
* The grammar's extra tokens can be either token rules or symbols
* pointing to token rules. If they are symbols, then they'll be handled by
// the parser; add them to the syntax grammar's ubiquitous tokens. If they
* are anonymous rules, they can be handled by the lexer; add them to the
* lexical grammar's separator rules.
*/
// The grammar's extra tokens can be either token rules or symbols
// pointing to token rules. If they are symbols, then they'll be handled by
// the parser; add them to the syntax grammar's ubiqutous tokens. If they
// are anonymous rules, they can be handled by the lexer; add them to the
// lexical grammar's separator rules.
for (const rule_ptr &rule : grammar.extra_tokens) {
int i = 0;
bool used_elsewhere_in_grammar = false;
@ -167,8 +162,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
i++;
}
if (used_elsewhere_in_grammar)
if (used_elsewhere_in_grammar) {
continue;
}
if (is_token(rule)) {
lexical_grammar.separators.push_back(rule);
@ -205,7 +201,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
);
}
syntax_grammar.external_tokens.push_back({
syntax_grammar.external_tokens.push_back(ExternalToken{
external_token.name,
external_token.type,
internal_token

View file

@ -0,0 +1,9 @@
=========================
the language
=========================
E F I J
---
(a (d (e) (f)) (h (i) (j)))

View file

@ -0,0 +1,73 @@
{
"name": "unused_rules",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"a": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "d"},
{"type": "SYMBOL", "name": "h"}
]
},
"b": {
"type": "STRING",
"value": "B"
},
"c": {
"type": "STRING",
"value": "C"
},
"d": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "e"},
{"type": "SYMBOL", "name": "f"}
]
},
"e": {
"type": "STRING",
"value": "E"
},
"f": {
"type": "STRING",
"value": "F"
},
"g": {
"type": "STRING",
"value": "G"
},
"h": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "i"},
{"type": "SYMBOL", "name": "j"}
]
},
"i": {
"type": "STRING",
"value": "I"
},
"j": {
"type": "STRING",
"value": "J"
},
"k": {
"type": "STRING",
"value": "K"
}
}
}

View file

@ -0,0 +1 @@
The generated parsers use the grammar's token count to distinguish between terminal and non-terminal symbols. When the grammar has unused tokens, these tokens don't appear in the parser, so they need to be omitted from the token count.