Merge pull request #66 from tree-sitter/handle-unused-tokens

Handle unused tokens
This commit is contained in:
Max Brunsfeld 2017-03-09 21:20:13 -08:00 committed by GitHub
commit 12d2a9d93f
5 changed files with 114 additions and 29 deletions

View file

@ -126,10 +126,16 @@ class CCodeGenerator {
}
void add_stats() {
size_t token_count = 1 + lexical_grammar.variables.size();
for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
if (external_token.corresponding_internal_token == rules::NONE()) {
size_t token_count = 0;
for (const auto &entry : parse_table.symbols) {
const Symbol &symbol = entry.first;
if (symbol.is_token()) {
token_count++;
} else if (symbol.is_external()) {
const ExternalToken &external_token = syntax_grammar.external_tokens[symbol.index];
if (external_token.corresponding_internal_token == rules::NONE()) {
token_count++;
}
}
}

View file

@ -96,32 +96,30 @@ static CompileError extra_token_error(const string &message) {
}
tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
const InternedGrammar &grammar) {
const InternedGrammar &grammar
) {
InitialSyntaxGrammar syntax_grammar;
LexicalGrammar lexical_grammar;
SymbolReplacer symbol_replacer;
TokenExtractor extractor;
/*
* First, extract all of the grammar's tokens into the lexical grammar.
*/
// First, extract all of the grammar's tokens into the lexical grammar.
vector<Variable> processed_variables;
for (const Variable &variable : grammar.variables)
for (const Variable &variable : grammar.variables) {
processed_variables.push_back(Variable{
variable.name,
variable.type,
extractor.apply(variable.rule)
});
}
lexical_grammar.variables = extractor.tokens;
/*
* If a variable's entire rule was extracted as a token and that token didn't
* appear within any other rule, then remove that variable from the syntax
* grammar, giving its name to the token in the lexical grammar. Any symbols
* that pointed to that variable will need to be updated to point to the
* variable in the lexical grammar. Symbols that pointed to later variables
* will need to have their indices decremented.
*/
// If a variable's entire rule was extracted as a token and that token didn't
// appear within any other rule, then remove that variable from the syntax
// grammar, giving its name to the token in the lexical grammar. Any symbols
// that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented.
size_t i = 0;
for (const Variable &variable : processed_variables) {
auto symbol = variable.rule->as<Symbol>();
@ -135,11 +133,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
i++;
}
/*
* Perform any replacements of symbols needed based on the previous step.
*/
for (Variable &variable : syntax_grammar.variables)
// Perform any replacements of symbols needed based on the previous step.
for (Variable &variable : syntax_grammar.variables) {
variable.rule = symbol_replacer.apply(variable.rule);
}
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
@ -149,13 +146,11 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
/*
* The grammar's extra tokens can be either token rules or symbols
* pointing to token rules. If they are symbols, then they'll be handled by
// the parser; add them to the syntax grammar's ubiquitous tokens. If they
* are anonymous rules, they can be handled by the lexer; add them to the
* lexical grammar's separator rules.
*/
// The grammar's extra tokens can be either token rules or symbols
// pointing to token rules. If they are symbols, then they'll be handled by
// the parser; add them to the syntax grammar's ubiqutous tokens. If they
// are anonymous rules, they can be handled by the lexer; add them to the
// lexical grammar's separator rules.
for (const rule_ptr &rule : grammar.extra_tokens) {
int i = 0;
bool used_elsewhere_in_grammar = false;
@ -167,8 +162,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
i++;
}
if (used_elsewhere_in_grammar)
if (used_elsewhere_in_grammar) {
continue;
}
if (is_token(rule)) {
lexical_grammar.separators.push_back(rule);
@ -205,7 +201,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
);
}
syntax_grammar.external_tokens.push_back({
syntax_grammar.external_tokens.push_back(ExternalToken{
external_token.name,
external_token.type,
internal_token

View file

@ -0,0 +1,9 @@
=========================
the language
=========================
E F I J
---
(a (d (e) (f)) (h (i) (j)))

View file

@ -0,0 +1,73 @@
{
"name": "unused_rules",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"a": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "d"},
{"type": "SYMBOL", "name": "h"}
]
},
"b": {
"type": "STRING",
"value": "B"
},
"c": {
"type": "STRING",
"value": "C"
},
"d": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "e"},
{"type": "SYMBOL", "name": "f"}
]
},
"e": {
"type": "STRING",
"value": "E"
},
"f": {
"type": "STRING",
"value": "F"
},
"g": {
"type": "STRING",
"value": "G"
},
"h": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "i"},
{"type": "SYMBOL", "name": "j"}
]
},
"i": {
"type": "STRING",
"value": "I"
},
"j": {
"type": "STRING",
"value": "J"
},
"k": {
"type": "STRING",
"value": "K"
}
}
}

View file

@ -0,0 +1 @@
The generated parsers use the grammar's token count to distinguish between terminal and non-terminal symbols. When the grammar has unused tokens, these tokens don't appear in the parser, so they need to be omitted from the token count.