Merge pull request #66 from tree-sitter/handle-unused-tokens

Handle unused tokens

Commit 12d2a9d93f
5 changed files with 114 additions and 29 deletions

@@ -126,10 +126,16 @@ class CCodeGenerator {
   }

   void add_stats() {
-    size_t token_count = 1 + lexical_grammar.variables.size();
-    for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
-      if (external_token.corresponding_internal_token == rules::NONE()) {
+    size_t token_count = 0;
+    for (const auto &entry : parse_table.symbols) {
+      const Symbol &symbol = entry.first;
+      if (symbol.is_token()) {
+        token_count++;
+      } else if (symbol.is_external()) {
+        const ExternalToken &external_token = syntax_grammar.external_tokens[symbol.index];
+        if (external_token.corresponding_internal_token == rules::NONE()) {
+          token_count++;
+        }
+      }
+    }
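
The old add_stats computed the token count as one more than the number of lexical variables, which overcounts whenever a grammar defines tokens that never reach the parse table. The new loop derives the count from the parse table itself, and it counts an external token only when that token has no corresponding internal token, so nothing is counted twice. A minimal sketch of the difference, using simplified, hypothetical stand-in types rather than the real compiler structures:

#include <cstddef>
#include <iostream>
#include <set>
#include <utility>

// Hypothetical, simplified stand-in for the compiler's symbol type.
struct Symbol {
  size_t index;
  bool is_token_;
  bool is_token() const { return is_token_; }
  bool operator<(const Symbol &other) const {
    return std::make_pair(index, is_token_) <
           std::make_pair(other.index, other.is_token_);
  }
};

int main() {
  size_t lexical_variable_count = 5;  // the grammar defines five tokens...

  // ...but only three of them ever appear in the generated parse table
  std::set<Symbol> parse_table_symbols = {
    {0, true}, {1, true}, {4, true},  // tokens that are actually used
    {0, false}, {1, false},           // non-terminals
  };

  size_t old_count = 1 + lexical_variable_count;  // counts unused tokens too
  size_t new_count = 0;
  for (const Symbol &symbol : parse_table_symbols)
    if (symbol.is_token())
      new_count++;

  std::cout << "old: " << old_count << ", new: " << new_count << "\n";
  // Prints "old: 6, new: 3"; the old formula overstates the token count.
}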

@@ -96,32 +96,30 @@ static CompileError extra_token_error(const string &message) {
 }

 tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
-  const InternedGrammar &grammar) {
+  const InternedGrammar &grammar
+) {
   InitialSyntaxGrammar syntax_grammar;
   LexicalGrammar lexical_grammar;
   SymbolReplacer symbol_replacer;
   TokenExtractor extractor;

-  /*
-   *  First, extract all of the grammar's tokens into the lexical grammar.
-   */
+  // First, extract all of the grammar's tokens into the lexical grammar.
   vector<Variable> processed_variables;
-  for (const Variable &variable : grammar.variables)
+  for (const Variable &variable : grammar.variables) {
     processed_variables.push_back(Variable{
       variable.name,
       variable.type,
       extractor.apply(variable.rule)
     });
+  }
   lexical_grammar.variables = extractor.tokens;

-  /*
-   * If a variable's entire rule was extracted as a token and that token didn't
-   * appear within any other rule, then remove that variable from the syntax
-   * grammar, giving its name to the token in the lexical grammar. Any symbols
-   * that pointed to that variable will need to be updated to point to the
-   * variable in the lexical grammar. Symbols that pointed to later variables
-   * will need to have their indices decremented.
-   */
+  // If a variable's entire rule was extracted as a token and that token didn't
+  // appear within any other rule, then remove that variable from the syntax
+  // grammar, giving its name to the token in the lexical grammar. Any symbols
+  // that pointed to that variable will need to be updated to point to the
+  // variable in the lexical grammar. Symbols that pointed to later variables
+  // will need to have their indices decremented.
   size_t i = 0;
   for (const Variable &variable : processed_variables) {
     auto symbol = variable.rule->as<Symbol>();
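
The comment above describes the renumbering that happens when a variable's whole rule migrates to the lexical grammar. A toy illustration of that index-shifting step, using plain integers in place of the compiler's Symbol and SymbolReplacer types (all names here are hypothetical):

#include <cstddef>
#include <iostream>
#include <set>

// Given the set of syntax-variable indices that were removed (because their
// entire rule became a token), remap a surviving variable's index.
size_t replace_index(size_t index, const std::set<size_t> &removed) {
  size_t decrement = 0;
  for (size_t removed_index : removed)
    if (removed_index < index)
      decrement++;  // each earlier removal shifts later variables down by one
  return index - decrement;
}

int main() {
  std::set<size_t> removed = {1, 3};  // variables 1 and 3 moved to the lexer
  for (size_t index : {0, 2, 4, 5})
    std::cout << index << " -> " << replace_index(index, removed) << "\n";
  // 0 -> 0, 2 -> 1, 4 -> 2, 5 -> 3
}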

@@ -135,11 +133,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     i++;
   }

-  /*
-   * Perform any replacements of symbols needed based on the previous step.
-   */
-  for (Variable &variable : syntax_grammar.variables)
+  // Perform any replacements of symbols needed based on the previous step.
+  for (Variable &variable : syntax_grammar.variables) {
     variable.rule = symbol_replacer.apply(variable.rule);
+  }

   for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
     ConflictSet new_conflict_set;

@@ -149,13 +146,11 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     syntax_grammar.expected_conflicts.insert(new_conflict_set);
   }

-  /*
-   * The grammar's extra tokens can be either token rules or symbols
-   * pointing to token rules. If they are symbols, then they'll be handled by
-   * the parser; add them to the syntax grammar's ubiqutous tokens. If they
-   * are anonymous rules, they can be handled by the lexer; add them to the
-   * lexical grammar's separator rules.
-   */
+  // The grammar's extra tokens can be either token rules or symbols
+  // pointing to token rules. If they are symbols, then they'll be handled by
+  // the parser; add them to the syntax grammar's ubiqutous tokens. If they
+  // are anonymous rules, they can be handled by the lexer; add them to the
+  // lexical grammar's separator rules.
   for (const rule_ptr &rule : grammar.extra_tokens) {
     int i = 0;
     bool used_elsewhere_in_grammar = false;

@@ -167,8 +162,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
       i++;
     }

-    if (used_elsewhere_in_grammar)
+    if (used_elsewhere_in_grammar) {
       continue;
+    }

     if (is_token(rule)) {
       lexical_grammar.separators.push_back(rule);

@@ -205,7 +201,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
       );
     }

-    syntax_grammar.external_tokens.push_back({
+    syntax_grammar.external_tokens.push_back(ExternalToken{
       external_token.name,
       external_token.type,
       internal_token
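
The extras-handling comment earlier in this file describes a two-way split: extras that are symbols go to the syntax grammar's ubiquitous tokens, while anonymous token rules go to the lexical grammar's separators. A rough sketch of that classification under simplified, hypothetical types (the real code works on rule_ptr values and tests is_token(rule)):

#include <cstddef>
#include <string>
#include <variant>
#include <vector>

// Hypothetical model: an "extra" is either a reference to a named token
// (a symbol) or an anonymous token rule such as a whitespace pattern.
struct SymbolRef { size_t index; };
struct AnonymousToken { std::string pattern; };
using ExtraToken = std::variant<SymbolRef, AnonymousToken>;

struct Grammars {
  std::vector<size_t> ubiquitous_tokens;  // handled by the parser
  std::vector<std::string> separators;    // handled by the lexer
};

void classify_extras(const std::vector<ExtraToken> &extras, Grammars &out) {
  for (const ExtraToken &extra : extras) {
    if (const SymbolRef *symbol = std::get_if<SymbolRef>(&extra)) {
      out.ubiquitous_tokens.push_back(symbol->index);
    } else {
      out.separators.push_back(std::get<AnonymousToken>(extra).pattern);
    }
  }
}

int main() {
  Grammars grammars;
  classify_extras({SymbolRef{2}, AnonymousToken{"\\s"}}, grammars);
  // grammars.ubiquitous_tokens == {2}; grammars.separators == {"\\s"}
}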

test/fixtures/test_grammars/unused_rules/corpus.txt (vendored, new file, 9 lines)

@@ -0,0 +1,9 @@
=========================
the language
=========================

E F I J

---

(a (d (e) (f)) (h (i) (j)))

test/fixtures/test_grammars/unused_rules/grammar.json (vendored, new file, 73 lines)

@@ -0,0 +1,73 @@
{
  "name": "unused_rules",

  "extras": [
    {"type": "PATTERN", "value": "\\s"}
  ],

  "rules": {
    "a": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "d"},
        {"type": "SYMBOL", "name": "h"}
      ]
    },

    "b": {
      "type": "STRING",
      "value": "B"
    },

    "c": {
      "type": "STRING",
      "value": "C"
    },

    "d": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "e"},
        {"type": "SYMBOL", "name": "f"}
      ]
    },

    "e": {
      "type": "STRING",
      "value": "E"
    },

    "f": {
      "type": "STRING",
      "value": "F"
    },

    "g": {
      "type": "STRING",
      "value": "G"
    },

    "h": {
      "type": "SEQ",
      "members": [
        {"type": "SYMBOL", "name": "i"},
        {"type": "SYMBOL", "name": "j"}
      ]
    },

    "i": {
      "type": "STRING",
      "value": "I"
    },

    "j": {
      "type": "STRING",
      "value": "J"
    },

    "k": {
      "type": "STRING",
      "value": "K"
    }
  }
}

test/fixtures/test_grammars/unused_rules/readme.md (vendored, new file, 1 line)

@@ -0,0 +1 @@
The generated parsers use the grammar's token count to distinguish between terminal and non-terminal symbols. When the grammar has unused tokens, these tokens don't appear in the parser, so they need to be omitted from the token count.
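
In the unused_rules fixture above, rules "b", "c", "g", and "k" are never reachable from the start rule "a", so the tokens B, C, G, and K never appear in the parse table; that is exactly the situation the add_stats change accounts for. As an illustration (hypothetical macro name and symbol numbering, not the exact generated output), a generated parser can distinguish terminals from non-terminals with a single comparison against the token count:

#include <cstdio>

// Hypothetical values mimicking what a generated parser might contain.
#define TOKEN_COUNT 4  // symbols 0..3 are terminals

typedef unsigned short TSSymbol;

static int symbol_is_terminal(TSSymbol symbol) {
  // Terminals are numbered below TOKEN_COUNT, non-terminals at or above it.
  return symbol < TOKEN_COUNT;
}

int main(void) {
  printf("%d\n", symbol_is_terminal(2));  // 1: a terminal
  printf("%d\n", symbol_is_terminal(5));  // 0: a non-terminal
  return 0;
}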