diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 0bd01c29..111340c1 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -126,10 +126,16 @@ class CCodeGenerator { } void add_stats() { - size_t token_count = 1 + lexical_grammar.variables.size(); - for (const ExternalToken &external_token : syntax_grammar.external_tokens) { - if (external_token.corresponding_internal_token == rules::NONE()) { + size_t token_count = 0; + for (const auto &entry : parse_table.symbols) { + const Symbol &symbol = entry.first; + if (symbol.is_token()) { token_count++; + } else if (symbol.is_external()) { + const ExternalToken &external_token = syntax_grammar.external_tokens[symbol.index]; + if (external_token.corresponding_internal_token == rules::NONE()) { + token_count++; + } } } diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 32b524e3..ec821ecc 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -96,32 +96,30 @@ static CompileError extra_token_error(const string &message) { } tuple extract_tokens( - const InternedGrammar &grammar) { + const InternedGrammar &grammar +) { InitialSyntaxGrammar syntax_grammar; LexicalGrammar lexical_grammar; SymbolReplacer symbol_replacer; TokenExtractor extractor; - /* - * First, extract all of the grammar's tokens into the lexical grammar. - */ + // First, extract all of the grammar's tokens into the lexical grammar. 
vector processed_variables; - for (const Variable &variable : grammar.variables) + for (const Variable &variable : grammar.variables) { processed_variables.push_back(Variable{ variable.name, variable.type, extractor.apply(variable.rule) }); + } lexical_grammar.variables = extractor.tokens; - /* - * If a variable's entire rule was extracted as a token and that token didn't - * appear within any other rule, then remove that variable from the syntax - * grammar, giving its name to the token in the lexical grammar. Any symbols - * that pointed to that variable will need to be updated to point to the - * variable in the lexical grammar. Symbols that pointed to later variables - * will need to have their indices decremented. - */ + // If a variable's entire rule was extracted as a token and that token didn't + // appear within any other rule, then remove that variable from the syntax + // grammar, giving its name to the token in the lexical grammar. Any symbols + // that pointed to that variable will need to be updated to point to the + // variable in the lexical grammar. Symbols that pointed to later variables + // will need to have their indices decremented. size_t i = 0; for (const Variable &variable : processed_variables) { auto symbol = variable.rule->as(); @@ -135,11 +133,10 @@ tuple extract_tokens( i++; } - /* - * Perform any replacements of symbols needed based on the previous step. - */ - for (Variable &variable : syntax_grammar.variables) + // Perform any replacements of symbols needed based on the previous step. + for (Variable &variable : syntax_grammar.variables) { variable.rule = symbol_replacer.apply(variable.rule); + } for (const ConflictSet &conflict_set : grammar.expected_conflicts) { ConflictSet new_conflict_set; @@ -149,13 +146,11 @@ tuple extract_tokens( syntax_grammar.expected_conflicts.insert(new_conflict_set); } - /* - * The grammar's extra tokens can be either token rules or symbols - * pointing to token rules. 
If they are symbols, then they'll be handled by - * the parser; add them to the syntax grammar's ubiqutous tokens. If they - * are anonymous rules, they can be handled by the lexer; add them to the - * lexical grammar's separator rules. - */ + // The grammar's extra tokens can be either token rules or symbols + // pointing to token rules. If they are symbols, then they'll be handled by + // the parser; add them to the syntax grammar's ubiquitous tokens. If they + // are anonymous rules, they can be handled by the lexer; add them to the + // lexical grammar's separator rules. for (const rule_ptr &rule : grammar.extra_tokens) { int i = 0; bool used_elsewhere_in_grammar = false; @@ -167,8 +162,9 @@ tuple extract_tokens( i++; } - if (used_elsewhere_in_grammar) + if (used_elsewhere_in_grammar) { continue; + } if (is_token(rule)) { lexical_grammar.separators.push_back(rule); @@ -205,7 +201,7 @@ tuple extract_tokens( ); } - syntax_grammar.external_tokens.push_back({ + syntax_grammar.external_tokens.push_back(ExternalToken{ external_token.name, external_token.type, internal_token diff --git a/test/fixtures/test_grammars/unused_rules/corpus.txt b/test/fixtures/test_grammars/unused_rules/corpus.txt new file mode 100644 index 00000000..11fd569d --- /dev/null +++ b/test/fixtures/test_grammars/unused_rules/corpus.txt @@ -0,0 +1,9 @@ +========================= +the language +========================= + +E F I J + +--- + +(a (d (e) (f)) (h (i) (j))) \ No newline at end of file diff --git a/test/fixtures/test_grammars/unused_rules/grammar.json b/test/fixtures/test_grammars/unused_rules/grammar.json new file mode 100644 index 00000000..7ed2a0da --- /dev/null +++ b/test/fixtures/test_grammars/unused_rules/grammar.json @@ -0,0 +1,73 @@ +{ + "name": "unused_rules", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "a": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "d"}, + {"type": "SYMBOL", "name": "h"} + ] + }, + + "b": { + "type": "STRING", 
+ "value": "B" + }, + + "c": { + "type": "STRING", + "value": "C" + }, + + "d": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "e"}, + {"type": "SYMBOL", "name": "f"} + ] + }, + + "e": { + "type": "STRING", + "value": "E" + }, + + "f": { + "type": "STRING", + "value": "F" + }, + + "g": { + "type": "STRING", + "value": "G" + }, + + "h": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "i"}, + {"type": "SYMBOL", "name": "j"} + ] + }, + + "i": { + "type": "STRING", + "value": "I" + }, + + "j": { + "type": "STRING", + "value": "J" + }, + + "k": { + "type": "STRING", + "value": "K" + } + } +} \ No newline at end of file diff --git a/test/fixtures/test_grammars/unused_rules/readme.md b/test/fixtures/test_grammars/unused_rules/readme.md new file mode 100644 index 00000000..6390bdeb --- /dev/null +++ b/test/fixtures/test_grammars/unused_rules/readme.md @@ -0,0 +1 @@ +The generated parsers use the grammar's token count to distinguish between terminal and non-terminal symbols. When the grammar has unused tokens, these tokens don't appear in the parser, so they need to be omitted from the token count. \ No newline at end of file