diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc index 4ec27149..9142eab6 100644 --- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc +++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc @@ -3,8 +3,10 @@ #include "compiler/grammar.h" #include "compiler/rules/named_symbol.h" #include "compiler/rules/symbol.h" +#include "compiler/rules/built_in_symbols.h" #include "helpers/equals_pointer.h" #include "helpers/rule_helpers.h" +#include "helpers/stream_methods.h" START_TEST @@ -56,6 +58,32 @@ describe("intern_symbols", []() { AssertThat(result.first.extra_tokens.size(), Equals(1)); AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2))); }); + + it("records any rule names that match external token names", [&]() { + Grammar grammar{{ + { "x", choice({ sym("y"), sym("z") }) }, + { "y", sym("z") }, + { "z", str("stuff") } + }, {}, {}, { + "w", + "z" + }}; + + auto result = intern_symbols(grammar); + + AssertThat(result.first.external_tokens, Equals(vector({ + { + "w", + VariableTypeNamed, + rules::NONE() + }, + { + "z", + VariableTypeNamed, + Symbol(2, Symbol::NonTerminal) + } + }))) + }); }); END_TEST diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index b47363a0..a4b275ea 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -76,6 +76,11 @@ ostream &operator<<(ostream &stream, const ParseState &state) { return stream << string(">"); } +ostream &operator<<(ostream &stream, const ExternalToken &external_token) { + return stream << "{" << external_token.name << ", " << external_token.type << + "," << external_token.corresponding_internal_token << "}"; +} + ostream &operator<<(ostream &stream, const ProductionStep &step) { stream << "(symbol: " << step.symbol << ", precedence:" << to_string(step.precedence); stream << ", associativity: "; diff --git a/spec/helpers/stream_methods.h b/spec/helpers/stream_methods.h index 515060eb..28b201c3 100644 --- a/spec/helpers/stream_methods.h +++ b/spec/helpers/stream_methods.h @@ -97,6 +97,7 @@ struct AdvanceAction; struct AcceptTokenAction; class ParseAction; class ParseState; +struct ExternalToken; struct ProductionStep; struct PrecedenceRange; @@ -110,6 +111,7 @@ ostream &operator<<(ostream &, const AdvanceAction &); ostream &operator<<(ostream &, const AcceptTokenAction &); ostream &operator<<(ostream &, const ParseAction &); ostream &operator<<(ostream &, const ParseState &); +ostream &operator<<(ostream &, const ExternalToken &); ostream &operator<<(ostream &, const ProductionStep &); ostream &operator<<(ostream &, const PrecedenceRange &); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index ed034c1b..7c3601a3 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -77,7 +77,6 @@ class CCodeGenerator { vector> parse_table_entries; vector> external_token_id_sets; size_t next_parse_action_list_index; - map shared_token_indices; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -94,17 +93,6 @@ class CCodeGenerator { string code() { buffer = ""; - for (size_t i = 0; i < lexical_grammar.variables.size(); i++) { - const Variable &variable = lexical_grammar.variables[i]; - for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { - const ExternalToken &external_token = syntax_grammar.external_tokens[j]; - if (external_token.name == variable.name) { - shared_token_indices.insert({i, j}); - break; - } - } - } - add_includes(); add_warning_pragma(); add_stats(); @@ -138,16 +126,17 @@ class CCodeGenerator { } void add_stats() { + size_t token_count = 1 + lexical_grammar.variables.size(); + for (const ExternalToken &external_token : syntax_grammar.external_tokens) { + if (external_token.corresponding_internal_token == rules::NONE()) { + token_count++; + } + } + line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define TOKEN_COUNT " + to_string( - 1 + - lexical_grammar.variables.size() + - syntax_grammar.external_tokens.size() - shared_token_indices.size() - )); - line("#define EXTERNAL_TOKEN_COUNT " + to_string( - syntax_grammar.external_tokens.size() - )); + line("#define TOKEN_COUNT " + to_string(token_count)); + line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); line(); } @@ -233,6 +222,17 @@ class CCodeGenerator { void add_lex_modes_list() { add_external_scanner_state({}); + map external_tokens_by_corresponding_internal_token; + for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[j]; + if (external_token.corresponding_internal_token.index == i) { + external_tokens_by_corresponding_internal_token.insert({i, j}); + break; + } + } + } + line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { size_t state_id = 0; @@ -241,22 +241,23 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {.lex_state = "); add(to_string(state.lex_state_id)); - bool has_external_tokens = false; + bool needs_external_scanner = false; set external_token_indices; for (const auto &pair : state.terminal_entries) { Symbol symbol = pair.first; if (symbol.is_external()) { - has_external_tokens = true; + needs_external_scanner = true; external_token_indices.insert(symbol.index); } else if (symbol.is_token()) { - auto shared_token_entry = shared_token_indices.find(symbol.index); - if (shared_token_entry != shared_token_indices.end()) { - external_token_indices.insert(shared_token_entry->second); + auto corresponding_external_token = + external_tokens_by_corresponding_internal_token.find(symbol.index); + if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) { + external_token_indices.insert(corresponding_external_token->second); } } } - if (has_external_tokens) { + if (needs_external_scanner) { add(", .external_tokens = " + add_external_scanner_state(external_token_indices)); } diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index ace6294a..e84d028d 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -186,7 +186,13 @@ tuple extract_tokens( syntax_grammar.extra_tokens.insert(new_symbol); } - syntax_grammar.external_tokens = grammar.external_tokens; + for (const ExternalToken &external_token : grammar.external_tokens) { + syntax_grammar.external_tokens.push_back({ + external_token.name, + external_token.type, + symbol_replacer.replace_symbol(external_token.corresponding_internal_token) + }); + } return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); } diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index 06b4d430..0786982b 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -8,6 +8,7 @@ #include "compiler/rules/blank.h" #include "compiler/rules/named_symbol.h" #include "compiler/rules/symbol.h" +#include "compiler/rules/built_in_symbols.h" namespace tree_sitter { namespace prepare_grammar { @@ -56,9 +57,18 @@ pair intern_symbols(const Grammar &grammar) { InternedGrammar result; for (auto &external_token_name : grammar.external_tokens) { + Symbol corresponding_internal_token = rules::NONE(); + for (size_t i = 0, n = grammar.rules.size(); i < n; i++) { + if (grammar.rules[i].first == external_token_name) { + corresponding_internal_token = Symbol(i, Symbol::NonTerminal); + break; + } + } + result.external_tokens.push_back(ExternalToken{ external_token_name, - external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed + external_token_name[0] == '_' ? VariableTypeHidden : VariableTypeNamed, + corresponding_internal_token }); } diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 535ddcda..aa3074e8 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -21,6 +21,11 @@ ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence, rules::Associativity associativity) : symbol(symbol), precedence(precedence), associativity(associativity) {} +bool ExternalToken::operator==(const ExternalToken &other) const { + return name == other.name && type == other.type && + corresponding_internal_token == other.corresponding_internal_token; +} + bool ProductionStep::operator==(const ProductionStep &other) const { return symbol == other.symbol && precedence == other.precedence && associativity == other.associativity; diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 3d001b61..e3af8f28 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -13,6 +13,9 @@ namespace tree_sitter { struct ExternalToken { std::string name; VariableType type; + rules::Symbol corresponding_internal_token; + + bool operator==(const ExternalToken &) const; }; struct ProductionStep {