diff --git a/spec/fixtures/external_scanners/line_breaks.c b/spec/fixtures/external_scanners/line_breaks.c
new file mode 100644
index 00000000..eb63a37f
--- /dev/null
+++ b/spec/fixtures/external_scanners/line_breaks.c
@@ -0,0 +1,68 @@
+#include <stdbool.h>
+#include <tree_sitter/parser.h>
+
+// External token indices, in the same order as the grammar's "externals"
+// array (string, line_break). They index the `whitelist` passed to scan.
+enum {
+  STRING,
+  LINE_BREAK
+};
+
+// This scanner keeps no state, so there is no payload to allocate.
+void *ts_language_shared_external_tokens_external_scanner_create() {
+  return NULL;
+}
+
+// Nothing was allocated by create, so there is nothing to release.
+void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) {
+}
+
+// Tries to recognize one external token at the lexer's current position.
+//
+// `whitelist[i]` is true when external token `i` is acceptable here. On
+// success, `lexer->result_symbol` is set and true is returned; otherwise
+// false is returned.
+bool ts_language_shared_external_tokens_external_scanner_scan(
+    void *payload, TSLexer *lexer, const bool *whitelist) {
+
+  // If a line-break is a valid lookahead token, only skip spaces.
+  if (whitelist[LINE_BREAK]) {
+    while (lexer->lookahead == ' ') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\n') {
+      lexer->advance(lexer, false);
+      lexer->result_symbol = LINE_BREAK;
+      return true;
+    }
+  }
+
+  // If a string is a valid lookahead token, skip line breaks as well as
+  // spaces before looking for its opening quote.
+  if (whitelist[STRING]) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\'') {
+      lexer->advance(lexer, false);
+
+      // Consume the string's contents up to the closing quote. A lookahead
+      // of 0 means the input ended first: give up instead of spinning
+      // forever on an unterminated string.
+      while (lexer->lookahead != '\'') {
+        if (lexer->lookahead == 0) {
+          return false;
+        }
+        lexer->advance(lexer, false);
+      }
+
+      lexer->advance(lexer, false);
+      lexer->result_symbol = STRING;
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/percent_strings.c
similarity index 100%
rename from spec/fixtures/external_scanners/external_scan.c
rename to spec/fixtures/external_scanners/percent_strings.c
diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc
index 743ea286..f26bbcc8 100644
--- a/spec/integration/compile_grammar_spec.cc
+++ b/spec/integration/compile_grammar_spec.cc
@@ -1,6 +1,7 @@
 #include "spec_helper.h"
 #include "runtime/alloc.h"
 #include "helpers/load_language.h"
+#include "helpers/stderr_logger.h"
 #include
"compiler/util/string_helpers.h" #include @@ -508,7 +509,7 @@ describe("compile_grammar", []() { }); describe("external scanners", [&]() { - it("can call out to arbitrary scanner functions during parsing", [&]() { + it("can tokenize using arbitrary user-defined scanner functions", [&]() { string grammar = R"JSON({ "name": "external_scanner_example", @@ -573,7 +574,7 @@ describe("compile_grammar", []() { ts_document_set_language(document, load_compile_result( "external_scanner_example", result, - "spec/fixtures/external_scanners/external_scan.c" + "spec/fixtures/external_scanners/percent_strings.c" )); ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); @@ -584,6 +585,71 @@ describe("compile_grammar", []() { ts_document_parse(document); assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); }); + + it("allows external scanners to refer to tokens that are defined internally", [&]() { + string grammar = R"JSON({ + "name": "shared_external_tokens", + + "externals": [ + "string", + "line_break" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"}, + "line_break": {"type": "STRING", "value": "\n"} + } + })JSON"; + + TSCompileResult result = ts_compile_grammar(grammar.c_str()); + AssertThat(result.error_message, IsNull()); + + ts_document_set_language(document, load_compile_result( + "shared_external_tokens", + result, + "spec/fixtures/external_scanners/line_breaks.c" + )); + + 
ts_document_set_input_string(document, "a b\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + ts_document_set_input_string(document, "a \nb\n"); + ts_document_parse(document); + assert_root_node("(statement (variable) (variable) (line_break))"); + + + ts_document_set_input_string(document, "'hello' 'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + + ts_document_set_input_string(document, "'hello' \n'world'\n"); + ts_document_parse(document); + assert_root_node("(statement (string) (string) (line_break))"); + }); }); describe("when the grammar's start symbol is a token", [&]() { diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index e5272595..5df39413 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -77,6 +77,7 @@ class CCodeGenerator { vector> parse_table_entries; vector> external_token_id_sets; size_t next_parse_action_list_index; + map shared_token_indices; public: CCodeGenerator(string name, const ParseTable &parse_table, @@ -93,6 +94,17 @@ class CCodeGenerator { string code() { buffer = ""; + for (size_t i = 0; i < lexical_grammar.variables.size(); i++) { + const Variable &variable = lexical_grammar.variables[i]; + for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[j]; + if (external_token.name == variable.name) { + shared_token_indices.insert({i, j}); + break; + } + } + } + add_includes(); add_warning_pragma(); add_stats(); @@ -128,8 +140,14 @@ class CCodeGenerator { void add_stats() { line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); - line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1 + syntax_grammar.external_tokens.size())); - line("#define 
EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size())); + line("#define TOKEN_COUNT " + to_string( + 1 + + lexical_grammar.variables.size() + + syntax_grammar.external_tokens.size() - shared_token_indices.size() + )); + line("#define EXTERNAL_TOKEN_COUNT " + to_string( + syntax_grammar.external_tokens.size() + )); line(); } @@ -213,7 +231,7 @@ class CCodeGenerator { } void add_lex_modes_list() { - add_external_tokens_id({}); + add_external_scanner_state({}); line("static TSLexMode ts_lex_modes[STATE_COUNT] = {"); indent([&]() { @@ -223,15 +241,25 @@ class CCodeGenerator { line("[" + to_string(state_id++) + "] = {.lex_state = "); add(to_string(state.lex_state_id)); + bool has_external_tokens = false; set external_token_indices; for (const auto &pair : state.terminal_entries) { Symbol symbol = pair.first; - if (symbol.is_external()) + if (symbol.is_external()) { + has_external_tokens = true; external_token_indices.insert(symbol.index); + } else if (symbol.is_token()) { + auto shared_token_entry = shared_token_indices.find(symbol.index); + if (shared_token_entry != shared_token_indices.end()) { + external_token_indices.insert(shared_token_entry->second); + } + } + } + + if (has_external_tokens) { + add(", .external_tokens = " + add_external_scanner_state(external_token_indices)); } - if (!external_token_indices.empty()) - add(", .external_tokens = " + add_external_tokens_id(external_token_indices)); add("},"); } }); @@ -239,7 +267,7 @@ class CCodeGenerator { line(); } - string add_external_tokens_id(set external_token_ids) { + string add_external_scanner_state(set external_token_ids) { for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++) if (external_token_id_sets[i] == external_token_ids) return to_string(i); diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index daad9d2e..06b4d430 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ 
b/src/compiler/prepare_grammar/intern_symbols.cc @@ -19,7 +19,7 @@ using std::pair; using std::make_shared; using rules::Symbol; -class InternSymbols : public rules::IdentityRuleFn { +class SymbolInterner : public rules::IdentityRuleFn { using rules::IdentityRuleFn::apply_to; rule_ptr apply_to(const rules::NamedSymbol *rule) { @@ -42,7 +42,7 @@ class InternSymbols : public rules::IdentityRuleFn { return nullptr; } - explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {} + explicit SymbolInterner(const Grammar &grammar) : grammar(grammar) {} const Grammar grammar; string missing_rule_name; }; @@ -62,7 +62,7 @@ pair intern_symbols(const Grammar &grammar) { }); } - InternSymbols interner(grammar); + SymbolInterner interner(grammar); for (auto &pair : grammar.rules) { auto new_rule = interner.apply(pair.second);