From 10b51a05a1dae660888e7bee7e7ad26ecff4fbe9 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Thu, 8 Dec 2016 22:35:16 -0800
Subject: [PATCH] Allow external scanners to refer to (and return)
 internally-defined tokens

Tokens that are defined in the grammar's rules may now also be included in
the externals list, so that external scanners can check whether they are
valid lookaheads and, if so, return them to the parser.
---
 spec/fixtures/external_scanners/line_breaks.c | 53 ++++++++++++++
 .../{external_scan.c => percent_strings.c}    |  0
 spec/integration/compile_grammar_spec.cc      | 70 ++++++++++++++++++-
 src/compiler/generate_code/c_code.cc          | 42 +++++++++--
 .../prepare_grammar/intern_symbols.cc         |  6 +-
 5 files changed, 159 insertions(+), 12 deletions(-)
 create mode 100644 spec/fixtures/external_scanners/line_breaks.c
 rename spec/fixtures/external_scanners/{external_scan.c => percent_strings.c} (100%)

diff --git a/spec/fixtures/external_scanners/line_breaks.c b/spec/fixtures/external_scanners/line_breaks.c
new file mode 100644
index 00000000..eb63a37f
--- /dev/null
+++ b/spec/fixtures/external_scanners/line_breaks.c
@@ -0,0 +1,53 @@
+#include <tree_sitter/parser.h>
+#include <stdbool.h>
+
+enum {
+  STRING,
+  LINE_BREAK
+};
+
+void *ts_language_shared_external_tokens_external_scanner_create() {
+  return NULL;
+}
+
+void ts_language_shared_external_tokens_external_scanner_destroy(void *payload) {
+}
+
+bool ts_language_shared_external_tokens_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *whitelist) {
+
+  // If a line-break is a valid lookahead token, only skip spaces.
+  if (whitelist[LINE_BREAK]) {
+    while (lexer->lookahead == ' ') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\n') {
+      lexer->advance(lexer, false);
+      lexer->result_symbol = LINE_BREAK;
+      return true;
+    }
+  }
+
+  // If a line-break is not a valid lookahead token, skip line breaks as well
+  // as spaces.
+  if (whitelist[STRING]) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\n') {
+      lexer->advance(lexer, true);
+    }
+
+    if (lexer->lookahead == '\'') {
+      lexer->advance(lexer, false);
+
+      while (lexer->lookahead != '\'') {
+        lexer->advance(lexer, false);
+      }
+
+      lexer->advance(lexer, false);
+      lexer->result_symbol = STRING;
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/spec/fixtures/external_scanners/external_scan.c b/spec/fixtures/external_scanners/percent_strings.c
similarity index 100%
rename from spec/fixtures/external_scanners/external_scan.c
rename to spec/fixtures/external_scanners/percent_strings.c
diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc
index 743ea286..f26bbcc8 100644
--- a/spec/integration/compile_grammar_spec.cc
+++ b/spec/integration/compile_grammar_spec.cc
@@ -1,6 +1,7 @@
 #include "spec_helper.h"
 #include "runtime/alloc.h"
 #include "helpers/load_language.h"
+#include "helpers/stderr_logger.h"
 #include "compiler/util/string_helpers.h"
 #include

@@ -508,7 +509,7 @@ describe("compile_grammar", []() {
   });

   describe("external scanners", [&]() {
-    it("can call out to arbitrary scanner functions during parsing", [&]() {
+    it("can tokenize using arbitrary user-defined scanner functions", [&]() {
       string grammar = R"JSON({
         "name": "external_scanner_example",

@@ -573,7 +574,7 @@ describe("compile_grammar", []() {
       ts_document_set_language(document, load_compile_result(
         "external_scanner_example",
         result,
-        "spec/fixtures/external_scanners/external_scan.c"
+        "spec/fixtures/external_scanners/percent_strings.c"
       ));

       ts_document_set_input_string(document, "x + %(sup (external) scanner?)");
@@ -584,6 +585,71 @@ describe("compile_grammar", []() {
       ts_document_parse(document);
       assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))");
     });
+
+    it("allows external scanners to refer to tokens that are defined internally", [&]() {
+      string grammar = R"JSON({
+        "name": "shared_external_tokens",
+
+        "externals": [
+          "string",
+          "line_break"
+        ],
+
+        "extras": [
+          {"type": "PATTERN", "value": "\\s"}
+        ],
+
+        "rules": {
+          "statement": {
+            "type": "SEQ",
+            "members": [
+              {"type": "SYMBOL", "name": "_expression"},
+              {"type": "SYMBOL", "name": "_expression"},
+              {"type": "SYMBOL", "name": "line_break"}
+            ]
+          },
+
+          "_expression": {
+            "type": "CHOICE",
+            "members": [
+              {"type": "SYMBOL", "name": "string"},
+              {"type": "SYMBOL", "name": "variable"},
+              {"type": "SYMBOL", "name": "number"}
+            ]
+          },
+
+          "variable": {"type": "PATTERN", "value": "\\a+"},
+          "number": {"type": "PATTERN", "value": "\\d+"},
+          "line_break": {"type": "STRING", "value": "\n"}
+        }
+      })JSON";
+
+      TSCompileResult result = ts_compile_grammar(grammar.c_str());
+      AssertThat(result.error_message, IsNull());
+
+      ts_document_set_language(document, load_compile_result(
+        "shared_external_tokens",
+        result,
+        "spec/fixtures/external_scanners/line_breaks.c"
+      ));
+
+      ts_document_set_input_string(document, "a b\n");
+      ts_document_parse(document);
+      assert_root_node("(statement (variable) (variable) (line_break))");
+
+      ts_document_set_input_string(document, "a \nb\n");
+      ts_document_parse(document);
+      assert_root_node("(statement (variable) (variable) (line_break))");
+
+
+      ts_document_set_input_string(document, "'hello' 'world'\n");
+      ts_document_parse(document);
+      assert_root_node("(statement (string) (string) (line_break))");
+
+      ts_document_set_input_string(document, "'hello' \n'world'\n");
+      ts_document_parse(document);
+      assert_root_node("(statement (string) (string) (line_break))");
+    });
   });

   describe("when the grammar's start symbol is a token", [&]() {
diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc
index e5272595..5df39413 100644
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -77,6 +77,7 @@ class CCodeGenerator {
   vector<pair<size_t, ParseTableEntry>> parse_table_entries;
   vector<set<size_t>> external_token_id_sets;
   size_t next_parse_action_list_index;
+  map<size_t, size_t> shared_token_indices;

 public:
   CCodeGenerator(string name, const ParseTable &parse_table,
@@ -93,6 +94,17 @@ class CCodeGenerator {
   string code() {
     buffer = "";

+    for (size_t i = 0; i < lexical_grammar.variables.size(); i++) {
+      const Variable &variable = lexical_grammar.variables[i];
+      for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) {
+        const ExternalToken &external_token = syntax_grammar.external_tokens[j];
+        if (external_token.name == variable.name) {
+          shared_token_indices.insert({i, j});
+          break;
+        }
+      }
+    }
+
     add_includes();
     add_warning_pragma();
     add_stats();
@@ -128,8 +140,14 @@ class CCodeGenerator {
   void add_stats() {
     line("#define STATE_COUNT " + to_string(parse_table.states.size()));
     line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
-    line("#define TOKEN_COUNT " + to_string(lexical_grammar.variables.size() + 1 + syntax_grammar.external_tokens.size()));
-    line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
+    line("#define TOKEN_COUNT " + to_string(
+      1 +
+      lexical_grammar.variables.size() +
+      syntax_grammar.external_tokens.size() - shared_token_indices.size()
+    ));
+    line("#define EXTERNAL_TOKEN_COUNT " + to_string(
+      syntax_grammar.external_tokens.size()
+    ));
     line();
   }

@@ -213,7 +231,7 @@ class CCodeGenerator {
   }

   void add_lex_modes_list() {
-    add_external_tokens_id({});
+    add_external_scanner_state({});

     line("static TSLexMode ts_lex_modes[STATE_COUNT] = {");
     indent([&]() {
@@ -223,15 +241,25 @@ class CCodeGenerator {
         line("[" + to_string(state_id++) + "] = {.lex_state = ");
         add(to_string(state.lex_state_id));

+        bool has_external_tokens = false;
         set<size_t> external_token_indices;
         for (const auto &pair : state.terminal_entries) {
           Symbol symbol = pair.first;
-          if (symbol.is_external())
+          if (symbol.is_external()) {
+            has_external_tokens = true;
             external_token_indices.insert(symbol.index);
+          } else if (symbol.is_token()) {
+            auto shared_token_entry = shared_token_indices.find(symbol.index);
+            if (shared_token_entry != shared_token_indices.end()) {
+              external_token_indices.insert(shared_token_entry->second);
+            }
+          }
+        }
+
+        if (has_external_tokens) {
+          add(", .external_tokens = " + add_external_scanner_state(external_token_indices));
         }
-        if (!external_token_indices.empty())
-          add(", .external_tokens = " + add_external_tokens_id(external_token_indices));
         add("},");
       }
     });
@@ -239,7 +267,7 @@ class CCodeGenerator {
     line();
   }

-  string add_external_tokens_id(set<size_t> external_token_ids) {
+  string add_external_scanner_state(set<size_t> external_token_ids) {
     for (size_t i = 0, n = external_token_id_sets.size(); i < n; i++)
       if (external_token_id_sets[i] == external_token_ids)
        return to_string(i);
diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc
index daad9d2e..06b4d430 100644
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@@ -19,7 +19,7 @@ using std::pair;
 using std::make_shared;
 using rules::Symbol;

-class InternSymbols : public rules::IdentityRuleFn {
+class SymbolInterner : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;

   rule_ptr apply_to(const rules::NamedSymbol *rule) {
@@ -42,7 +42,7 @@ class InternSymbols : public rules::IdentityRuleFn {
     return nullptr;
   }

-  explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
+  explicit SymbolInterner(const Grammar &grammar) : grammar(grammar) {}
   const Grammar grammar;
   string missing_rule_name;
 };
@@ -62,7 +62,7 @@ pair intern_symbols(const Grammar &grammar) {
     });
   }

-  InternSymbols interner(grammar);
+  SymbolInterner interner(grammar);

   for (auto &pair : grammar.rules) {
     auto new_rule = interner.apply(pair.second);
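
The net effect of the c_code.cc changes is easiest to see in the shape of the
generated ts_lex_modes table. Below is a minimal, self-contained C sketch of
that shape, not real generated output: the LexModeSketch struct, the state
numbers, and the external-scanner-state indices are illustrative stand-ins,
while the .lex_state / .external_tokens fields and the overall layout follow
what add_lex_modes_list() emits after this patch.

/* Sketch of the table shape produced by add_lex_modes_list() (values are
 * hypothetical; real parsers use TSLexMode from tree_sitter/parser.h). */
#include <stdio.h>

typedef struct {
  unsigned lex_state;
  unsigned external_tokens;  /* index into the external scanner states */
} LexModeSketch;

enum { STATE_COUNT_SKETCH = 3 };

static const LexModeSketch lex_modes[STATE_COUNT_SKETCH] = {
  /* A state whose valid lookaheads are all internal, non-shared tokens:
   * no external-tokens entry, so the external scanner is never consulted. */
  [0] = {.lex_state = 1},

  /* A state that expects a purely external token (e.g. `string`): it already
   * carried an external scanner state before this patch. */
  [1] = {.lex_state = 2, .external_tokens = 1},

  /* A state that expects `line_break`, a token defined in the grammar's rules.
   * Because `line_break` is also named in `externals`, it now appears in
   * shared_token_indices, so this state is given an external scanner state
   * and the scanner runs with whitelist[LINE_BREAK] == true. */
  [2] = {.lex_state = 3, .external_tokens = 2},
};

int main(void) {
  for (int i = 0; i < STATE_COUNT_SKETCH; i++) {
    printf("state %d: lex_state=%u, external_tokens=%u\n",
           i, lex_modes[i].lex_state, lex_modes[i].external_tokens);
  }
  return 0;
}

This is the property the new spec exercises: a parse state that can accept the
internally-defined line_break token now carries an external-tokens entry, which
is what lets line_breaks.c decide whether to skip a newline or return it as a
token.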