From cb784975a443ef4f22e01d1f8cdd499bcf962251 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 1 Aug 2018 14:00:57 -0700 Subject: [PATCH 1/3] Add IMMEDIATE_TOKEN rule type, for enforcing no preceding extras --- .../build_tables/lex_table_builder.cc | 31 ++++-- src/compiler/generate_code/c_code.cc | 101 +++++++++++------- src/compiler/grammar-schema.json | 2 +- src/compiler/lex_table.cc | 20 ++-- src/compiler/lex_table.h | 4 +- src/compiler/parse_grammar.cc | 9 ++ .../prepare_grammar/extract_tokens.cc | 2 + src/compiler/rule.cc | 3 + src/compiler/rules/metadata.cc | 7 ++ src/compiler/rules/metadata.h | 1 + .../test_grammars/immediate_tokens/corpus.txt | 29 +++++ .../immediate_tokens/grammar.json | 61 +++++++++++ .../test_grammars/immediate_tokens/readme.md | 1 + 13 files changed, 212 insertions(+), 59 deletions(-) create mode 100644 test/fixtures/test_grammars/immediate_tokens/corpus.txt create mode 100644 test/fixtures/test_grammars/immediate_tokens/grammar.json create mode 100644 test/fixtures/test_grammars/immediate_tokens/readme.md diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 0b309c7d..e577d690 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -379,9 +379,14 @@ class LexTableBuilderImpl : public LexTableBuilder { for (const LexItem &item : item_set.entries) { LexItem::CompletionStatus completion_status = item.completion_status(); if (completion_status.is_done) { - AcceptTokenAction action(item.lhs, completion_status.precedence.max, - item.lhs.is_built_in() || - grammar.variables[item.lhs.index].is_string); + AcceptTokenAction action(item.lhs, completion_status.precedence.max); + + if (!item.lhs.is_built_in()) { + const LexicalVariable &variable = grammar.variables[item.lhs.index]; + if (variable.is_string) action.implicit_precedence += 2; + if (is_immediate_token(variable.rule)) action.implicit_precedence += 1; + } + 
AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action; if (existing_action.is_present()) { if (should_replace_accept_action(existing_action, action)) { @@ -458,8 +463,8 @@ class LexTableBuilderImpl : public LexTableBuilder { void remove_duplicate_lex_states(LexTable &lex_table) { for (LexState &state : lex_table.states) { - state.accept_action.is_string = false; state.accept_action.precedence = 0; + state.accept_action.implicit_precedence = 0; } map replacements; @@ -523,12 +528,24 @@ class LexTableBuilderImpl : public LexTableBuilder { } } + bool is_immediate_token(const Rule &rule) const { + return rule.match( + [](const Metadata &metadata) { + return metadata.params.is_main_token; + }, + + [](auto rule) { + return false; + } + ); + } + LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) { LexItemSet result; terminals.for_each([&](Symbol symbol) { if (symbol.is_terminal()) { for (auto &&rule : rules_for_symbol(symbol)) { - if (with_separators) { + if (with_separators && !is_immediate_token(rule)) { for (const auto &separator_rule : separator_rules) { result.entries.insert(LexItem( symbol, @@ -598,8 +615,8 @@ class LexTableBuilderImpl : public LexTableBuilder { const AcceptTokenAction &new_action) { if (new_action.precedence > old_action.precedence) return true; if (new_action.precedence < old_action.precedence) return false; - if (new_action.is_string && !old_action.is_string) return true; - if (old_action.is_string && !new_action.is_string) return false; + if (new_action.implicit_precedence > old_action.implicit_precedence) return true; + if (new_action.implicit_precedence < old_action.implicit_precedence) return false; return new_action.symbol.index < old_action.symbol.index; } diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 806adf66..1038701c 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -23,6 +23,7 @@ 
using std::pair; using std::set; using std::string; using std::to_string; +using std::unordered_set; using std::vector; using util::escape_char; using rules::Symbol; @@ -76,7 +77,7 @@ class CCodeGenerator { Symbol keyword_capture_token; const SyntaxGrammar syntax_grammar; const LexicalGrammar lexical_grammar; - map sanitized_names; + map symbol_ids; vector> parse_table_entries; vector> external_scanner_states; size_t next_parse_action_list_index; @@ -165,6 +166,24 @@ class CCodeGenerator { } } + unordered_set symbol_id_values; + symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end"; + + for (const Symbol &symbol : parse_table.symbols) { + if (!symbol.is_built_in()) { + assign_symbol_id(symbol, &symbol_id_values); + } + } + + for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) { + const ExternalToken &external_token = syntax_grammar.external_tokens[i]; + if (external_token.corresponding_internal_token == rules::NONE()) { + assign_symbol_id(Symbol::external(i), &symbol_id_values); + } else { + symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token]; + } + } + line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION)); line("#define STATE_COUNT " + to_string(parse_table.states.size())); line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size())); @@ -175,6 +194,33 @@ class CCodeGenerator { line(); } + void assign_symbol_id(const Symbol &symbol, unordered_set *symbol_id_values) { + auto entry = entry_for_symbol(symbol); + + string symbol_id; + switch (entry.second) { + case VariableTypeAuxiliary: + symbol_id = "aux_sym_" + sanitize_name(entry.first); + break; + case VariableTypeAnonymous: + symbol_id = "anon_sym_" + sanitize_name(entry.first); + break; + default: + symbol_id = "sym_" + sanitize_name(entry.first); + break; + } + + unsigned suffix_number = 1; + string unique_symbol_id = symbol_id; + while (symbol_id_values->count(unique_symbol_id)) { + suffix_number++; + unique_symbol_id = 
symbol_id + to_string(suffix_number); + } + + symbol_id_values->insert(unique_symbol_id); + symbol_ids[symbol] = unique_symbol_id; + } + void add_symbol_enum() { line("enum {"); indent([&]() { @@ -696,20 +742,7 @@ class CCodeGenerator { } string symbol_id(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) - return "ts_builtin_sym_end"; - - auto entry = entry_for_symbol(symbol); - string name = sanitize_name(entry.first); - - switch (entry.second) { - case VariableTypeAuxiliary: - return "aux_sym_" + name; - case VariableTypeAnonymous: - return "anon_sym_" + name; - default: - return "sym_" + name; - } + return symbol_ids[symbol]; } string alias_id(const Alias &alias) { @@ -776,47 +809,35 @@ class CCodeGenerator { return name; } - string sanitize_name(string name) { - auto existing = sanitized_names.find(name); - if (existing != sanitized_names.end()) - return existing->second; - - string stripped_name; + string sanitize_name(const string &name) { + string result; for (char c : name) { if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_')) { - stripped_name += c; + result += c; } else { auto replacement = REPLACEMENTS.find(c); - size_t i = stripped_name.size(); + size_t i = result.size(); if (replacement != REPLACEMENTS.end()) { - if (i > 0 && stripped_name[i - 1] != '_') - stripped_name += "_"; - stripped_name += replacement->second; + if (i > 0 && result[i - 1] != '_') + result += "_"; + result += replacement->second; } } } - - for (size_t extra_number = 0;; extra_number++) { - string suffix = extra_number ? to_string(extra_number) : ""; - string unique_name = stripped_name + suffix; - if (unique_name == "") - continue; - if (!has_sanitized_name(unique_name)) { - sanitized_names.insert({ name, unique_name }); - return unique_name; - } - } + return result; } string _boolean(bool value) { return value ? 
"true" : "false"; } - bool has_sanitized_name(string name) { - for (const auto &pair : sanitized_names) - if (pair.second == name) + bool has_sanitized_name(const Symbol &symbol, string name) { + for (const auto &pair : symbol_ids) { + if (pair.second == name) { return true; + } + } return false; } diff --git a/src/compiler/grammar-schema.json b/src/compiler/grammar-schema.json index 24e47abb..55388364 100644 --- a/src/compiler/grammar-schema.json +++ b/src/compiler/grammar-schema.json @@ -201,7 +201,7 @@ "properties": { "type": { "type": "string", - "pattern": "^TOKEN$" + "pattern": "^(TOKEN|IMMEDIATE_TOKEN)$" }, "content": { "$ref": "#/definitions/rule" diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc index daf4517a..e13d6fcb 100644 --- a/src/compiler/lex_table.cc +++ b/src/compiler/lex_table.cc @@ -16,9 +16,9 @@ AdvanceAction::AdvanceAction() : state_index(-1) {} AdvanceAction::AdvanceAction(size_t state_index, PrecedenceRange precedence_range, bool in_main_token) - : state_index(state_index), - precedence_range(precedence_range), - in_main_token(in_main_token) {} + : state_index(state_index), + precedence_range(precedence_range), + in_main_token(in_main_token) {} bool AdvanceAction::operator==(const AdvanceAction &other) const { return (state_index == other.state_index) && @@ -26,19 +26,21 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const { } AcceptTokenAction::AcceptTokenAction() - : symbol(rules::NONE()), precedence(0), is_string(false) {} + : symbol(rules::NONE()), precedence(0), implicit_precedence(0) {} -AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence, - bool is_string) - : symbol(symbol), precedence(precedence), is_string(is_string) {} +AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence) + : symbol(symbol), precedence(precedence), implicit_precedence(0) {} bool AcceptTokenAction::is_present() const { return symbol != rules::NONE(); } bool AcceptTokenAction::operator==(const 
AcceptTokenAction &other) const { - return (symbol == other.symbol) && (precedence == other.precedence) && - (is_string == other.is_string); + return ( + symbol == other.symbol && + precedence == other.precedence && + implicit_precedence == other.implicit_precedence + ); } bool LexState::operator==(const LexState &other) const { diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index 6de0792d..9419e8e2 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -25,14 +25,14 @@ struct AdvanceAction { struct AcceptTokenAction { AcceptTokenAction(); - AcceptTokenAction(rules::Symbol, int, bool); + AcceptTokenAction(rules::Symbol, int); bool is_present() const; bool operator==(const AcceptTokenAction &other) const; inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); } rules::Symbol symbol; int precedence; - bool is_string; + int implicit_precedence; }; struct LexState { diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index 63ddb40b..7b69ed61 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -116,6 +116,15 @@ ParseRuleResult parse_rule(json_value *rule_json) { return Rule(Metadata::token(move(result.rule))); } + if (type == "IMMEDIATE_TOKEN") { + json_value content_json = rule_json->operator[]("content"); + auto result = parse_rule(&content_json); + if (!result.error_message.empty()) { + return "Invalid token content: " + result.error_message; + } + return Rule(Metadata::immediate_token(move(result.rule))); + } + if (type == "PATTERN") { json_value value_json = rule_json->operator[]("value"); if (value_json.type == json_string) { diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index b5110693..bf01e722 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -118,6 +118,8 @@ class TokenExtractor { metadata.params.is_token = false; 
if (metadata.params == rules::MetadataParams{}) { return extract_token(*metadata.rule, VariableTypeAuxiliary); + } else if (metadata.rule->is()) { + return extract_token(metadata, VariableTypeAnonymous); } else { return extract_token(metadata, VariableTypeAuxiliary); } diff --git a/src/compiler/rule.cc b/src/compiler/rule.cc index 29ee1793..e7277459 100644 --- a/src/compiler/rule.cc +++ b/src/compiler/rule.cc @@ -135,6 +135,9 @@ bool Rule::is() const { return type == BlankType; } template <> bool Rule::is() const { return type == SymbolType; } +template <> +bool Rule::is() const { return type == StringType; } + template <> bool Rule::is() const { return type == RepeatType; } diff --git a/src/compiler/rules/metadata.cc b/src/compiler/rules/metadata.cc index 40dcb21e..c54d29cd 100644 --- a/src/compiler/rules/metadata.cc +++ b/src/compiler/rules/metadata.cc @@ -75,6 +75,13 @@ Metadata Metadata::token(Rule &&rule) { }); } +Metadata Metadata::immediate_token(Rule &&rule) { + return add_metadata(move(rule), [](MetadataParams &params) { + params.is_token = true; + params.is_main_token = true; + }); +} + Metadata Metadata::active_prec(int precedence, Rule &&rule) { return add_metadata(move(rule), [&](MetadataParams &params) { params.has_precedence = true; diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h index 73a4a66d..3c023b3e 100644 --- a/src/compiler/rules/metadata.h +++ b/src/compiler/rules/metadata.h @@ -64,6 +64,7 @@ struct Metadata { static Metadata merge(Rule &&rule, MetadataParams params); static Metadata token(Rule &&rule); + static Metadata immediate_token(Rule &&rule); static Metadata active_prec(int precedence, Rule &&rule); static Metadata prec(int precedence, Rule &&rule); static Metadata prec_left(int precedence, Rule &&rule); diff --git a/test/fixtures/test_grammars/immediate_tokens/corpus.txt b/test/fixtures/test_grammars/immediate_tokens/corpus.txt new file mode 100644 index 00000000..d5d2e0f8 --- /dev/null +++ 
b/test/fixtures/test_grammars/immediate_tokens/corpus.txt @@ -0,0 +1,29 @@ +=============================== +prefix expressions as arguments +=============================== + +a ::b ::c + +--- + +(program + (call + (call + (identifier) + (prefix (identifier))) + (prefix (identifier)))) + +=============================== +infix expressions +=============================== + +a::b::c + +--- + +(program + (infix + (infix + (identifier) + (identifier)) + (identifier))) diff --git a/test/fixtures/test_grammars/immediate_tokens/grammar.json b/test/fixtures/test_grammars/immediate_tokens/grammar.json new file mode 100644 index 00000000..1506e3a7 --- /dev/null +++ b/test/fixtures/test_grammars/immediate_tokens/grammar.json @@ -0,0 +1,61 @@ +{ + "name": "immediate_tokens", + + "extras": [ + { + "type": "PATTERN", + "value": "\\s" + } + ], + + "rules": { + "program": {"type": "SYMBOL", "name": "_expression"}, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "call"}, + {"type": "SYMBOL", "name": "infix"}, + {"type": "SYMBOL", "name": "prefix"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "call": { + "type": "PREC_LEFT", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"} + ] + } + }, + + "prefix": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "::"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "infix": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + { + "type": "IMMEDIATE_TOKEN", + "content": {"type": "STRING", "value": "::"} + }, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-z]+" + } + } +} diff --git a/test/fixtures/test_grammars/immediate_tokens/readme.md b/test/fixtures/test_grammars/immediate_tokens/readme.md new file mode 100644 index 00000000..39599fcb --- /dev/null +++ 
b/test/fixtures/test_grammars/immediate_tokens/readme.md @@ -0,0 +1 @@ +This grammar demonstrates the usage of the IMMEDIATE_TOKEN rule. It allows the parser to produce a different token based on whether or not there are `extras` preceding the token's main content. When there are *no* leading `extras`, an immediate token is preferred over a normal token which would otherwise match. From 68618f61a623769a040b441da33ea8d43568ee8e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 1 Aug 2018 14:23:52 -0700 Subject: [PATCH 2/3] Test against immediate token branches of grammar repos --- script/fetch-fixtures | 4 ++-- script/fetch-fixtures.cmd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/script/fetch-fixtures b/script/fetch-fixtures index 4923e62c..1d53a3b7 100755 --- a/script/fetch-fixtures +++ b/script/fetch-fixtures @@ -21,9 +21,9 @@ fetch_grammar() { ) } -fetch_grammar javascript master +fetch_grammar javascript immediate-tokens fetch_grammar json master -fetch_grammar c master +fetch_grammar c immediate-tokens fetch_grammar cpp master fetch_grammar python master fetch_grammar go master diff --git a/script/fetch-fixtures.cmd b/script/fetch-fixtures.cmd index 17ff224d..fffb668e 100644 --- a/script/fetch-fixtures.cmd +++ b/script/fetch-fixtures.cmd @@ -1,8 +1,8 @@ @echo off -call:fetch_grammar javascript master +call:fetch_grammar javascript immediate-tokens call:fetch_grammar json master -call:fetch_grammar c master +call:fetch_grammar c immediate-tokens call:fetch_grammar cpp master call:fetch_grammar python master call:fetch_grammar go master From 41fe564a901a986a88c5bc647e22806fc7e76b65 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 1 Aug 2018 15:09:45 -0700 Subject: [PATCH 3/3] Update error recovery fixture --- test/fixtures/error_corpus/c_errors.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt index 
b2931b7d..ee63debf 100644 --- a/test/fixtures/error_corpus/c_errors.txt +++ b/test/fixtures/error_corpus/c_errors.txt @@ -69,7 +69,7 @@ int main() { b(); c(); - if () d(); + if (*) d(); } } @@ -81,14 +81,14 @@ int main() { (function_declarator (identifier) (parameter_list)) (compound_statement (if_statement - (field_expression + (parenthesized_expression (field_expression (identifier) - (MISSING)) + (MISSING))) (compound_statement (expression_statement (call_expression (identifier) (argument_list))) (expression_statement (call_expression (identifier) (argument_list))) (if_statement - (MISSING) + (parenthesized_expression (pointer_expression (MISSING))) (expression_statement (call_expression (identifier) (argument_list))))))))) ====================================