From ed8fbff175188f429e4d3ff919d9c2e72e84190e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Mar 2017 16:31:29 -0700 Subject: [PATCH 1/4] Allow anonymous tokens to be used in grammars' external token lists --- .../build_tables/build_parse_table.cc | 62 +++++++++---------- src/compiler/generate_code/c_code.cc | 2 +- src/compiler/grammar.h | 18 ++---- src/compiler/parse_grammar.cc | 23 ++++--- .../prepare_grammar/expand_repeats.cc | 2 +- .../prepare_grammar/extract_tokens.cc | 53 ++++++++++++---- .../prepare_grammar/flatten_grammar.cc | 2 +- .../prepare_grammar/flatten_grammar.h | 2 +- .../prepare_grammar/initial_syntax_grammar.h | 11 +--- .../prepare_grammar/intern_symbols.cc | 49 +++++++++------ .../prepare_grammar/interned_grammar.h | 12 +--- src/compiler/syntax_grammar.h | 12 ++++ .../prepare_grammar/expand_repeats_test.cc | 5 -- .../prepare_grammar/extract_tokens_test.cc | 60 +++++++++--------- .../prepare_grammar/intern_symbols_test.cc | 18 +++--- .../corpus.txt | 41 ++++++++++++ .../grammar.json | 35 +++++++++++ .../readme.md | 1 + .../scanner.c | 23 +++++++ .../external_and_internal_tokens/grammar.json | 4 +- .../external_extra_tokens/grammar.json | 2 +- .../external_tokens/grammar.json | 6 +- test/helpers/stream_methods.cc | 14 +---- test/helpers/stream_methods.h | 8 +-- 24 files changed, 282 insertions(+), 183 deletions(-) create mode 100644 test/fixtures/test_grammars/external_and_internal_anonymous_tokens/corpus.txt create mode 100644 test/fixtures/test_grammars/external_and_internal_anonymous_tokens/grammar.json create mode 100644 test/fixtures/test_grammars/external_and_internal_anonymous_tokens/readme.md create mode 100644 test/fixtures/test_grammars/external_and_internal_anonymous_tokens/scanner.c diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 6ab35f76..50c84af7 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -38,7 +38,7 @@ class ParseTableBuilder { set conflicts; ParseItemSetBuilder item_set_builder; set fragile_productions; - vector> incompatible_token_indices_by_index; + vector> incompatible_tokens_by_index; bool allow_any_conflict; public: @@ -109,10 +109,13 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (Symbol::Index i = 0; i < lexical_grammar.variables.size(); i++) { + for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { + Symbol token = Symbol::terminal(i); bool has_non_reciprocal_conflict = false; - for (Symbol::Index incompatible_index : incompatible_token_indices_by_index[i]) { - if (!incompatible_token_indices_by_index[incompatible_index].count(i)) { + + for (Symbol incompatible_token : incompatible_tokens_by_index[i]) { + if (incompatible_token.is_terminal() && + !incompatible_tokens_by_index[incompatible_token.index].count(token)) { has_non_reciprocal_conflict = true; break; } @@ -302,28 +305,25 @@ class ParseTableBuilder { } void compute_unmergable_token_pairs() { - incompatible_token_indices_by_index.resize(lexical_grammar.variables.size()); + incompatible_tokens_by_index.resize(lexical_grammar.variables.size()); - // First, assume that all tokens are mutually incompatible. - for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - auto &incompatible_indices = incompatible_token_indices_by_index[i]; - for (Symbol::Index j = 0; j < n; j++) { - if (j != i) incompatible_indices.insert(j); - } - } - - // For the remaining possibly-incompatible pairs of tokens, check if they - // are actually incompatible by actually generating lexical states that - // contain them both. auto lex_table_builder = LexTableBuilder::create(lexical_grammar); - for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - auto &incompatible_indices = incompatible_token_indices_by_index[i]; - auto iter = incompatible_indices.begin(); - while (iter != incompatible_indices.end()) { - if (lex_table_builder->detect_conflict(i, *iter)) { - ++iter; - } else { - iter = incompatible_indices.erase(iter); + for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + Symbol token = Symbol::terminal(i); + auto &incompatible_indices = incompatible_tokens_by_index[i]; + + for (unsigned j = 0; j < n; j++) { + if (i == j) continue; + if (lex_table_builder->detect_conflict(i, j)) { + incompatible_indices.insert(Symbol::terminal(j)); + } + } + + for (const ExternalToken &external_token : grammar.external_tokens) { + if (external_token.corresponding_internal_token == token) { + for (unsigned j = 0; j < grammar.external_tokens.size(); j++) { + incompatible_indices.insert(Symbol::external(j)); + } } } } @@ -419,15 +419,14 @@ class ParseTableBuilder { for (auto &entry : state.terminal_entries) { Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index]; + auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index]; const auto &other_entry = other.terminal_entries.find(lookahead); if (other_entry == other.terminal_entries.end()) { if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { - for (Symbol::Index incompatible_index : incompatible_token_indices) { - Symbol incompatible_symbol = Symbol::terminal(incompatible_index); - if (other.terminal_entries.count(incompatible_symbol)) return false; + for (const Symbol &incompatible_token : incompatible_tokens) { + if (other.terminal_entries.count(incompatible_token)) return false; } } if (actions.back().type != ParseActionTypeReduce) @@ -444,14 +443,13 @@ class ParseTableBuilder { for (auto &entry : other.terminal_entries) { Symbol lookahead = entry.first; const vector &actions = entry.second.actions; - auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index]; + auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index]; if (!state.terminal_entries.count(lookahead)) { if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { - for (Symbol::Index incompatible_index : incompatible_token_indices) { - Symbol incompatible_symbol = Symbol::terminal(incompatible_index); - if (state.terminal_entries.count(incompatible_symbol)) return false; + for (const Symbol &incompatible_token : incompatible_tokens) { + if (state.terminal_entries.count(incompatible_token)) return false; } } if (actions.back().type != ParseActionTypeReduce) diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b51db626..76d82ce2 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -526,7 +526,7 @@ class CCodeGenerator { // Helper functions string external_token_id(Symbol::Index index) { - return "ts_external_token_" + syntax_grammar.external_tokens[index].name; + return "ts_external_token_" + sanitize_name(syntax_grammar.external_tokens[index].name); } string symbol_id(const Symbol &symbol) { diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h index f24c0030..6d16524b 100644 --- a/src/compiler/grammar.h +++ b/src/compiler/grammar.h @@ -16,29 +16,21 @@ enum VariableType { VariableTypeNamed, }; -struct ExternalToken { +struct Variable { std::string name; VariableType type; - rules::Symbol corresponding_internal_token; + rules::Rule rule; - inline bool operator==(const ExternalToken &other) const { - return name == other.name && - type == other.type && - corresponding_internal_token == other.corresponding_internal_token; + inline bool operator==(const Variable &other) const { + return name == other.name && rule == other.rule && type == other.type; } }; struct InputGrammar { - struct Variable { - std::string name; - VariableType type; - rules::Rule rule; - }; - std::vector variables; std::vector extra_tokens; std::vector> expected_conflicts; - std::vector external_tokens; + std::vector external_tokens; }; } // namespace tree_sitter diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index 536672f4..aa1e2fb8 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -228,7 +228,7 @@ ParseGrammarResult parse_grammar(const string &input) { error_message = result.error_message; goto error; } - grammar.variables.push_back(InputGrammar::Variable{ + grammar.variables.push_back(Variable{ string(entry_json.name), VariableTypeNamed, result.rule @@ -293,18 +293,21 @@ ParseGrammarResult parse_grammar(const string &input) { } for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) { - json_value *token_name_json = external_tokens_json.u.array.values[i]; - if (token_name_json->type != json_string) { - error_message = "External token values must be strings"; + json_value *external_token_json = external_tokens_json.u.array.values[i]; + auto result = parse_rule(external_token_json); + if (!result.error_message.empty()) { + error_message = "Invalid external token: " + result.error_message; goto error; } - string token_name = token_name_json->u.string.ptr; - grammar.external_tokens.push_back({ - token_name, - VariableTypeNamed, - rules::NONE() - }); + grammar.external_tokens.push_back(result.rule.match( + [](rules::NamedSymbol named_symbol) { + return Variable{named_symbol.value, VariableTypeNamed, named_symbol}; + }, + [](auto rule) { + return Variable{"", VariableTypeAnonymous, rule}; + } + )); } } diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index ec2ec19a..39b2075d 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -85,7 +85,7 @@ class ExpandRepeats { return apply(rule); } - vector aux_rules; + vector aux_rules; }; InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 39f21698..6893cde4 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -156,7 +156,7 @@ class TokenExtractor { } vector token_usage_counts; - vector tokens; + vector tokens; }; tuple extract_tokens( @@ -167,8 +167,8 @@ tuple extract_tokens( SymbolReplacer symbol_replacer; TokenExtractor extractor; - // First, extract all of the grammar's tokens into the lexical grammar. - vector processed_variables; + // Extract all of the grammar's tokens into the lexical grammar. + vector processed_variables; for (const auto &variable : grammar.variables) { processed_variables.push_back({ variable.name, @@ -177,6 +177,15 @@ tuple extract_tokens( }); } + vector processed_external_tokens; + for (const auto &external_token : grammar.external_tokens) { + processed_external_tokens.push_back({ + external_token.name, + external_token.type, + extractor.apply(external_token.rule) + }); + } + for (const auto &extracted_token : extractor.tokens) { auto expansion = expand_token(extracted_token.rule); if (expansion.error) return make_tuple( @@ -269,12 +278,22 @@ tuple extract_tokens( if (error) return make_tuple(syntax_grammar, lexical_grammar, error); } - for (const ExternalToken &external_token : grammar.external_tokens) { - Symbol internal_token = symbol_replacer.replace_symbol( - external_token.corresponding_internal_token - ); + for (const auto &external_token : processed_external_tokens) { + Rule new_rule = symbol_replacer.apply(external_token.rule); - if (internal_token.is_non_terminal()) { + if (!new_rule.is()) { + return make_tuple( + syntax_grammar, + lexical_grammar, + CompileError( + TSCompileErrorTypeInvalidExternalToken, + "Non-symbol rule expressions can't be used as external tokens" + ) + ); + } + + Symbol symbol = new_rule.get_unchecked(); + if (symbol.is_non_terminal()) { return make_tuple( syntax_grammar, lexical_grammar, @@ -285,11 +304,19 @@ tuple extract_tokens( ); } - syntax_grammar.external_tokens.push_back(ExternalToken{ - external_token.name, - external_token.type, - internal_token - }); + if (symbol.is_external()) { + syntax_grammar.external_tokens.push_back(ExternalToken{ + external_token.name, + external_token.type, + rules::NONE() + }); + } else { + syntax_grammar.external_tokens.push_back(ExternalToken{ + lexical_grammar.variables[symbol.index].name, + external_token.type, + symbol + }); + } } return make_tuple(syntax_grammar, lexical_grammar, CompileError::none()); diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index 846c361d..71b19f21 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -89,7 +89,7 @@ class FlattenRule { } }; -SyntaxVariable flatten_rule(const InitialSyntaxGrammar::Variable &variable) { +SyntaxVariable flatten_rule(const Variable &variable) { vector productions; for (const Rule &rule_component : extract_choices(variable.rule)) { diff --git a/src/compiler/prepare_grammar/flatten_grammar.h b/src/compiler/prepare_grammar/flatten_grammar.h index 4efd9561..73873d61 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.h +++ b/src/compiler/prepare_grammar/flatten_grammar.h @@ -11,7 +11,7 @@ namespace tree_sitter { namespace prepare_grammar { -SyntaxVariable flatten_rule(const InitialSyntaxGrammar::Variable &variable); +SyntaxVariable flatten_rule(const Variable &variable); std::pair flatten_grammar(const InitialSyntaxGrammar &); } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h index bc200483..55eb2b7e 100644 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -5,22 +5,13 @@ #include #include "tree_sitter/compiler.h" #include "compiler/grammar.h" +#include "compiler/syntax_grammar.h" #include "compiler/rule.h" namespace tree_sitter { namespace prepare_grammar { struct InitialSyntaxGrammar { - struct Variable { - std::string name; - VariableType type; - rules::Rule rule; - - inline bool operator==(const Variable &other) const { - return name == other.name && type == other.type && rule == other.rule; - } - }; - std::vector variables; std::set extra_tokens; std::set> expected_conflicts; diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index d705f121..deaeb122 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -21,14 +21,21 @@ class SymbolInterner { public: Rule apply(const Rule &rule) { return rule.match( - [&](const rules::Blank &blank) -> Rule { return blank; }, + [&](const rules::Blank &blank) -> Rule { + return blank; + }, [&](const rules::NamedSymbol &symbol) { return intern_symbol(symbol); }, - [&](const rules::String &string) { return string; }, - [&](const rules::Pattern &pattern) { return pattern; }, + [&](const rules::String &string) { + return string; + }, + + [&](const rules::Pattern &pattern) { + return pattern; + }, [&](const rules::Choice &choice) { vector elements; @@ -58,12 +65,18 @@ class SymbolInterner { } Symbol intern_symbol(rules::NamedSymbol named_symbol) { - for (size_t i = 0; i < grammar.variables.size(); i++) - if (grammar.variables[i].name == named_symbol.value) + for (size_t i = 0; i < grammar.variables.size(); i++) { + if (grammar.variables[i].name == named_symbol.value) { return Symbol::non_terminal(i); - for (size_t i = 0; i < grammar.external_tokens.size(); i++) - if (grammar.external_tokens[i].name == named_symbol.value) + } + } + + for (size_t i = 0; i < grammar.external_tokens.size(); i++) { + if (grammar.external_tokens[i].name == named_symbol.value) { return Symbol::external(i); + } + } + missing_rule_name = named_symbol.value; return rules::NONE(); } @@ -81,23 +94,21 @@ CompileError missing_rule_error(string rule_name) { pair intern_symbols(const InputGrammar &grammar) { InternedGrammar result; + SymbolInterner interner(grammar); + for (auto &external_token : grammar.external_tokens) { - Symbol corresponding_internal_token = rules::NONE(); - for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { - if (grammar.variables[i].name == external_token.name) { - corresponding_internal_token = Symbol::non_terminal(i); - break; - } + auto new_rule = interner.apply(external_token.rule); + if (!interner.missing_rule_name.empty()) { + return { result, missing_rule_error(interner.missing_rule_name) }; } - result.external_tokens.push_back(ExternalToken{ + result.external_tokens.push_back(Variable{ external_token.name, - external_token.name[0] == '_' ? VariableTypeHidden : VariableTypeNamed, - corresponding_internal_token + external_token.name[0] == '_' ? VariableTypeHidden : external_token.type, + new_rule }); } - SymbolInterner interner(grammar); for (auto &variable : grammar.variables) { auto new_rule = interner.apply(variable.rule); @@ -105,7 +116,7 @@ pair intern_symbols(const InputGrammar &grammar) return { result, missing_rule_error(interner.missing_rule_name) }; } - result.variables.push_back(InternedGrammar::Variable{ + result.variables.push_back(Variable{ variable.name, variable.name[0] == '_' ? VariableTypeHidden : VariableTypeNamed, new_rule @@ -131,7 +142,7 @@ pair intern_symbols(const InputGrammar &grammar) result.expected_conflicts.insert(entry); } - return { result, CompileError::none() }; + return {result, CompileError::none()}; } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h index f7abdd4f..99987f42 100644 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ b/src/compiler/prepare_grammar/interned_grammar.h @@ -11,20 +11,10 @@ namespace tree_sitter { namespace prepare_grammar { struct InternedGrammar { - struct Variable { - std::string name; - VariableType type; - rules::Rule rule; - - bool operator==(const Variable &other) const { - return name == other.name && type == other.type && rule == other.rule; - } - }; - std::vector variables; std::vector extra_tokens; std::set> expected_conflicts; - std::vector external_tokens; + std::vector external_tokens; }; } // namespace prepare_grammar diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 4099de18..4c177240 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -30,6 +30,18 @@ struct SyntaxVariable { using ConflictSet = std::set; +struct ExternalToken { + std::string name; + VariableType type; + rules::Symbol corresponding_internal_token; + + inline bool operator==(const ExternalToken &other) const { + return name == other.name && + type == other.type && + corresponding_internal_token == other.corresponding_internal_token; + } +}; + struct SyntaxGrammar { std::vector variables; std::set extra_tokens; diff --git a/test/compiler/prepare_grammar/expand_repeats_test.cc b/test/compiler/prepare_grammar/expand_repeats_test.cc index c025a898..244e5c5f 100644 --- a/test/compiler/prepare_grammar/expand_repeats_test.cc +++ b/test/compiler/prepare_grammar/expand_repeats_test.cc @@ -6,11 +6,6 @@ using namespace rules; using prepare_grammar::InitialSyntaxGrammar; using prepare_grammar::expand_repeats; -using Variable = InitialSyntaxGrammar::Variable; - -bool operator==(const Variable &left, const Variable &right) { - return left.name == right.name && left.rule == right.rule && left.type == right.type; -} START_TEST diff --git a/test/compiler/prepare_grammar/extract_tokens_test.cc b/test/compiler/prepare_grammar/extract_tokens_test.cc index dd9156a2..e720ec3c 100644 --- a/test/compiler/prepare_grammar/extract_tokens_test.cc +++ b/test/compiler/prepare_grammar/extract_tokens_test.cc @@ -11,14 +11,12 @@ using namespace rules; using prepare_grammar::extract_tokens; using prepare_grammar::InternedGrammar; using prepare_grammar::InitialSyntaxGrammar; -using InternedVariable = InternedGrammar::Variable; -using InitialSyntaxVariable = InitialSyntaxGrammar::Variable; describe("extract_tokens", []() { it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{ + Variable{ "rule_A", VariableTypeNamed, Repeat{Rule::seq({ @@ -34,17 +32,17 @@ describe("extract_tokens", []() { }), })} }, - InternedVariable{ + Variable{ "rule_B", VariableTypeNamed, Pattern{"h+"} }, - InternedVariable{ + Variable{ "rule_C", VariableTypeNamed, Rule::choice({ String{"i"}, Blank{} }) }, - InternedVariable{ + Variable{ "rule_D", VariableTypeNamed, Repeat{Symbol::non_terminal(3)} @@ -61,8 +59,8 @@ describe("extract_tokens", []() { AssertThat(error, Equals(CompileError::none())); - AssertThat(syntax_grammar.variables, Equals(vector{ - InitialSyntaxVariable{ + AssertThat(syntax_grammar.variables, Equals(vector{ + Variable{ "rule_A", VariableTypeNamed, Repeat{Rule::seq({ @@ -88,13 +86,13 @@ describe("extract_tokens", []() { })} }, - InitialSyntaxVariable{ + Variable{ "rule_C", VariableTypeNamed, Rule::choice({Symbol::terminal(4), Blank{}}) }, - InitialSyntaxVariable{ + Variable{ "rule_D", VariableTypeNamed, Repeat{Symbol::non_terminal(2)} @@ -168,8 +166,8 @@ describe("extract_tokens", []() { InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); - AssertThat(syntax_grammar.variables, Equals(vector { - InitialSyntaxVariable{ + AssertThat(syntax_grammar.variables, Equals(vector { + Variable{ "rule_A", VariableTypeNamed, Rule::seq({ @@ -192,17 +190,17 @@ describe("extract_tokens", []() { it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() { auto result = extract_tokens(InternedGrammar{{ - InternedVariable{ + Variable{ "rule_A", VariableTypeNamed, Rule::seq({ Symbol::non_terminal(1), String{"ab"} }) }, - InternedVariable{ + Variable{ "rule_B", VariableTypeNamed, String{"cd"} }, - InternedVariable{ + Variable{ "rule_C", VariableTypeNamed, Rule::seq({ String{"ef"}, String{"cd"} }) @@ -212,18 +210,18 @@ describe("extract_tokens", []() { InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); - AssertThat(syntax_grammar.variables, Equals(vector({ - InitialSyntaxVariable{ + AssertThat(syntax_grammar.variables, Equals(vector({ + Variable{ "rule_A", VariableTypeNamed, Rule::seq({ Symbol::non_terminal(1), Symbol::terminal(0) }) }, - InitialSyntaxVariable{ + Variable{ "rule_B", VariableTypeNamed, Symbol::terminal(1) }, - InitialSyntaxVariable{ + Variable{ "rule_C", VariableTypeNamed, Rule::seq({ Symbol::terminal(2), Symbol::terminal(1) }) @@ -255,17 +253,17 @@ describe("extract_tokens", []() { it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{ + Variable{ "rule_A", VariableTypeNamed, String{"ok"} }, - InternedVariable{ + Variable{ "rule_B", VariableTypeNamed, Repeat{Symbol::non_terminal(0)} }, - InternedVariable{ + Variable{ "rule_C", VariableTypeNamed, Repeat{Seq{Symbol::non_terminal(0), Symbol::non_terminal(0)}} @@ -292,7 +290,7 @@ describe("extract_tokens", []() { it("adds inline extra tokens to the lexical grammar's separators", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{"rule_A", VariableTypeNamed, String{"x"}}, + Variable{"rule_A", VariableTypeNamed, String{"x"}}, }, { String{"y"}, @@ -314,8 +312,8 @@ describe("extract_tokens", []() { it("handles inline extra tokens that match tokens in the grammar", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{"rule_A", VariableTypeNamed, String{"x"}}, - InternedVariable{"rule_B", VariableTypeNamed, String{"y"}}, + Variable{"rule_A", VariableTypeNamed, String{"x"}}, + Variable{"rule_B", VariableTypeNamed, String{"y"}}, }, { String{"y"}, @@ -332,17 +330,17 @@ describe("extract_tokens", []() { it("updates extra symbols according to the new symbol numbers", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{ + Variable{ "rule_A", VariableTypeNamed, Rule::seq({ String{"w"}, String{"x"}, Symbol::non_terminal(1) }) }, - InternedVariable{ + Variable{ "rule_B", VariableTypeNamed, String{"y"} }, - InternedVariable{ + Variable{ "rule_C", VariableTypeNamed, String{"z"} @@ -367,12 +365,12 @@ describe("extract_tokens", []() { it("returns an error if any extra tokens are non-token symbols", [&]() { auto result = extract_tokens(InternedGrammar{ { - InternedVariable{ + Variable{ "rule_A", VariableTypeNamed, Rule::seq({ String{"x"}, Symbol::non_terminal(1) }) }, - InternedVariable{ + Variable{ "rule_B", VariableTypeNamed, Rule::seq({ String{"y"}, String{"z"} }) @@ -428,7 +426,7 @@ describe("extract_tokens", []() { {}, {}, { - ExternalToken {"rule_A", VariableTypeNamed, Symbol::non_terminal(0)} + Variable{"rule_A", VariableTypeNamed, Symbol::non_terminal(0)} } }); diff --git a/test/compiler/prepare_grammar/intern_symbols_test.cc b/test/compiler/prepare_grammar/intern_symbols_test.cc index dc488951..a0097544 100644 --- a/test/compiler/prepare_grammar/intern_symbols_test.cc +++ b/test/compiler/prepare_grammar/intern_symbols_test.cc @@ -22,7 +22,7 @@ describe("intern_symbols", []() { auto result = intern_symbols(grammar); AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.variables, Equals(vector{ + AssertThat(result.first.variables, Equals(vector{ {"x", VariableTypeNamed, Rule::choice({ Symbol::non_terminal(1), Symbol::non_terminal(2) })}, {"y", VariableTypeNamed, Symbol::non_terminal(2)}, {"_z", VariableTypeHidden, String{"stuff"}}, @@ -74,28 +74,28 @@ describe("intern_symbols", []() { {}, {}, { - ExternalToken{ + Variable{ "w", VariableTypeNamed, - NONE() + NamedSymbol{"w"} }, - ExternalToken{ + Variable{ "z", VariableTypeNamed, - NONE() + NamedSymbol{"z"} }, } }; auto result = intern_symbols(grammar); - AssertThat(result.first.external_tokens, Equals(vector{ - ExternalToken{ + AssertThat(result.first.external_tokens, Equals(vector{ + Variable{ "w", VariableTypeNamed, - rules::NONE() + Symbol::external(0) }, - ExternalToken{ + Variable{ "z", VariableTypeNamed, Symbol::non_terminal(2) diff --git a/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/corpus.txt b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/corpus.txt new file mode 100644 index 00000000..52c5a6d6 --- /dev/null +++ b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/corpus.txt @@ -0,0 +1,41 @@ +========================================= +single-line statements - internal tokens +========================================= + +a b + +--- + +(statement (variable) (variable)) + +========================================= +multi-line statements - internal tokens +========================================= + +a +b + +--- + +(statement (variable) (variable)) + +========================================= +single-line statements - external tokens +========================================= + +'hello' 'world' + +--- + +(statement (string) (string)) + +========================================= +multi-line statements - external tokens +========================================= + +'hello' +'world' + +--- + +(statement (string) (string)) diff --git a/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/grammar.json b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/grammar.json new file mode 100644 index 00000000..e62c5b7c --- /dev/null +++ b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/grammar.json @@ -0,0 +1,35 @@ +{ + "name": "external_and_internal_anonymous_tokens", + + "externals": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "STRING", "value": "\n"} + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "STRING", "value": "\n"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"} + } +} \ No newline at end of file diff --git a/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/readme.md b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/readme.md new file mode 100644 index 00000000..0b302a82 --- /dev/null +++ b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/readme.md @@ -0,0 +1 @@ +This grammar is just like the `external_and_internal_tokens` grammar, except that the shared external token is *anonymous*; it's specified as a string in the grammar. \ No newline at end of file diff --git a/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/scanner.c b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/scanner.c new file mode 100644 index 00000000..c940be1a --- /dev/null +++ b/test/fixtures/test_grammars/external_and_internal_anonymous_tokens/scanner.c @@ -0,0 +1,23 @@ +#include "../external_and_internal_tokens/scanner.c" + +void *tree_sitter_external_and_internal_anonymous_tokens_external_scanner_create() { return NULL; } + +void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_destroy(void *payload) {} + +void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_reset(void *payload) {} + +bool tree_sitter_external_and_internal_anonymous_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } + +void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_and_internal_anonymous_tokens_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *whitelist +) { + return tree_sitter_external_and_internal_tokens_external_scanner_scan( + payload, + lexer, + whitelist + ); +} \ No newline at end of file diff --git a/test/fixtures/test_grammars/external_and_internal_tokens/grammar.json b/test/fixtures/test_grammars/external_and_internal_tokens/grammar.json index f24e1c1c..63ce1c33 100644 --- a/test/fixtures/test_grammars/external_and_internal_tokens/grammar.json +++ b/test/fixtures/test_grammars/external_and_internal_tokens/grammar.json @@ -2,8 +2,8 @@ "name": "external_and_internal_tokens", "externals": [ - "string", - "line_break" + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "line_break"} ], "extras": [ diff --git a/test/fixtures/test_grammars/external_extra_tokens/grammar.json b/test/fixtures/test_grammars/external_extra_tokens/grammar.json index ed13b34a..a836b289 100644 --- a/test/fixtures/test_grammars/external_extra_tokens/grammar.json +++ b/test/fixtures/test_grammars/external_extra_tokens/grammar.json @@ -2,7 +2,7 @@ "name": "external_extra_tokens", "externals": [ - "comment" + {"type": "SYMBOL", "name": "comment"} ], "extras": [ diff --git a/test/fixtures/test_grammars/external_tokens/grammar.json b/test/fixtures/test_grammars/external_tokens/grammar.json index 8a175404..1d64915b 100644 --- a/test/fixtures/test_grammars/external_tokens/grammar.json +++ b/test/fixtures/test_grammars/external_tokens/grammar.json @@ -2,9 +2,9 @@ "name": "external_tokens", "externals": [ - "_percent_string", - "_percent_string_start", - "_percent_string_end" + {"type": "SYMBOL", "name": "_percent_string"}, + {"type": "SYMBOL", "name": "_percent_string_start"}, + {"type": "SYMBOL", "name": "_percent_string_end"} ], "extras": [ diff --git a/test/helpers/stream_methods.cc b/test/helpers/stream_methods.cc index 56c85890..91f9e87f 100644 --- a/test/helpers/stream_methods.cc +++ b/test/helpers/stream_methods.cc @@ -132,7 +132,7 @@ ostream &operator<<(ostream &stream, const Rule &rule) { } // namespace rules -ostream &operator<<(ostream &stream, const InputGrammar::Variable &variable) { +ostream &operator<<(ostream &stream, const Variable &variable) { return stream << "(Variable " << variable.name << " " << variable.rule << ")"; } @@ -165,18 +165,6 @@ ostream &operator<<(ostream &stream, const PrecedenceRange &range) { } } -namespace prepare_grammar { - -ostream &operator<<(ostream &stream, const prepare_grammar::InternedGrammar::Variable &variable) { - return stream << "(Variable " << variable.name << " " << variable.rule << ")"; -} - -ostream &operator<<(ostream &stream, const prepare_grammar::InitialSyntaxGrammar::Variable &variable) { - return stream << "(Variable " << variable.name << " " << variable.rule << ")"; -} - -} // namespace prepare_grammar - namespace build_tables { ostream &operator<<(ostream &stream, const LexItem &item) { diff --git a/test/helpers/stream_methods.h b/test/helpers/stream_methods.h index 58b7fd17..49853c19 100644 --- a/test/helpers/stream_methods.h +++ b/test/helpers/stream_methods.h @@ -111,6 +111,7 @@ ostream &operator<<(ostream &, const CompileError &); ostream &operator<<(ostream &, const ExternalToken &); ostream &operator<<(ostream &, const ProductionStep &); ostream &operator<<(ostream &, const PrecedenceRange &); +ostream &operator<<(ostream &, const Variable &); ostream &operator<<(ostream &, const LexicalVariable &); namespace rules { @@ -130,13 +131,6 @@ ostream &operator<<(ostream &stream, const Rule &rule); } // namespace rules -namespace prepare_grammar { - -ostream &operator<<(ostream &, const InitialSyntaxGrammar::Variable &); -ostream &operator<<(ostream &, const InternedGrammar::Variable &); - -} // namespace prepare_grammar - namespace build_tables { class LexItem; From 17876b6826a0cef199c400032d6c7efbf3cda16b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Mar 2017 16:41:30 -0700 Subject: [PATCH 2/4] Update grammar JSON schema --- doc/grammar-schema.json | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/grammar-schema.json b/doc/grammar-schema.json index f37cd983..916d5eff 100644 --- a/doc/grammar-schema.json +++ b/doc/grammar-schema.json @@ -31,6 +31,13 @@ } }, + "externals": { + "type": "array", + "items": { + "$ref": "#/definitions/rule" + } + }, + "conflicts": { "type": "array", "items": { @@ -40,14 +47,6 @@ "pattern": "^[a-zA-Z_]\\w*$" } } - }, - - "externals": { - "type": "array", - "items": { - "type": "string", - "pattern": "^[a-zA-Z_]\\w*$" - } } }, From 6d8f9ebabafb0a0451d3905d5a16e8a3f0632bc1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Mar 2017 17:04:02 -0700 Subject: [PATCH 3/4] In tests, regenerate parser if its timestamp matches grammar.json After running the fetch-fixtures script, their timestamps may be equal, but in this situation we *do* want to re-generate the parsers. --- test/helpers/load_language.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/helpers/load_language.cc b/test/helpers/load_language.cc index cb67243a..d5e70db0 100644 --- a/test/helpers/load_language.cc +++ b/test/helpers/load_language.cc @@ -164,7 +164,7 @@ const TSLanguage *load_real_language(const string &language_name) { int parser_mtime = get_modified_time(parser_filename); - if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) { + if (parser_mtime <= grammar_mtime || parser_mtime <= libcompiler_mtime) { printf("\n" "Regenerating the %s parser...\n", language_name.c_str()); string grammar_json = read_file(grammar_filename); From 24878277e979e2f2d9ee7c54f894783ece8e7678 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Mar 2017 17:05:02 -0700 Subject: [PATCH 4/4] Use new version of python grammar in tests --- script/fetch-fixtures | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/fetch-fixtures b/script/fetch-fixtures index 996fe891..a829962f 100755 --- a/script/fetch-fixtures +++ b/script/fetch-fixtures @@ -25,4 +25,4 @@ fetch_grammar 'javascript' '76cd7dd5eb793db21640c725e58301bde83781f7' fetch_grammar 'json' 'origin/master' fetch_grammar 'c' 'origin/master' fetch_grammar 'cpp' 'origin/master' -fetch_grammar 'python' 'origin/master' +fetch_grammar 'python' '179cb35e5b35baeef4a37f00732ff2de15e2e8bd'