diff --git a/project.gyp b/project.gyp index feeb1fac..77002a65 100644 --- a/project.gyp +++ b/project.gyp @@ -29,6 +29,7 @@ 'src/compiler/prepare_grammar/expand_repeats.cc', 'src/compiler/prepare_grammar/expand_tokens.cc', 'src/compiler/prepare_grammar/extract_choices.cc', + 'src/compiler/prepare_grammar/extract_simple_aliases.cc', 'src/compiler/prepare_grammar/extract_tokens.cc', 'src/compiler/prepare_grammar/flatten_grammar.cc', 'src/compiler/prepare_grammar/intern_symbols.cc', diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 26dae5b7..2531754b 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -45,6 +45,7 @@ struct ParseStateQueueEntry { class ParseTableBuilderImpl : public ParseTableBuilder { const SyntaxGrammar grammar; const LexicalGrammar lexical_grammar; + const std::unordered_map &simple_aliases; unordered_map state_ids_by_item_set; vector item_sets_by_state_id; deque parse_state_queue; @@ -56,9 +57,13 @@ class ParseTableBuilderImpl : public ParseTableBuilder { set> logged_conflict_tokens; public: - ParseTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar) - : grammar(syntax_grammar), + ParseTableBuilderImpl( + const SyntaxGrammar &syntax_grammar, + const LexicalGrammar &lexical_grammar, + const std::unordered_map &simple_aliases + ) : grammar(syntax_grammar), lexical_grammar(lexical_grammar), + simple_aliases(simple_aliases), item_set_builder(syntax_grammar, lexical_grammar) {} BuildResult build() { @@ -403,12 +408,12 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } void eliminate_unit_reductions() { - set aliased_symbols; + set aliased_symbols; for (auto &variable : grammar.variables) { for (auto &production : variable.productions) { for (auto &step : production) { if (!step.alias.value.empty()) { - aliased_symbols.insert(step.symbol.index); + aliased_symbols.insert(step.symbol); } } } @@ -430,7 +435,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { if (action.type == ParseActionTypeReduce && action.consumed_symbol_count == 1 && action.alias_sequence_id == 0 && - !aliased_symbols.count(action.symbol.index) && + !simple_aliases.count(action.symbol) && + !aliased_symbols.count(action.symbol) && grammar.variables[action.symbol.index].type != VariableTypeNamed && (unit_reduction_symbol == -1 || unit_reduction_symbol == action.symbol.index) ) { @@ -887,9 +893,14 @@ class ParseTableBuilderImpl : public ParseTableBuilder { unique_ptr ParseTableBuilder::create( const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar + const LexicalGrammar &lexical_grammar, + const std::unordered_map &simple_aliases ) { - return unique_ptr(new ParseTableBuilderImpl(syntax_grammar, lexical_grammar)); + return unique_ptr(new ParseTableBuilderImpl( + syntax_grammar, + lexical_grammar, + simple_aliases + )); } ParseTableBuilder::BuildResult ParseTableBuilder::build() { diff --git a/src/compiler/build_tables/parse_table_builder.h b/src/compiler/build_tables/parse_table_builder.h index cb642d6c..bfc8641f 100644 --- a/src/compiler/build_tables/parse_table_builder.h +++ b/src/compiler/build_tables/parse_table_builder.h @@ -2,6 +2,7 @@ #define COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_ #include +#include #include "compiler/parse_table.h" #include "compiler/compile_error.h" @@ -16,7 +17,11 @@ namespace build_tables { class ParseTableBuilder { public: - static std::unique_ptr create(const SyntaxGrammar &, const LexicalGrammar &); + static std::unique_ptr create( + const SyntaxGrammar &, + const LexicalGrammar &, + const std::unordered_map & + ); struct BuildResult { ParseTable parse_table; diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc index 49a3468b..83bdbcc2 100644 --- a/src/compiler/compile.cc +++ b/src/compiler/compile.cc @@ -22,19 +22,23 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file) ParseGrammarResult parse_result = parse_grammar(string(input)); if (!parse_result.error_message.empty()) { - return { nullptr, strdup(parse_result.error_message.c_str()), - TSCompileErrorTypeInvalidGrammar }; + return {nullptr, strdup(parse_result.error_message.c_str()), TSCompileErrorTypeInvalidGrammar}; } auto prepare_grammar_result = prepare_grammar::prepare_grammar(parse_result.grammar); - SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result); - LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result); - CompileError error = get<2>(prepare_grammar_result); + SyntaxGrammar &syntax_grammar = prepare_grammar_result.syntax_grammar; + LexicalGrammar &lexical_grammar = prepare_grammar_result.lexical_grammar; + auto &simple_aliases = prepare_grammar_result.simple_aliases; + CompileError error = prepare_grammar_result.error; if (error.type) { return {nullptr, strdup(error.message.c_str()), error.type}; } - auto builder = build_tables::ParseTableBuilder::create(syntax_grammar, lexical_grammar); + auto builder = build_tables::ParseTableBuilder::create( + syntax_grammar, + lexical_grammar, + simple_aliases + ); auto build_tables_result = builder->build(); error = build_tables_result.error; if (error.type != 0) { @@ -48,11 +52,12 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file) move(build_tables_result.keyword_lex_table), build_tables_result.keyword_capture_token, move(syntax_grammar), - move(lexical_grammar) + move(lexical_grammar), + move(simple_aliases) ); set_log_file(nullptr); - return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; + return {strdup(code.c_str()), nullptr, TSCompileErrorTypeNone}; } } // namespace tree_sitter diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b1f13cbb..f0af966f 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -23,6 +23,7 @@ using std::pair; using std::set; using std::string; using std::to_string; +using std::unordered_map; using std::unordered_set; using std::vector; using util::escape_char; @@ -77,6 +78,7 @@ class CCodeGenerator { Symbol keyword_capture_token; const SyntaxGrammar syntax_grammar; const LexicalGrammar lexical_grammar; + unordered_map simple_aliases; map symbol_ids; vector> parse_table_entries; vector> external_scanner_states; @@ -84,18 +86,21 @@ class CCodeGenerator { set unique_aliases; public: - CCodeGenerator(string name, ParseTable &&parse_table, LexTable &&main_lex_table, - LexTable &&keyword_lex_table, Symbol keyword_capture_token, - SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar) - : indent_level(0), - name(name), - parse_table(move(parse_table)), - main_lex_table(move(main_lex_table)), - keyword_lex_table(move(keyword_lex_table)), - keyword_capture_token(keyword_capture_token), - syntax_grammar(move(syntax_grammar)), - lexical_grammar(move(lexical_grammar)), - next_parse_action_list_index(0) {} + CCodeGenerator( + string name, ParseTable &&parse_table, LexTable &&main_lex_table, + LexTable &&keyword_lex_table, Symbol keyword_capture_token, + SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar, + unordered_map &&simple_aliases + ) : indent_level(0), + name(name), + parse_table(move(parse_table)), + main_lex_table(move(main_lex_table)), + keyword_lex_table(move(keyword_lex_table)), + keyword_capture_token(keyword_capture_token), + syntax_grammar(move(syntax_grammar)), + lexical_grammar(move(lexical_grammar)), + simple_aliases(move(simple_aliases)), + next_parse_action_list_index(0) {} string code() { buffer = ""; @@ -757,14 +762,28 @@ class CCodeGenerator { } string symbol_name(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) + if (symbol == rules::END_OF_INPUT()) { return "END"; + } + + auto simple_alias_entry = simple_aliases.find(symbol); + if (simple_alias_entry != simple_aliases.end()) { + return simple_alias_entry->second.value; + } + return entry_for_symbol(symbol).first; } VariableType symbol_type(const Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) + if (symbol == rules::END_OF_INPUT()) { return VariableTypeHidden; + } + + auto simple_alias_entry = simple_aliases.find(symbol); + if (simple_alias_entry != simple_aliases.end()) { + return simple_alias_entry->second.is_named ? VariableTypeNamed : VariableTypeHidden; + } + return entry_for_symbol(symbol).second; } @@ -874,9 +893,12 @@ class CCodeGenerator { } }; -string c_code(string name, ParseTable &&parse_table, LexTable &&lex_table, - LexTable &&keyword_lex_table, Symbol keyword_capture_token, - SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar) { +string c_code( + string name, ParseTable &&parse_table, LexTable &&lex_table, + LexTable &&keyword_lex_table, Symbol keyword_capture_token, + SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar, + unordered_map &&simple_aliases +) { return CCodeGenerator( name, move(parse_table), @@ -884,7 +906,8 @@ string c_code(string name, ParseTable &&parse_table, LexTable &&lex_table, move(keyword_lex_table), keyword_capture_token, move(syntax_grammar), - move(lexical_grammar) + move(lexical_grammar), + move(simple_aliases) ).code(); } diff --git a/src/compiler/generate_code/c_code.h b/src/compiler/generate_code/c_code.h index dc0a8ddf..a7ce3c7f 100644 --- a/src/compiler/generate_code/c_code.h +++ b/src/compiler/generate_code/c_code.h @@ -2,6 +2,7 @@ #define COMPILER_GENERATE_CODE_C_CODE_H_ #include +#include #include "compiler/rule.h" namespace tree_sitter { @@ -20,7 +21,8 @@ std::string c_code( LexTable &&, rules::Symbol, SyntaxGrammar &&, - LexicalGrammar && + LexicalGrammar &&, + std::unordered_map && ); } // namespace generate_code diff --git a/src/compiler/prepare_grammar/extract_simple_aliases.cc b/src/compiler/prepare_grammar/extract_simple_aliases.cc new file mode 100644 index 00000000..208fe6f4 --- /dev/null +++ b/src/compiler/prepare_grammar/extract_simple_aliases.cc @@ -0,0 +1,111 @@ +#include "compiler/prepare_grammar/extract_simple_aliases.h" +#include "compiler/lexical_grammar.h" +#include "compiler/syntax_grammar.h" +#include +#include + +namespace tree_sitter { +namespace prepare_grammar { + +using std::pair; +using std::vector; +using std::unordered_map; +using rules::Alias; +using rules::Symbol; + +template +static void apply_alias(T *variable, Alias alias) { + if (!alias.value.empty()) { + variable->name = alias.value; + variable->type = alias.is_named ? VariableTypeNamed : VariableTypeAnonymous; + } +} + +std::unordered_map +extract_simple_aliases(SyntaxGrammar *syntax_grammar, LexicalGrammar *lexical_grammar) { + struct SymbolStatus { + Alias alias; + bool eligible = true; + }; + + vector terminal_status_list(lexical_grammar->variables.size()); + vector non_terminal_status_list(syntax_grammar->variables.size()); + vector external_status_list(syntax_grammar->external_tokens.size()); + + for (const SyntaxVariable &variable : syntax_grammar->variables) { + for (const Production &production : variable.productions) { + for (const ProductionStep &step : production.steps) { + SymbolStatus *status; + if (step.symbol.is_built_in()) { + continue; + } else if (step.symbol.is_external()) { + status = &external_status_list[step.symbol.index]; + } else if (step.symbol.is_terminal()) { + status = &terminal_status_list[step.symbol.index]; + } else { + status = &non_terminal_status_list[step.symbol.index]; + } + + if (step.alias.value.empty()) { + status->alias = Alias(); + status->eligible = false; + } + + if (status->eligible) { + if (status->alias.value.empty()) { + status->alias = step.alias; + } else if (status->alias != step.alias) { + status->alias = Alias(); + status->eligible = false; + } + } + } + } + } + + for (SyntaxVariable &variable : syntax_grammar->variables) { + for (Production &production : variable.productions) { + for (ProductionStep &step : production.steps) { + SymbolStatus *status; + if (step.symbol.is_built_in()) { + continue; + } else if (step.symbol.is_external()) { + status = &external_status_list[step.symbol.index]; + } else if (step.symbol.is_terminal()) { + status = &terminal_status_list[step.symbol.index]; + } else { + status = &non_terminal_status_list[step.symbol.index]; + } + + if (!status->alias.value.empty()) { + step.alias = Alias(); + } + } + } + } + + unordered_map result; + + for (unsigned i = 0, n = terminal_status_list.size(); i < n; i++) { + if (!terminal_status_list[i].alias.value.empty()) { + result[Symbol::terminal(i)] = terminal_status_list[i].alias; + } + } + + for (unsigned i = 0, n = non_terminal_status_list.size(); i < n; i++) { + if (!non_terminal_status_list[i].alias.value.empty()) { + result[Symbol::non_terminal(i)] = non_terminal_status_list[i].alias; + } + } + + for (unsigned i = 0, n = external_status_list.size(); i < n; i++) { + if (!external_status_list[i].alias.value.empty()) { + result[Symbol::external(i)] = external_status_list[i].alias; + } + } + + return result; +} + +} // namespace prepare_grammar +} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_simple_aliases.h b/src/compiler/prepare_grammar/extract_simple_aliases.h new file mode 100644 index 00000000..9970ad1a --- /dev/null +++ b/src/compiler/prepare_grammar/extract_simple_aliases.h @@ -0,0 +1,21 @@ +#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ +#define COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ + +#include "compiler/rules/symbol.h" +#include "compiler/rules/metadata.h" +#include + +namespace tree_sitter { + +struct SyntaxGrammar; +struct LexicalGrammar; + +namespace prepare_grammar { + +std::unordered_map +extract_simple_aliases(SyntaxGrammar *, LexicalGrammar *); + +} // namespace prepare_grammar +} // namespace tree_sitter + +#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_ diff --git a/src/compiler/prepare_grammar/prepare_grammar.cc b/src/compiler/prepare_grammar/prepare_grammar.cc index ac573a28..aef16846 100644 --- a/src/compiler/prepare_grammar/prepare_grammar.cc +++ b/src/compiler/prepare_grammar/prepare_grammar.cc @@ -2,69 +2,64 @@ #include "compiler/prepare_grammar/expand_repeats.h" #include "compiler/prepare_grammar/expand_tokens.h" #include "compiler/prepare_grammar/extract_tokens.h" +#include "compiler/prepare_grammar/extract_simple_aliases.h" #include "compiler/prepare_grammar/intern_symbols.h" #include "compiler/prepare_grammar/flatten_grammar.h" #include "compiler/prepare_grammar/normalize_rules.h" -#include "compiler/lexical_grammar.h" #include "compiler/prepare_grammar/initial_syntax_grammar.h" +#include "compiler/lexical_grammar.h" #include "compiler/syntax_grammar.h" namespace tree_sitter { namespace prepare_grammar { -using std::tuple; using std::get; -using std::make_tuple; +using std::move; -tuple prepare_grammar( - const InputGrammar &input_grammar) { - /* - * Convert all string-based `NamedSymbols` into numerical `Symbols` - */ +PrepareGrammarResult prepare_grammar(const InputGrammar &input_grammar) { + PrepareGrammarResult result; + + // Convert all string-based `NamedSymbols` into numerical `Symbols` auto intern_result = intern_symbols(input_grammar); CompileError error = intern_result.second; - if (error.type) - return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); + if (error.type) { + result.error = error; + return result; + } - /* - * Separate grammar into lexical and syntactic components - */ + // Separate grammar into lexical and syntactic components auto extract_result = extract_tokens(intern_result.first); error = get<2>(extract_result); if (error.type) { - return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); + result.error = error; + return result; } - /* - * Replace `Repeat` rules with pairs of recursive rules - */ + // Replace `Repeat` rules with pairs of recursive rules InitialSyntaxGrammar syntax_grammar1 = expand_repeats(get<0>(extract_result)); - /* - * Expand `String` and `Pattern` rules into full rule trees - */ - LexicalGrammar lex_grammar = get<1>(extract_result); - // auto expand_tokens_result = expand_tokens(get<1>(extract_result)); - // LexicalGrammar lex_grammar = expand_tokens_result.first; - // error = expand_tokens_result.second; - // if (error.type) - // return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); - - /* - * Flatten syntax rules into lists of productions. - */ + // Flatten syntax rules into lists of productions. auto flatten_result = flatten_grammar(syntax_grammar1); SyntaxGrammar syntax_grammar = flatten_result.first; error = flatten_result.second; - if (error.type) - return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); + if (error.type) { + result.error = error; + return result; + } - /* - * Ensure all lexical rules are in a consistent format. - */ - lex_grammar = normalize_rules(lex_grammar); + // Ensure all lexical rules are in a consistent format. + LexicalGrammar lexical_grammar = normalize_rules(get<1>(extract_result)); - return make_tuple(syntax_grammar, lex_grammar, CompileError::none()); + // Find any symbols that always have the same alias applied to them. + // Remove those aliases since they can be applied in a simpler way. + auto simple_aliases = extract_simple_aliases(&syntax_grammar, &lexical_grammar); + + return { + move(syntax_grammar), + move(lexical_grammar), + move(simple_aliases), + CompileError::none(), + }; } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/prepare_grammar.h b/src/compiler/prepare_grammar/prepare_grammar.h index bed59a53..d71beed1 100644 --- a/src/compiler/prepare_grammar/prepare_grammar.h +++ b/src/compiler/prepare_grammar/prepare_grammar.h @@ -1,7 +1,7 @@ #ifndef COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_ #define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_ -#include +#include #include "compiler/grammar.h" #include "compiler/syntax_grammar.h" #include "compiler/lexical_grammar.h" @@ -10,7 +10,14 @@ namespace tree_sitter { namespace prepare_grammar { -std::tuple prepare_grammar(const InputGrammar &); +struct PrepareGrammarResult { + SyntaxGrammar syntax_grammar; + LexicalGrammar lexical_grammar; + std::unordered_map simple_aliases; + CompileError error; +}; + +PrepareGrammarResult prepare_grammar(const InputGrammar &); } // namespace prepare_grammar } // namespace tree_sitter diff --git a/src/compiler/rule.h b/src/compiler/rule.h index 5c4064e5..b66e2c63 100644 --- a/src/compiler/rule.h +++ b/src/compiler/rule.h @@ -91,7 +91,7 @@ struct Rule { } template - inline auto match(FunctionTypes && ...functions) const -> decltype(accept(util::make_visitor(std::forward(functions)...))){ + inline auto match(FunctionTypes && ...functions) const -> decltype(accept(util::make_visitor(std::forward(functions)...))) { return accept(util::make_visitor(std::forward(functions)...)); } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 0906f754..efeab009 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -36,6 +36,7 @@ #define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) static const unsigned MAX_VERSION_COUNT = 6; +static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4; static const unsigned MAX_SUMMARY_DEPTH = 16; static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; @@ -633,7 +634,7 @@ static StackVersion ts_parser__reduce(TSParser *self, StackVersion version, TSSy // Error recovery can sometimes cause lots of stack versions to merge, // such that a single pop operation can produce a lots of slices. // Avoid creating too many stack versions in that situation. - if (i > 0 && slice_version > MAX_VERSION_COUNT) { + if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; diff --git a/test/fixtures/test_grammars/aliased_unit_reductions/corpus.txt b/test/fixtures/test_grammars/aliased_unit_reductions/corpus.txt index 80217b76..d9be3f85 100644 --- a/test/fixtures/test_grammars/aliased_unit_reductions/corpus.txt +++ b/test/fixtures/test_grammars/aliased_unit_reductions/corpus.txt @@ -2,11 +2,12 @@ Aliases on rules that are unit reductions ========================================== -one two three; +one two three four; --- (statement (identifier) (b_prime (identifier)) - (c_prime (identifier))) + (c_prime (identifier)) + (identifier)) diff --git a/test/fixtures/test_grammars/aliased_unit_reductions/grammar.json b/test/fixtures/test_grammars/aliased_unit_reductions/grammar.json index d2e4153f..34080b7e 100644 --- a/test/fixtures/test_grammars/aliased_unit_reductions/grammar.json +++ b/test/fixtures/test_grammars/aliased_unit_reductions/grammar.json @@ -10,6 +10,9 @@ "type": "SEQ", "members": [ {"type": "SYMBOL", "name": "_a"}, + + // The `_b` rule is always aliased to `b_prime`, so it is internally treated + // as a simple alias. { "type": "ALIAS", "named": true, @@ -19,6 +22,9 @@ "name": "_b" } }, + + // The `_c` rule is used without an alias in addition to being aliased to `c_prime`, + // so it is not a simple alias. { "type": "ALIAS", "named": true, @@ -28,6 +34,11 @@ "name": "_c" } }, + { + "type": "SYMBOL", + "name": "_c" + }, + { "type": "STRING", "value": ";" @@ -57,7 +68,7 @@ "_c": { "type": "SYMBOL", - "name": "_B" + "name": "_C" }, "_C": {