diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index 0ef56e79..2d4257b6 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -156,14 +156,14 @@ struct hash { if (item.is_done()) { if (!item.production->empty()) { hash_combine(&result, item.production->back().precedence); - hash_combine(&result, item.production->back().associativity); + hash_combine(&result, item.production->back().associativity); } } else { for (size_t i = 0, n = item.production->size(); i < n; i++) { auto &step = item.production->at(i); hash_combine(&result, step.symbol); hash_combine(&result, step.precedence); - hash_combine(&result, step.associativity); + hash_combine(&result, step.associativity); } } return result; diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 236be6f0..3b119157 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -1,4 +1,5 @@ #include "compiler/build_tables/parse_item_set_builder.h" +#include #include #include #include @@ -11,8 +12,10 @@ namespace tree_sitter { namespace build_tables { +using std::move; using std::vector; using std::set; +using std::find; using std::get; using std::pair; using std::tuple; @@ -21,8 +24,36 @@ using std::make_tuple; using rules::Symbol; using rules::NONE; +static vector inline_production(const ParseItem &item, const SyntaxGrammar &grammar) { + vector result; + for (const Production &production_to_insert : grammar.variables[item.next_symbol().index].productions) { + auto begin = item.production->steps.begin(); + auto end = item.production->steps.end(); + auto step = begin + item.step_index; + + Production production{{begin, step}, item.production->dynamic_precedence}; + production.steps.insert( + production.steps.end(), + production_to_insert.steps.begin(), + production_to_insert.steps.end() + ); + production.back().precedence = item.precedence(); + production.back().associativity = item.associativity(); + production.steps.insert( + production.steps.end(), + step + 1, + end + ); + + if (find(result.begin(), result.end(), production) == result.end()) { + result.push_back(move(production)); + } + } + return result; +} + ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, - const LexicalGrammar &lexical_grammar) { + const LexicalGrammar &lexical_grammar) : grammar{grammar} { vector symbols_to_process; set processed_non_terminals; @@ -145,24 +176,56 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, for (auto &pair : cached_lookaheads_by_non_terminal) { for (const Production &production : grammar.variables[pair.first].productions) { - component_cache[i].push_back({ - ParseItem(Symbol::non_terminal(pair.first), production, 0), - pair.second.first, - pair.second.second - }); + Symbol lhs = Symbol::non_terminal(pair.first); + ParseItem item(lhs, production, 0); + + if (grammar.variables_to_inline.count(item.next_symbol())) { + vector &inlined_productions = inlined_productions_by_original_production[item]; + if (inlined_productions.empty()) { + inlined_productions = inline_production(item, grammar); + } + + for (const Production &inlined_production : inlined_productions) { + ParseItemSetComponent component{ + ParseItem(lhs, inlined_production, 0), + pair.second.first, + pair.second.second + }; + + if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) { + component_cache[i].push_back(component); + } + } + } else if (!grammar.variables_to_inline.count(lhs)) { + ParseItemSetComponent component{ + ParseItem(lhs, production, 0), + pair.second.first, + pair.second.second + }; + + if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) { + component_cache[i].push_back(component); + } + } } } } } void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { - for (const auto &pair : item_set->entries) { - const ParseItem &item = pair.first; - const LookaheadSet &lookaheads = pair.second; - if (item.lhs() != rules::START() && item.step_index == 0) continue; + for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) { + const ParseItem &item = iter->first; + const LookaheadSet &lookaheads = iter->second; + if (item.lhs() != rules::START() && item.step_index == 0) { + ++iter; + continue; + } const Symbol &next_symbol = item.next_symbol(); - if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue; + if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) { + ++iter; + continue; + } LookaheadSet next_lookaheads; size_t next_step = item.step_index + 1; @@ -178,6 +241,24 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { current_lookaheads.insert_all(component.lookaheads); if (component.propagates_lookaheads) current_lookaheads.insert_all(next_lookaheads); } + + if (grammar.variables_to_inline.count(next_symbol)) { + vector &inlined_productions = inlined_productions_by_original_production[item]; + if (inlined_productions.empty()) { + inlined_productions = inline_production(item, grammar); + } + + for (const Production &inlined_production : inlined_productions) { + item_set->entries.insert({ + ParseItem(item.lhs(), inlined_production, item.step_index), + lookaheads + }); + } + + iter = item_set->entries.erase(iter); + } else { + ++iter; + } } } diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index 5357a385..c180f38b 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -4,6 +4,7 @@ #include "compiler/build_tables/parse_item.h" #include "compiler/rule.h" #include +#include namespace tree_sitter { @@ -17,11 +18,19 @@ class ParseItemSetBuilder { ParseItem item; LookaheadSet lookaheads; bool propagates_lookaheads; + + inline bool operator==(const ParseItemSetComponent &other) { + return item == other.item && + lookaheads == other.lookaheads && + propagates_lookaheads == other.propagates_lookaheads; + } }; + const SyntaxGrammar &grammar; std::map first_sets; std::map last_sets; std::map> component_cache; + std::map> inlined_productions_by_original_production; public: ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &); diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h index 6d16524b..54fe69e9 100644 --- a/src/compiler/grammar.h +++ b/src/compiler/grammar.h @@ -31,6 +31,7 @@ struct InputGrammar { std::vector extra_tokens; std::vector> expected_conflicts; std::vector external_tokens; + std::unordered_set variables_to_inline; }; } // namespace tree_sitter diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index 7589904c..43ceed51 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -205,7 +205,7 @@ ParseGrammarResult parse_grammar(const string &input) { string error_message; string name; InputGrammar grammar; - json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json; + json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json, inline_rules_json; json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 }; char parse_error[json_error_max]; @@ -299,6 +299,24 @@ ParseGrammarResult parse_grammar(const string &input) { } } + inline_rules_json = grammar_json->operator[]("inline"); + if (inline_rules_json.type != json_none) { + if (inline_rules_json.type != json_array) { + error_message = "Inline rules must be an array"; + goto error; + } + + for (size_t i = 0, length = inline_rules_json.u.array.length; i < length; i++) { + json_value *inline_rule_json = inline_rules_json.u.array.values[i]; + if (inline_rule_json->type != json_string) { + error_message = "Inline rules must be an array of rule names"; + goto error; + } + + grammar.variables_to_inline.insert(rules::NamedSymbol{string(inline_rule_json->u.string.ptr)}); + } + } + external_tokens_json = grammar_json->operator[]("externals"); if (external_tokens_json.type != json_none) { if (external_tokens_json.type != json_array) { diff --git a/src/compiler/parse_grammar.h b/src/compiler/parse_grammar.h index 04e7672b..c24cd9ca 100644 --- a/src/compiler/parse_grammar.h +++ b/src/compiler/parse_grammar.h @@ -2,6 +2,7 @@ #define COMPILER_GRAMMAR_JSON_H_ #include +#include #include "tree_sitter/compiler.h" #include "compiler/grammar.h" diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index 39b2075d..c4a25634 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -94,6 +94,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { result.extra_tokens = grammar.extra_tokens; result.expected_conflicts = grammar.expected_conflicts; result.external_tokens = grammar.external_tokens; + result.variables_to_inline = grammar.variables_to_inline; ExpandRepeats expander(result.variables.size()); for (auto &variable : result.variables) { diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 6893cde4..73d3d866 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -235,6 +235,10 @@ tuple extract_tokens( syntax_grammar.expected_conflicts.insert(new_conflict_set); } + for (const Symbol &symbol : grammar.variables_to_inline) { + syntax_grammar.variables_to_inline.insert(symbol_replacer.replace_symbol(symbol)); + } + // The grammar's extra tokens can be either token rules or symbols // pointing to token rules. If they are symbols, then they'll be handled by // the parser; add them to the syntax grammar's extra tokens. If they diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index 86c76a86..e41e8398 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -111,6 +111,7 @@ SyntaxVariable flatten_rule(const Variable &variable) { pair flatten_grammar(const InitialSyntaxGrammar &grammar) { SyntaxGrammar result; result.external_tokens = grammar.external_tokens; + result.variables_to_inline = grammar.variables_to_inline; for (const auto &expected_conflict : grammar.expected_conflicts) { result.expected_conflicts.insert({ diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h index 55eb2b7e..881c6396 100644 --- a/src/compiler/prepare_grammar/initial_syntax_grammar.h +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -16,6 +16,7 @@ struct InitialSyntaxGrammar { std::set extra_tokens; std::set> expected_conflicts; std::vector external_tokens; + std::set variables_to_inline; }; } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index deaeb122..7bb2a80b 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -142,6 +142,13 @@ pair intern_symbols(const InputGrammar &grammar) result.expected_conflicts.insert(entry); } + for (auto &named_symbol : grammar.variables_to_inline) { + auto symbol = interner.intern_symbol(named_symbol); + if (symbol != rules::NONE()) { + result.variables_to_inline.insert(symbol); + } + } + return {result, CompileError::none()}; } diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h index 99987f42..c96dfa66 100644 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ b/src/compiler/prepare_grammar/interned_grammar.h @@ -15,6 +15,7 @@ struct InternedGrammar { std::vector extra_tokens; std::set> expected_conflicts; std::vector external_tokens; + std::set variables_to_inline; }; } // namespace prepare_grammar diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 55e55568..3c3d3b66 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -74,6 +74,7 @@ struct SyntaxGrammar { std::set extra_tokens; std::set expected_conflicts; std::vector external_tokens; + std::set variables_to_inline; }; } // namespace tree_sitter diff --git a/test/fixtures/test_grammars/inline_rules/corpus.txt b/test/fixtures/test_grammars/inline_rules/corpus.txt new file mode 100644 index 00000000..af5e496e --- /dev/null +++ b/test/fixtures/test_grammars/inline_rules/corpus.txt @@ -0,0 +1,11 @@ +================================== +Expressions +================================== + +1 + 2 * 3; + +--- + +(statement (sum + (number) + (product (number) (number)))) diff --git a/test/fixtures/test_grammars/inline_rules/grammar.json b/test/fixtures/test_grammars/inline_rules/grammar.json new file mode 100644 index 00000000..4438004f --- /dev/null +++ b/test/fixtures/test_grammars/inline_rules/grammar.json @@ -0,0 +1,68 @@ +{ + "name": "inline_rules", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "inline": [ + "expression" + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ";"} + ] + }, + + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "parenthesized_expression"} + ] + }, + + "parenthesized_expression": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ")"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "product": { + "type": "PREC_LEFT", + "value": 2, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "number": {"type": "PATTERN", "value": "\\d+"} + } +}