diff --git a/project.gyp b/project.gyp index 74124b15..91001727 100644 --- a/project.gyp +++ b/project.gyp @@ -15,12 +15,12 @@ 'src/compiler/build_tables/build_parse_table.cc', 'src/compiler/build_tables/build_tables.cc', 'src/compiler/build_tables/recovery_tokens.cc', - 'src/compiler/build_tables/item_set_closure.cc', 'src/compiler/build_tables/lex_item.cc', 'src/compiler/build_tables/lex_item_transitions.cc', 'src/compiler/build_tables/lex_conflict_manager.cc', 'src/compiler/build_tables/lookahead_set.cc', 'src/compiler/build_tables/parse_item.cc', + 'src/compiler/build_tables/parse_item_set_builder.cc', 'src/compiler/build_tables/parse_conflict_manager.cc', 'src/compiler/build_tables/rule_can_be_blank.cc', 'src/compiler/compile.cc', diff --git a/spec/compiler/build_tables/item_set_closure_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc similarity index 88% rename from spec/compiler/build_tables/item_set_closure_spec.cc rename to spec/compiler/build_tables/parse_item_set_builder_spec.cc index c8b30b71..5e387e51 100644 --- a/spec/compiler/build_tables/item_set_closure_spec.cc +++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc @@ -1,6 +1,6 @@ #include "spec_helper.h" #include "compiler/syntax_grammar.h" -#include "compiler/build_tables/item_set_closure.h" +#include "compiler/build_tables/parse_item_set_builder.h" #include "compiler/build_tables/lookahead_set.h" #include "compiler/rules/built_in_symbols.h" @@ -9,7 +9,7 @@ using namespace rules; START_TEST -describe("item_set_closure", []() { +describe("ParseItemSetBuilder", []() { it("adds items at the beginnings of referenced rules", [&]() { SyntaxGrammar grammar{{ SyntaxVariable("rule0", VariableTypeNamed, { @@ -39,12 +39,15 @@ describe("item_set_closure", []() { return grammar.variables[variable_index].productions[production_index]; }; - ParseItemSet item_set = item_set_closure(ParseItemSet({ + ParseItemSet item_set({ { ParseItem(Symbol(0), production(0, 0), 0), LookaheadSet({ Symbol(10, true) }), } - }), grammar); + }); + + ParseItemSetBuilder item_set_builder(grammar); + item_set_builder.apply_transitive_closure(&item_set); AssertThat(item_set, Equals(ParseItemSet({ { @@ -87,12 +90,15 @@ describe("item_set_closure", []() { return grammar.variables[variable_index].productions[production_index]; }; - ParseItemSet item_set = item_set_closure(ParseItemSet({ + ParseItemSet item_set({ { ParseItem(Symbol(0), production(0, 0), 0), LookaheadSet({ Symbol(10, true) }), } - }), grammar); + }); + + ParseItemSetBuilder item_set_builder(grammar); + item_set_builder.apply_transitive_closure(&item_set); AssertThat(item_set, Equals(ParseItemSet({ { diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index c5079848..829d8e93 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -9,7 +9,7 @@ #include "compiler/build_tables/parse_conflict_manager.h" #include "compiler/build_tables/remove_duplicate_states.h" #include "compiler/build_tables/parse_item.h" -#include "compiler/build_tables/item_set_closure.h" +#include "compiler/build_tables/parse_item_set_builder.h" #include "compiler/lexical_grammar.h" #include "compiler/syntax_grammar.h" #include "compiler/rules/symbol.h" @@ -40,6 +40,7 @@ class ParseTableBuilder { vector> item_sets_to_process; ParseTable parse_table; set conflicts; + ParseItemSetBuilder item_set_builder; set fragile_productions; bool allow_any_conflict; @@ -48,6 +49,7 @@ class ParseTableBuilder { const LexicalGrammar &lex_grammar) : grammar(grammar), lexical_grammar(lex_grammar), + item_set_builder(grammar), allow_any_conflict(false) {} pair build() { @@ -88,11 +90,11 @@ class ParseTableBuilder { CompileError process_part_state_queue() { while (!item_sets_to_process.empty()) { auto pair = item_sets_to_process.back(); - ParseItemSet item_set = item_set_closure(pair.first, grammar); - + ParseItemSet &item_set = pair.first; ParseStateId state_id = pair.second; item_sets_to_process.pop_back(); + item_set_builder.apply_transitive_closure(&item_set); add_reduce_actions(item_set, state_id); add_shift_actions(item_set, state_id); add_shift_extra_actions(state_id); @@ -143,7 +145,7 @@ class ParseTableBuilder { ParseStateId state_id = parse_table.add_state(); parse_state_ids[item_set] = state_id; - item_sets_to_process.push_back({ item_set, state_id }); + item_sets_to_process.push_back({ std::move(item_set), state_id }); return state_id; } else { return pair->second; diff --git a/src/compiler/build_tables/item_set_closure.cc b/src/compiler/build_tables/item_set_closure.cc deleted file mode 100644 index b74431ba..00000000 --- a/src/compiler/build_tables/item_set_closure.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "compiler/build_tables/item_set_closure.h" -#include -#include -#include -#include "compiler/syntax_grammar.h" -#include "compiler/rules/built_in_symbols.h" - -namespace tree_sitter { -namespace build_tables { - -using std::vector; -using std::pair; -using std::shared_ptr; -using std::make_shared; -using rules::Symbol; -using rules::NONE; - -ParseItemSet item_set_closure(const ParseItemSet &input_item_set, - const SyntaxGrammar &grammar) { - ParseItemSet result; - - // An item set's closure is defined recursively. Use an explicit stack to - // store the recursively-added items. - vector> items_to_process( - input_item_set.entries.begin(), input_item_set.entries.end()); - - while (!items_to_process.empty()) { - ParseItem item = items_to_process.back().first; - LookaheadSet lookahead_symbols = items_to_process.back().second; - items_to_process.pop_back(); - - // Add the parse-item and lookahead symbols to the item set. - // If they were already present, skip to the next item. - if (!result.entries[item].insert_all(lookahead_symbols)) - continue; - - // If the next symbol in the production is not a non-terminal, skip to the - // next item. - Symbol next_symbol = item.next_symbol(); - if (next_symbol == NONE() || next_symbol.is_token || - next_symbol.is_built_in()) - continue; - - // If the next symbol is the last symbol in the item's production, then the - // lookahead symbols for the new items are the same as for the current item. - // Otherwise, compute the FOLLOW-SET of the symbol in this production. This - // is defined recursively as well, so use another queue to store the - // recursively-added follow symbols. - LookaheadSet next_lookahead_symbols; - size_t next_step = item.step_index + 1; - if (next_step == item.production->size()) { - next_lookahead_symbols = lookahead_symbols; - } else { - vector symbols_to_process( - { item.production->at(next_step).symbol }); - while (!symbols_to_process.empty()) { - Symbol symbol = symbols_to_process.back(); - symbols_to_process.pop_back(); - - if (!next_lookahead_symbols.insert(symbol)) - continue; - - for (const Production &production : grammar.productions(symbol)) - if (!production.empty()) - symbols_to_process.push_back(production[0].symbol); - } - } - - // Add each of the next symbol's productions to be processed recursively. - for (const Production &production : grammar.productions(next_symbol)) - items_to_process.push_back({ - ParseItem(next_symbol, production, 0), next_lookahead_symbols, - }); - } - - return result; -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/item_set_closure.h b/src/compiler/build_tables/item_set_closure.h deleted file mode 100644 index 093d19d7..00000000 --- a/src/compiler/build_tables/item_set_closure.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ -#define COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ - -#include "compiler/build_tables/parse_item.h" -#include "compiler/rules/symbol.h" - -namespace tree_sitter { - -struct SyntaxGrammar; - -namespace build_tables { - -ParseItemSet item_set_closure(const ParseItemSet &, const SyntaxGrammar &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc index 55a1f78e..239bc029 100644 --- a/src/compiler/build_tables/lookahead_set.cc +++ b/src/compiler/build_tables/lookahead_set.cc @@ -28,6 +28,8 @@ bool LookaheadSet::contains(const Symbol &symbol) const { } bool LookaheadSet::insert_all(const LookaheadSet &other) { + if (!other.entries.get()) + return false; if (!entries.get()) entries = make_shared>(); size_t previous_size = entries->size(); diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc new file mode 100644 index 00000000..93ffbf78 --- /dev/null +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -0,0 +1,109 @@ +#include "compiler/build_tables/parse_item_set_builder.h" +#include +#include +#include +#include "compiler/syntax_grammar.h" +#include "compiler/rules/built_in_symbols.h" + +namespace tree_sitter { +namespace build_tables { + +using std::vector; +using std::set; +using std::map; +using std::get; +using std::tuple; +using std::make_tuple; +using std::shared_ptr; +using std::make_shared; +using rules::Symbol; +using rules::NONE; + +static map build_first_sets(const SyntaxGrammar &grammar) { + map result; + vector symbol_stack; + set processed_symbols; + + for (size_t i = 0; i < grammar.variables.size(); i++) { + Symbol symbol(i); + LookaheadSet first_set; + + processed_symbols.clear(); + symbol_stack.clear(); + symbol_stack.push_back(symbol); + while (!symbol_stack.empty()) { + Symbol current_symbol = symbol_stack.back(); + symbol_stack.pop_back(); + if (current_symbol.is_token) { + first_set.insert(current_symbol); + } else if (processed_symbols.insert(current_symbol).second) { + for (const Production &production : grammar.productions(current_symbol)) { + if (!production.empty()) { + symbol_stack.push_back(production[0].symbol); + } + } + } + } + + result.insert({symbol, first_set}); + } + + return result; +} + +ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar) : + grammar{&grammar}, first_sets{build_first_sets(grammar)} { +} + +void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) { + items_to_process.clear(); + for (const auto &entry : item_set->entries) { + items_to_process.push_back(make_tuple(entry.first, entry.second, true)); + } + + while (!items_to_process.empty()) { + ParseItem item = get<0>(items_to_process.back()); + LookaheadSet lookahead_symbols = get<1>(items_to_process.back()); + bool from_original_set = get<2>(items_to_process.back()); + items_to_process.pop_back(); + + // Add the parse-item and lookahead symbols to the item set. + // If they were already present, skip to the next item. + if (!from_original_set && !item_set->entries[item].insert_all(lookahead_symbols)) + continue; + + // If the next symbol in the production is not a non-terminal, skip to the + // next item. + Symbol next_symbol = item.next_symbol(); + if (next_symbol == NONE() || next_symbol.is_token || + next_symbol.is_built_in()) + continue; + + // If the next symbol is the last symbol in the item's production, then the + // lookahead symbols for the new items are the same as for the current item. + // Otherwise, they are the FOLLOW set of the symbol in this production. + LookaheadSet next_lookahead_symbols; + size_t next_step = item.step_index + 1; + if (next_step == item.production->size()) { + next_lookahead_symbols = lookahead_symbols; + } else { + Symbol symbol_after_next = item.production->at(next_step).symbol; + if (symbol_after_next.is_token) { + next_lookahead_symbols.insert(symbol_after_next); + } else { + next_lookahead_symbols = first_sets.find(symbol_after_next)->second; + } + } + + // Add each of the next symbol's productions to be processed recursively. + for (const Production &production : grammar->productions(next_symbol)) + items_to_process.push_back(make_tuple( + ParseItem(next_symbol, production, 0), + next_lookahead_symbols, + false + )); + } +} + +} // namespace build_tables +} // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h new file mode 100644 index 00000000..3d8eb5c6 --- /dev/null +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -0,0 +1,27 @@ +#ifndef COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_ +#define COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_ + +#include "compiler/build_tables/parse_item.h" +#include "compiler/rules/symbol.h" +#include + +namespace tree_sitter { + +struct SyntaxGrammar; + +namespace build_tables { + +class ParseItemSetBuilder { + const SyntaxGrammar *grammar; + std::map first_sets; + std::vector> items_to_process; + + public: + ParseItemSetBuilder(const SyntaxGrammar &); + void apply_transitive_closure(ParseItemSet *); +}; + +} // namespace build_tables +} // namespace tree_sitter + +#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_SET_BUILDER_H_ diff --git a/src/runtime/parser.c b/src/runtime/parser.c index fa7a7a26..04556ebb 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -1086,7 +1086,7 @@ static void parser__advance(Parser *self, StackVersion version, return; } - parser__handle_error(self, version, lookahead->symbol); + parser__handle_error(self, version, lookahead->first_leaf.symbol); if (ts_stack_is_halted(self->stack, version)) { ts_tree_release(lookahead);