#include "compiler/build_tables/build_parse_table.h"
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/parse_table.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/build_tables/recovery_tokens.h"

namespace tree_sitter {
namespace build_tables {

using std::find;
using std::pair;
using std::vector;
using std::set;
using std::map;
using std::string;
using std::to_string;
using std::unordered_map;
using std::make_shared;
using rules::Associativity;
using rules::Symbol;
using rules::END_OF_INPUT;

// Builds a ParseTable from a syntax grammar and a lexical grammar.
//
// Parse states are created on demand, one per distinct ParseItemSet, and are
// processed from a work queue until no unprocessed item set remains. After the
// main pass, an error state (state 0) is built from the item sets recorded in
// `recovery_states`, fragile reductions are flagged, and mergeable duplicate
// states are removed.
class ParseTableBuilder {
  const SyntaxGrammar grammar;
  const LexicalGrammar lexical_grammar;

  // For each symbol, the union of all item sets reached by shifting that
  // symbol during the main (conflict-checked) pass.
  unordered_map<Symbol, ParseItemSet> recovery_states;

  // Maps every item set seen so far to the id of its parse state.
  unordered_map<ParseItemSet, ParseStateId> parse_state_ids;

  // Work queue of states whose actions have not been computed yet.
  vector<pair<ParseItemSet, ParseStateId>> item_sets_to_process;

  ParseTable parse_table;

  // NOTE(review): not referenced anywhere in this file; the element type is
  // assumed to be string -- confirm, or remove the member.
  set<string> conflicts;

  ParseItemSetBuilder item_set_builder;

  // Productions that took part in a precedence/associativity resolution.
  // Reduce actions using them are flagged as fragile by mark_fragile_actions.
  set<const Production *> fragile_productions;

  // While true, add_actions keeps every competing action instead of
  // resolving or reporting conflicts.
  bool allow_any_conflict;

 public:
  ParseTableBuilder(const SyntaxGrammar &grammar,
                    const LexicalGrammar &lex_grammar)
      : grammar(grammar),
        lexical_grammar(lex_grammar),
        item_set_builder(grammar, lex_grammar),
        allow_any_conflict(false) {}

  // Runs the whole construction. Returns the parse table together with
  // CompileError::none(), or the table built so far together with the first
  // unresolved conflict.
  pair<ParseTable, CompileError> build() {
    Symbol start_symbol = Symbol(0, grammar.variables.empty());
    Production start_production({
      ProductionStep(start_symbol, 0, rules::AssociativityNone),
    });

    // Placeholder for error state
    add_parse_state(ParseItemSet());

    add_parse_state(ParseItemSet({
      {
        ParseItem(rules::START(), start_production, 0),
        LookaheadSet({ END_OF_INPUT().index }),
      },
    }));

    CompileError error = process_part_state_queue();
    if (error.type != TSCompileErrorTypeNone)
      return { parse_table, error };

    parse_table.mergeable_symbols = recovery_tokens(lexical_grammar);

    build_error_parse_state();

    // The states reachable from the error state are processed with conflict
    // checking disabled: in that mode add_actions never records a conflicting
    // lookahead, so the returned error is always "none" and is ignored.
    allow_any_conflict = true;
    process_part_state_queue();
    allow_any_conflict = false;

    mark_fragile_actions();
    remove_duplicate_parse_states();

    return { parse_table, CompileError::none() };
  }

 private:
  // Drains the work queue, computing the actions of each pending state.
  // Stops at, and returns, the first unresolved conflict.
  CompileError process_part_state_queue() {
    while (!item_sets_to_process.empty()) {
      // Move the entry out of the queue instead of copying it; it is popped
      // immediately afterwards.
      auto entry = std::move(item_sets_to_process.back());
      item_sets_to_process.pop_back();

      ParseItemSet &item_set = entry.first;
      ParseStateId state_id = entry.second;

      item_set_builder.apply_transitive_closure(&item_set);
      string conflict = add_actions(item_set, state_id);

      if (!conflict.empty()) {
        return CompileError(TSCompileErrorTypeParseConflict, conflict);
      }
    }

    return CompileError::none();
  }

  // Fills in state 0, the error state, from the recorded recovery item sets.
  void build_error_parse_state() {
    ParseState error_state;

    for (const Symbol::Index index : parse_table.mergeable_symbols) {
      add_out_of_context_parse_state(&error_state, Symbol(index, true));
    }

    // Extra tokens that did not get a Recover action above are shifted as
    // extras.
    for (const Symbol &symbol : grammar.extra_tokens) {
      if (!error_state.terminal_entries.count(symbol.index)) {
        error_state.terminal_entries[symbol.index].actions.push_back(
          ParseAction::ShiftExtra());
      }
    }

    for (size_t i = 0; i < grammar.variables.size(); i++) {
      add_out_of_context_parse_state(&error_state, Symbol(i, false));
    }

    error_state.terminal_entries[END_OF_INPUT().index].actions.push_back(
      ParseAction::Recover(0));
    parse_table.states[0] = error_state;
  }

  // If any item set was recorded for `symbol`, creates (or reuses) the state
  // for it and points the error state at that state: through a Recover
  // action for tokens, through a nonterminal entry otherwise.
  void add_out_of_context_parse_state(ParseState *error_state,
                                      const rules::Symbol &symbol) {
    const ParseItemSet &item_set = recovery_states[symbol];
    if (!item_set.entries.empty()) {
      ParseStateId state = add_parse_state(item_set);
      if (symbol.is_token) {
        error_state->terminal_entries[symbol.index].actions.assign({
          ParseAction::Recover(state)
        });
      } else {
        error_state->nonterminal_entries[symbol.index] = state;
      }
    }
  }

  // Returns the id of the state for `item_set`, creating the state and
  // queueing it for processing the first time the item set is seen.
  ParseStateId add_parse_state(const ParseItemSet &item_set) {
    auto existing = parse_state_ids.find(item_set);
    if (existing != parse_state_ids.end()) {
      return existing->second;
    }

    ParseStateId state_id = parse_table.add_state();
    parse_state_ids[item_set] = state_id;
    parse_table.states[state_id].shift_actions_signature =
      item_set.unfinished_item_signature();
    // `item_set` is a const reference, so it is copied into the queue.
    item_sets_to_process.push_back({ item_set, state_id });
    return state_id;
  }

  // Computes all actions of state `state_id` from its (closed) item set.
  // Returns a description of the first unresolved conflict, or "" if none.
  string add_actions(const ParseItemSet &item_set, ParseStateId state_id) {
    map<Symbol::Index, ParseItemSet> terminal_successors;
    map<Symbol::Index, ParseItemSet> nonterminal_successors;
    set<Symbol::Index> lookaheads_with_conflicts;

    for (const auto &pair : item_set.entries) {
      const ParseItem &item = pair.first;
      const LookaheadSet &lookahead_symbols = pair.second;

      // If the item is finished, immediately add a Reduce or Accept action to
      // the parse table for each of its lookahead terminals.
      if (item.is_done()) {
        ParseAction action =
          (item.lhs() == rules::START())
            ? ParseAction::Accept()
            : ParseAction::Reduce(item.lhs(), item.step_index, *item.production);

        int precedence = item.precedence();
        for (const Symbol::Index lookahead : *lookahead_symbols.entries) {
          ParseTableEntry &entry =
            parse_table.states[state_id].terminal_entries[lookahead];

          // Only add the highest-precedence Reduce actions to the parse table.
          // If other lower-precedence actions are possible, ignore them.
          if (entry.actions.empty()) {
            parse_table.add_terminal_action(state_id, lookahead, action);
          } else {
            ParseAction &existing_action = entry.actions[0];
            if (allow_any_conflict) {
              entry.actions.push_back(action);
            } else {
              int existing_precedence = existing_action.precedence();
              if (precedence > existing_precedence) {
                for (const ParseAction &old_action : entry.actions)
                  fragile_productions.insert(old_action.production);
                entry.actions.clear();
                entry.actions.push_back(action);
                lookaheads_with_conflicts.erase(lookahead);
              } else if (precedence == existing_precedence) {
                entry.actions.push_back(action);
                lookaheads_with_conflicts.insert(lookahead);
              } else {
                fragile_productions.insert(item.production);
              }
            }
          }
        }

      // If the item is unfinished, create a new item by advancing one symbol.
      // Add that new item to a successor item set.
      } else {
        Symbol symbol = item.production->at(item.step_index).symbol;
        ParseItem new_item(item.lhs(), *item.production, item.step_index + 1);

        if (symbol.is_token) {
          terminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
        } else {
          nonterminal_successors[symbol.index].entries[new_item] = lookahead_symbols;
        }
      }
    }

    // Add a Shift action for each possible successor state. Shift actions for
    // terminal lookaheads can conflict with Reduce actions added previously.
    for (auto &pair : terminal_successors) {
      Symbol::Index lookahead = pair.first;
      ParseItemSet &next_item_set = pair.second;
      ParseStateId next_state_id = add_parse_state(next_item_set);

      // Take the reference only after add_parse_state, which may add a state
      // to the table.
      ParseState &state = parse_table.states[state_id];
      bool had_existing_action = !state.terminal_entries[lookahead].actions.empty();
      parse_table.add_terminal_action(state_id, lookahead,
                                      ParseAction::Shift(next_state_id));

      if (!allow_any_conflict) {
        if (had_existing_action)
          lookaheads_with_conflicts.insert(lookahead);
        recovery_states[Symbol(lookahead, true)].add(next_item_set);
      }
    }

    // Add a Shift action for each non-terminal transition.
    for (auto &pair : nonterminal_successors) {
      Symbol::Index lookahead = pair.first;
      ParseItemSet &next_item_set = pair.second;
      ParseStateId next_state = add_parse_state(next_item_set);
      parse_table.set_nonterminal_action(state_id, lookahead, next_state);
      if (!allow_any_conflict)
        recovery_states[Symbol(lookahead, false)].add(next_item_set);
    }

    for (Symbol::Index lookahead : lookaheads_with_conflicts) {
      string conflict = handle_conflict(item_set, state_id, lookahead);
      if (!conflict.empty()) return conflict;
    }

    ParseAction shift_extra = ParseAction::ShiftExtra();
    ParseState &state = parse_table.states[state_id];
    for (const Symbol &extra_symbol : grammar.extra_tokens) {
      if (!state.terminal_entries.count(extra_symbol.index) ||
          state.has_shift_action() || allow_any_conflict) {
        parse_table.add_terminal_action(state_id, extra_symbol.index, shift_extra);
      }
    }

    return "";
  }

  // Flags every Reduce action whose production is in `fragile_productions`,
  // clears the production pointers, and then removes duplicate actions from
  // each terminal entry.
  void mark_fragile_actions() {
    for (ParseState &state : parse_table.states) {
      for (auto &entry : state.terminal_entries) {
        auto &actions = entry.second.actions;

        for (ParseAction &action : actions) {
          if (action.type == ParseActionTypeReduce) {
            if (has_fragile_production(action.production))
              action.fragile = true;
            action.production = nullptr;
          }
        }

        // Drop each action that equals an earlier one in the same entry.
        // erase() invalidates the erased iterator, so continue from the
        // iterator it returns.
        for (auto i = actions.begin(); i != actions.end();) {
          if (find(actions.begin(), i, *i) != i) {
            i = actions.erase(i);
          } else {
            ++i;
          }
        }
      }
    }
  }

  // Repeatedly merges states that share a shift-action signature and that
  // ParseTable::merge_state accepts, rewrites references to merged states,
  // and finally erases the merged states and renumbers the remaining ones.
  void remove_duplicate_parse_states() {
    using Signature = decltype(ParseState::shift_actions_signature);
    map<Signature, set<ParseStateId>> state_indices_by_signature;

    for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
      ParseState &state = parse_table.states[i];
      state_indices_by_signature[state.shift_actions_signature].insert(i);
    }

    set<ParseStateId> deleted_states;

    while (true) {
      std::map<ParseStateId, ParseStateId> state_replacements;

      for (auto &pair : state_indices_by_signature) {
        auto &state_group = pair.second;

        for (ParseStateId i : state_group) {
          // Only earlier members of the group are merge targets.
          for (ParseStateId j : state_group) {
            if (j == i) break;
            if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
              state_replacements.insert({ i, j });
              deleted_states.insert(i);
              break;
            }
          }
        }
      }

      if (state_replacements.empty()) break;

      for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
        ParseState &state = parse_table.states[i];

        if (state_replacements.count(i)) {
          state_indices_by_signature[state.shift_actions_signature].erase(i);
        } else {
          state.each_referenced_state([&state_replacements](ParseStateId *state_index) {
            auto replacement = state_replacements.find(*state_index);
            if (replacement != state_replacements.end()) {
              *state_index = replacement->second;
            }
          });
        }
      }
    }

    // new_state_ids[i] is i minus the number of deleted states below i.
    vector<ParseStateId> new_state_ids(parse_table.states.size());
    size_t deleted_state_count = 0;
    auto deleted_state_iter = deleted_states.begin();
    for (ParseStateId i = 0; i < new_state_ids.size(); i++) {
      while (deleted_state_iter != deleted_states.end() && *deleted_state_iter < i) {
        deleted_state_count++;
        deleted_state_iter++;
      }
      new_state_ids[i] = i - deleted_state_count;
    }

    ParseStateId original_state_index = 0;
    auto iter = parse_table.states.begin();
    while (iter != parse_table.states.end()) {
      if (deleted_states.count(original_state_index)) {
        iter = parse_table.states.erase(iter);
      } else {
        ParseState &state = *iter;
        state.each_referenced_state([&new_state_ids](ParseStateId *state_index) {
          *state_index = new_state_ids[*state_index];
        });
        ++iter;
      }
      original_state_index++;
    }
  }

  // Tries to resolve the conflict on `lookahead` in state `state_id` using
  // precedence, associativity and the grammar's expected conflicts.
  // Returns "" when the conflict is resolved or expected; otherwise returns a
  // human-readable description of the conflict and of possible resolutions.
  string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
                         Symbol::Index lookahead) {
    ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
    int reduction_precedence = entry.actions.front().precedence();
    set<ParseItem> shift_items;
    bool considered_associativity = false;

    for (const ParseAction &action : entry.actions)
      if (action.type == ParseActionTypeReduce)
        fragile_productions.insert(action.production);

    if (entry.actions.back().type == ParseActionTypeShift) {
      PrecedenceRange shift_precedence;
      for (const auto &item_set_entry : item_set.entries) {
        const ParseItem &item = item_set_entry.first;
        if (item.step_index > 0 && !item.is_done()) {
          LookaheadSet first_set = item_set_builder.get_first_set(item.next_symbol());
          if (first_set.contains(lookahead)) {
            shift_items.insert(item);
            shift_precedence.add(item.precedence());
          }
        }
      }

      // If the shift action has higher precedence, prefer it over any of the
      // reduce actions.
      if (shift_precedence.min > reduction_precedence ||
          (shift_precedence.min == reduction_precedence &&
           shift_precedence.max > reduction_precedence)) {
        for (const ParseAction &action : entry.actions) {
          if (action.type == ParseActionTypeShift) break;
          fragile_productions.insert(action.production);
        }
        entry.actions.assign({ entry.actions.back() });
      }

      // If the shift action has lower precedence, prefer the reduce actions.
      else if (shift_precedence.max < reduction_precedence ||
               (shift_precedence.max == reduction_precedence &&
                shift_precedence.min < reduction_precedence)) {
        entry.actions.pop_back();
      }

      // If the shift action has the same precedence as the reduce actions,
      // consider the reduce actions' associativity. If they are all left
      // associative, prefer the reduce actions. If they are all right
      // associative, prefer the shift.
      else if (shift_precedence.min == reduction_precedence &&
               shift_precedence.max == reduction_precedence) {
        considered_associativity = true;
        bool has_non_associative_reductions = false;
        bool has_left_associative_reductions = false;
        bool has_right_associative_reductions = false;
        for (const ParseAction &action : entry.actions) {
          if (action.type != ParseActionTypeReduce) break;
          switch (action.associativity()) {
            case rules::AssociativityLeft:
              has_left_associative_reductions = true;
              break;
            case rules::AssociativityRight:
              has_right_associative_reductions = true;
              break;
            default:
              has_non_associative_reductions = true;
              break;
          }
        }

        if (!has_non_associative_reductions) {
          if (has_right_associative_reductions && !has_left_associative_reductions) {
            for (const ParseAction &action : entry.actions) {
              if (action.type == ParseActionTypeShift) break;
              fragile_productions.insert(action.production);
            }
            entry.actions.assign({ entry.actions.back() });
          } else if (has_left_associative_reductions && !has_right_associative_reductions) {
            entry.actions.pop_back();
          }
        }
      }
    }

    if (entry.actions.size() == 1) return "";

    set<Symbol> actual_conflict;
    for (const ParseItem &item : shift_items)
      actual_conflict.insert(item.lhs());
    for (const ParseAction &action : entry.actions)
      if (action.type == ParseActionTypeReduce)
        actual_conflict.insert(action.symbol);

    for (const auto &expected_conflict : grammar.expected_conflicts)
      if (expected_conflict == actual_conflict)
        return "";

    // Pick the item with the largest step index; its production supplies the
    // symbol sequence printed in the message.
    ParseItem earliest_starting_item;
    for (const ParseAction &action : entry.actions)
      if (action.type == ParseActionTypeReduce)
        if (action.consumed_symbol_count > earliest_starting_item.step_index)
          earliest_starting_item = ParseItem(action.symbol, *action.production,
                                             action.consumed_symbol_count);

    for (const ParseItem &shift_item : shift_items)
      if (shift_item.step_index > earliest_starting_item.step_index)
        earliest_starting_item = shift_item;

    string description = "Unresolved conflict for symbol sequence:\n\n";
    for (size_t i = 0; i < earliest_starting_item.step_index; i++) {
      description += " " + symbol_name(earliest_starting_item.production->at(i).symbol);
    }

    description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
    description += "\n\n";

    description += "Possible interpretations:\n\n";

    size_t interpretation_count = 1;
    for (const ParseAction &action : entry.actions) {
      if (action.type == ParseActionTypeReduce) {
        description += " " + to_string(interpretation_count++) + ":";

        for (size_t i = 0; i < earliest_starting_item.step_index - action.consumed_symbol_count; i++) {
          description += " " + symbol_name(earliest_starting_item.production->at(i).symbol);
        }

        description += " (" + symbol_name(action.symbol);
        for (const ProductionStep &step : *action.production) {
          description += " " + symbol_name(step.symbol);
        }
        description += ")";
        description += " \u2022 " + symbol_name(Symbol(lookahead, true)) + " \u2026";
        description += "\n";
      }
    }

    for (const ParseItem &shift_item : shift_items) {
      description += " " + to_string(interpretation_count++) + ":";

      for (size_t i = 0; i < earliest_starting_item.step_index - shift_item.step_index; i++) {
        description += " " + symbol_name(earliest_starting_item.production->at(i).symbol);
      }

      description += " (" + symbol_name(shift_item.lhs());
      for (size_t i = 0; i < shift_item.production->size(); i++) {
        if (i == shift_item.step_index)
          description += " \u2022";
        description += " " + symbol_name(shift_item.production->at(i).symbol);
      }
      description += ")";
      description += "\n";
    }

    description += "\nPossible resolutions:\n\n";

    size_t resolution_count = 1;
    if (actual_conflict.size() > 1) {
      if (!shift_items.empty()) {
        description += " " + to_string(resolution_count++) + ": ";
        description += "Specify a higher precedence in";
        bool is_first = true;
        for (const ParseItem &shift_item : shift_items) {
          if (!is_first) description += " and";
          description += " `" + symbol_name(shift_item.lhs()) + "`";
          is_first = false;
        }
        description += " than in the other rules.\n";
      }

      for (const ParseAction &action : entry.actions) {
        if (action.type == ParseActionTypeReduce) {
          description += " " + to_string(resolution_count++) + ": ";
          description += "Specify a higher precedence in `";
          description += symbol_name(action.symbol);
          description += "` than in the other rules.\n";
        }
      }
    }

    if (considered_associativity) {
      description += " " + to_string(resolution_count++) + ": ";
      description += "Specify a left or right associativity in";
      // Declared outside the loop so that " and" separates the second and
      // later rule names, as in the precedence resolution above.
      bool is_first = true;
      for (const ParseAction &action : entry.actions) {
        if (action.type == ParseActionTypeReduce) {
          if (!is_first) description += " and";
          description += " `" + symbol_name(action.symbol) + "`";
          is_first = false;
        }
      }
      description += "\n";
    }

    description += " " + to_string(resolution_count++) + ": ";
    description += "Add a conflict for these rules:";
    for (const Symbol &conflict_symbol : actual_conflict) {
      description += " `" + symbol_name(conflict_symbol) + "`";
    }
    description += "\n";
    return description;
  }

  // Returns the display name of `symbol`: "END_OF_INPUT" or "" for built-in
  // symbols, the lexical variable's name for tokens (wrapped in single
  // quotes unless the variable is named), and the syntax variable's name
  // otherwise.
  string symbol_name(const rules::Symbol &symbol) const {
    if (symbol.is_built_in()) {
      if (symbol == END_OF_INPUT())
        return "END_OF_INPUT";
      else
        return "";
    } else if (symbol.is_token) {
      const Variable &variable = lexical_grammar.variables[symbol.index];
      if (variable.type == VariableTypeNamed)
        return variable.name;
      else
        return "'" + variable.name + "'";
    } else {
      return grammar.variables[symbol.index].name;
    }
  }

  // True if `production` was recorded as fragile during conflict resolution.
  bool has_fragile_production(const Production *production) {
    return fragile_productions.find(production) != fragile_productions.end();
  }
};

// Builds the parse table for the given grammars. See ParseTableBuilder::build.
pair<ParseTable, CompileError> build_parse_table(
  const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) {
  return ParseTableBuilder(grammar, lex_grammar).build();
}

}  // namespace build_tables
}  // namespace tree_sitter