diff --git a/project.gyp b/project.gyp index 2687a4cf..536b308c 100644 --- a/project.gyp +++ b/project.gyp @@ -14,7 +14,7 @@ 'src/compiler/build_tables/build_lex_table.cc', 'src/compiler/build_tables/build_parse_table.cc', 'src/compiler/build_tables/build_tables.cc', - 'src/compiler/build_tables/recovery_tokens.cc', + 'src/compiler/build_tables/compatible_tokens.cc', 'src/compiler/build_tables/lex_item.cc', 'src/compiler/build_tables/lex_item_transitions.cc', 'src/compiler/build_tables/lex_conflict_manager.cc', diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/compatible_tokens_spec.cc similarity index 81% rename from spec/compiler/build_tables/distinctive_tokens_spec.cc rename to spec/compiler/build_tables/compatible_tokens_spec.cc index f01d76cb..4dcf531a 100644 --- a/spec/compiler/build_tables/distinctive_tokens_spec.cc +++ b/spec/compiler/build_tables/compatible_tokens_spec.cc @@ -1,6 +1,6 @@ #include "spec_helper.h" #include "compiler/rules/character_set.h" -#include "compiler/build_tables/recovery_tokens.h" +#include "compiler/build_tables/compatible_tokens.h" #include "compiler/lexical_grammar.h" #include "helpers/rule_helpers.h" #include "helpers/stream_methods.h" @@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() { })), }; - AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) })); + AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) })); }); }); diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index 29d8f4d0..e0dc4a8b 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -7,7 +7,6 @@ #include #include #include "compiler/build_tables/lex_conflict_manager.h" -#include "compiler/build_tables/remove_duplicate_states.h" #include "compiler/build_tables/lex_item.h" #include "compiler/parse_table.h" #include 
"compiler/lexical_grammar.h" @@ -143,13 +142,64 @@ class LexTableBuilder { state.accept_action.precedence = 0; } - auto replacements = - remove_duplicate_states(&lex_table); + map replacements; + + while (true) { + map duplicates; + for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) { + for (LexStateId j = 0; j < i; j++) { + if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) { + duplicates.insert({ i, j }); + break; + } + } + } + + if (duplicates.empty()) break; + + map new_replacements; + for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) { + LexStateId new_state_index = i; + auto duplicate = duplicates.find(i); + if (duplicate != duplicates.end()) { + new_state_index = duplicate->second; + } + + size_t prior_removed = 0; + for (const auto &duplicate : duplicates) { + if (duplicate.first >= new_state_index) break; + prior_removed++; + } + + new_state_index -= prior_removed; + new_replacements.insert({ i, new_state_index }); + replacements.insert({ i, new_state_index }); + for (auto &replacement : replacements) { + if (replacement.second == i) { + replacement.second = new_state_index; + } + } + } + + for (auto &state : lex_table.states) { + for (auto &entry : state.advance_actions) { + auto new_replacement = new_replacements.find(entry.second.state_index); + if (new_replacement != new_replacements.end()) { + entry.second.state_index = new_replacement->second; + } + } + } + + for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) { + lex_table.states.erase(lex_table.states.begin() + i->first); + } + } for (ParseState &parse_state : parse_table->states) { auto replacement = replacements.find(parse_state.lex_state_id); - if (replacement != replacements.end()) + if (replacement != replacements.end()) { parse_state.lex_state_id = replacement->second; + } } } diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 9fb6859f..ce721119 100644 --- 
a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -6,14 +6,13 @@ #include #include #include "compiler/parse_table.h" -#include "compiler/build_tables/remove_duplicate_states.h" #include "compiler/build_tables/parse_item.h" #include "compiler/build_tables/parse_item_set_builder.h" #include "compiler/lexical_grammar.h" #include "compiler/syntax_grammar.h" #include "compiler/rules/symbol.h" #include "compiler/rules/built_in_symbols.h" -#include "compiler/build_tables/recovery_tokens.h" +#include "compiler/build_tables/compatible_tokens.h" namespace tree_sitter { namespace build_tables { @@ -41,6 +40,7 @@ class ParseTableBuilder { set conflicts; ParseItemSetBuilder item_set_builder; set fragile_productions; + CompatibleTokensResult compatible_tokens; bool allow_any_conflict; public: @@ -49,6 +49,7 @@ class ParseTableBuilder { : grammar(grammar), lexical_grammar(lex_grammar), item_set_builder(grammar, lex_grammar), + compatible_tokens(get_compatible_tokens(lex_grammar)), allow_any_conflict(false) {} pair build() { @@ -74,7 +75,7 @@ class ParseTableBuilder { if (error.type != TSCompileErrorTypeNone) return { parse_table, error }; - parse_table.mergeable_symbols = recovery_tokens(lexical_grammar); + parse_table.mergeable_symbols = compatible_tokens.recovery_tokens; build_error_parse_state(); @@ -302,7 +303,7 @@ class ParseTableBuilder { set deleted_states; while (true) { - std::map state_replacements; + map state_replacements; for (auto &pair : state_indices_by_signature) { auto &state_group = pair.second; @@ -310,7 +311,7 @@ class ParseTableBuilder { for (ParseStateId i : state_group) { for (ParseStateId j : state_group) { if (j == i) break; - if (!state_replacements.count(j) && parse_table.merge_state(j, i)) { + if (!state_replacements.count(j) && merge_parse_state(j, i)) { state_replacements.insert({ i, j }); deleted_states.insert(i); break; @@ -364,6 +365,60 @@ class ParseTableBuilder { } } + static bool 
has_entry(const ParseState &state, const ParseTableEntry &entry) { + for (const auto &pair : state.terminal_entries) + if (pair.second == entry) + return true; + return false; + } + + bool merge_parse_state(size_t i, size_t j) { + ParseState &state = parse_table.states[i]; + ParseState &other = parse_table.states[j]; + + if (state.nonterminal_entries != other.nonterminal_entries) + return false; + + for (auto &entry : state.terminal_entries) { + Symbol lookahead = entry.first; + const vector &actions = entry.second.actions; + + const auto &other_entry = other.terminal_entries.find(lookahead); + if (other_entry == other.terminal_entries.end()) { + if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in()) + return false; + if (actions.back().type != ParseActionTypeReduce) + return false; + if (!has_entry(other, entry.second)) + return false; + } else if (entry.second != other_entry->second) { + return false; + } + } + + set symbols_to_merge; + + for (auto &entry : other.terminal_entries) { + Symbol lookahead = entry.first; + const vector &actions = entry.second.actions; + + if (!state.terminal_entries.count(lookahead)) { + if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in()) + return false; + if (actions.back().type != ParseActionTypeReduce) + return false; + if (!has_entry(state, entry.second)) + return false; + symbols_to_merge.insert(lookahead); + } + } + + for (const Symbol &lookahead : symbols_to_merge) + state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; + + return true; + } + string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; diff --git a/src/compiler/build_tables/compatible_tokens.cc b/src/compiler/build_tables/compatible_tokens.cc new file mode 100644 index 00000000..1f29b671 --- /dev/null +++ 
b/src/compiler/build_tables/compatible_tokens.cc @@ -0,0 +1,132 @@ +#include "compiler/build_tables/compatible_tokens.h" +#include "compiler/lexical_grammar.h" +#include "compiler/rules/choice.h" +#include "compiler/rules/character_set.h" +#include "compiler/rules/repeat.h" +#include "compiler/rules/visitor.h" +#include "compiler/rules/seq.h" +#include "compiler/rules/metadata.h" + +namespace tree_sitter { +namespace build_tables { + +using rules::Symbol; +using std::set; + +template +class CharacterAggregator : public rules::RuleFn { + void apply_to(const rules::Seq *rule) { + if (left) apply(rule->left); + if (right) apply(rule->right); + } + + void apply_to(const rules::Choice *rule) { + for (const rule_ptr &element : rule->elements) { + apply(element); + } + } + + void apply_to(const rules::Repeat *rule) { + apply(rule->content); + } + + void apply_to(const rules::Metadata *rule) { + apply(rule->rule); + } + + void apply_to(const rules::CharacterSet *rule) { + result.add_set(*rule); + } + + public: + rules::CharacterSet result; +}; + +template +class CharacterIntersector : public rules::RuleFn { + bool apply_to(const rules::Seq *rule) { + bool result = false; + if (left) result = apply(rule->left); + if (right && !result) result = apply(rule->right); + return result; + } + + bool apply_to(const rules::Choice *rule) { + for (const rule_ptr &element : rule->elements) { + if (apply(element)) return true; + } + return false; + } + + bool apply_to(const rules::Repeat *rule) { + return apply(rule->content); + } + + bool apply_to(const rules::Metadata *rule) { + return apply(rule->rule); + } + + bool apply_to(const rules::CharacterSet *rule) { + return character_set->intersects(*rule); + } + + public: + rules::CharacterSet *character_set; + + CharacterIntersector(rules::CharacterSet *set) : character_set {set} {} +}; + +using FirstCharacters = CharacterAggregator; +using LastCharacters = CharacterAggregator; +using AllCharacters = CharacterAggregator; +using 
FirstCharactersIntersector = CharacterIntersector; + +CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) { + CompatibleTokensResult result; + + AllCharacters all_separator_characters; + for (const rule_ptr &separator : grammar.separators) + all_separator_characters.apply(separator); + + for (size_t i = 0; i < grammar.variables.size(); i++) { + Symbol symbol(i, Symbol::Terminal); + rule_ptr rule = grammar.variables[i].rule; + + FirstCharacters first_characters; + first_characters.apply(rule); + + LastCharacters last_characters; + last_characters.apply(rule); + + AllCharacters all_characters; + all_characters.apply(rule); + + bool has_distinct_start = + !first_characters.result.includes_all && + !first_characters.result.intersects(all_separator_characters.result); + + bool has_distinct_end = + !last_characters.result.includes_all && + !last_characters.result.intersects(all_separator_characters.result); + + bool has_no_separators = + !all_characters.result.intersects(all_separator_characters.result); + + if ((has_distinct_start && has_distinct_end) || has_no_separators) + result.recovery_tokens.insert(symbol); + + for (size_t j = 0; j < grammar.variables.size(); j++) { + if (j == i) continue; + Symbol other_symbol(j, Symbol::Terminal); + FirstCharactersIntersector intersector(&first_characters.result); + if (intersector.apply(grammar.variables[j].rule)) { + result.unmergeable_pairs[symbol].insert(other_symbol); + } + } + } + + return result; +} + +} // namespace build_tables +} // namespace tree_sitter diff --git a/src/compiler/build_tables/compatible_tokens.h b/src/compiler/build_tables/compatible_tokens.h new file mode 100644 index 00000000..5c15358f --- /dev/null +++ b/src/compiler/build_tables/compatible_tokens.h @@ -0,0 +1,25 @@ +#ifndef COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_ +#define COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_ + +#include "compiler/rule.h" +#include "compiler/rules/symbol.h" +#include +#include + +namespace 
tree_sitter { + +struct LexicalGrammar; + +namespace build_tables { + +struct CompatibleTokensResult { + std::set recovery_tokens; + std::map> unmergeable_pairs; +}; + +CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &); + +} // namespace build_tables +} // namespace tree_sitter + +#endif // COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_ diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc deleted file mode 100644 index 84b175bc..00000000 --- a/src/compiler/build_tables/recovery_tokens.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "compiler/build_tables/recovery_tokens.h" -#include "compiler/lexical_grammar.h" -#include "compiler/rules/choice.h" -#include "compiler/rules/character_set.h" -#include "compiler/rules/repeat.h" -#include "compiler/rules/visitor.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/metadata.h" - -namespace tree_sitter { -namespace build_tables { - -using rules::Symbol; -using std::set; - -template -class CharacterAggregator : public rules::RuleFn { - void apply_to(const rules::Seq *rule) { - if (left) - apply(rule->left); - if (right) - apply(rule->right); - } - - void apply_to(const rules::Choice *rule) { - for (const rule_ptr &element : rule->elements) - apply(element); - } - - void apply_to(const rules::Repeat *rule) { - apply(rule->content); - } - - void apply_to(const rules::Metadata *rule) { - apply(rule->rule); - } - - void apply_to(const rules::CharacterSet *rule) { - result.add_set(*rule); - } - - public: - rules::CharacterSet result; -}; - -class FirstCharacters : public CharacterAggregator {}; -class LastCharacters : public CharacterAggregator {}; -class AllCharacters : public CharacterAggregator {}; - -set recovery_tokens(const LexicalGrammar &grammar) { - set result; - - AllCharacters all_separator_characters; - for (const rule_ptr &separator : grammar.separators) - all_separator_characters.apply(separator); - - for (size_t i = 0; i < 
grammar.variables.size(); i++) { - const Variable &variable = grammar.variables[i]; - rule_ptr rule = variable.rule; - - FirstCharacters first_characters; - first_characters.apply(variable.rule); - - LastCharacters last_characters; - last_characters.apply(variable.rule); - - AllCharacters all_characters; - all_characters.apply(variable.rule); - - bool has_distinct_start = - !first_characters.result.includes_all && - !first_characters.result.intersects(all_separator_characters.result); - - bool has_distinct_end = - !last_characters.result.includes_all && - !last_characters.result.intersects(all_separator_characters.result); - - bool has_no_separators = - !all_characters.result.intersects(all_separator_characters.result); - - if ((has_distinct_start && has_distinct_end) || has_no_separators) - result.insert(Symbol(i, Symbol::Terminal)); - } - - return result; -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h deleted file mode 100644 index c97a8cfd..00000000 --- a/src/compiler/build_tables/recovery_tokens.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ -#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ - -#include "compiler/rule.h" -#include "compiler/rules/symbol.h" -#include - -namespace tree_sitter { - -struct LexicalGrammar; - -namespace build_tables { - -std::set recovery_tokens(const LexicalGrammar &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ diff --git a/src/compiler/build_tables/remove_duplicate_states.h b/src/compiler/build_tables/remove_duplicate_states.h deleted file mode 100644 index a154c05a..00000000 --- a/src/compiler/build_tables/remove_duplicate_states.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ -#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ - -#include -#include - 
-namespace tree_sitter { -namespace build_tables { - -template -std::map remove_duplicate_states(TableType *table) { - std::map replacements; - - while (true) { - std::map duplicates; - for (size_t i = 0, size = table->states.size(); i < size; i++) - for (size_t j = 0; j < i; j++) - if (!duplicates.count(j) && table->merge_state(j, i)) { - duplicates.insert({ i, j }); - break; - } - - if (duplicates.empty()) - break; - - std::map new_replacements; - for (size_t i = 0, size = table->states.size(); i < size; i++) { - size_t new_state_index = i; - auto duplicate = duplicates.find(i); - if (duplicate != duplicates.end()) - new_state_index = duplicate->second; - - size_t prior_removed = 0; - for (const auto &duplicate : duplicates) { - if (duplicate.first >= new_state_index) - break; - prior_removed++; - } - - new_state_index -= prior_removed; - new_replacements.insert({ i, new_state_index }); - replacements.insert({ i, new_state_index }); - for (auto &replacement : replacements) - if (replacement.second == i) - replacement.second = new_state_index; - } - - for (auto &state : table->states) - state.each_referenced_state([&new_replacements](int64_t *state_index) { - auto new_replacement = new_replacements.find(*state_index); - if (new_replacement != new_replacements.end()) - *state_index = new_replacement->second; - }); - - for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) - table->states.erase(table->states.begin() + i->first); - } - - return replacements; -} - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc index 8f8d2ded..67ab20c0 100644 --- a/src/compiler/lex_table.cc +++ b/src/compiler/lex_table.cc @@ -57,11 +57,6 @@ bool LexState::operator==(const LexState &other) const { is_token_start == other.is_token_start; } -void LexState::each_referenced_state(function fn) { - for (auto &entry : advance_actions) - 
fn(&entry.second.state_index); -} - LexStateId LexTable::add_state() { states.push_back(LexState()); return states.size() - 1; @@ -71,8 +66,4 @@ LexState &LexTable::state(LexStateId id) { return states[id]; } -bool LexTable::merge_state(size_t i, size_t j) { - return states[i] == states[j]; -} - } // namespace tree_sitter diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index ac7357a1..4ffd3421 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -54,7 +54,6 @@ class LexState { LexState(); std::set expected_inputs() const; bool operator==(const LexState &) const; - void each_referenced_state(std::function); std::map advance_actions; AcceptTokenAction accept_action; @@ -66,8 +65,6 @@ class LexTable { LexStateId add_state(); LexState &state(LexStateId state_id); std::vector states; - - bool merge_state(size_t i, size_t j); }; } // namespace tree_sitter diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index a04eec8c..1b4355c1 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -201,58 +201,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id, states[state_id].nonterminal_entries[lookahead] = next_state_id; } -static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { - for (const auto &pair : state.terminal_entries) - if (pair.second == entry) - return true; - return false; -} - -bool ParseTable::merge_state(size_t i, size_t j) { - ParseState &state = states[i]; - ParseState &other = states[j]; - - if (state.nonterminal_entries != other.nonterminal_entries) - return false; - - for (auto &entry : state.terminal_entries) { - Symbol lookahead = entry.first; - const vector &actions = entry.second.actions; - - const auto &other_entry = other.terminal_entries.find(lookahead); - if (other_entry == other.terminal_entries.end()) { - if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) - return false; - if (actions.back().type != 
ParseActionTypeReduce) - return false; - if (!has_entry(other, entry.second)) - return false; - } else if (entry.second != other_entry->second) { - return false; - } - } - - set symbols_to_merge; - - for (auto &entry : other.terminal_entries) { - Symbol lookahead = entry.first; - const vector &actions = entry.second.actions; - - if (!state.terminal_entries.count(lookahead)) { - if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) - return false; - if (actions.back().type != ParseActionTypeReduce) - return false; - if (!has_entry(state, entry.second)) - return false; - symbols_to_merge.insert(lookahead); - } - } - - for (const Symbol &lookahead : symbols_to_merge) - state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; - - return true; -} - } // namespace tree_sitter diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 79eec4fc..615714b1 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -93,7 +93,6 @@ class ParseTable { ParseStateId add_state(); ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction); void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId); - bool merge_state(size_t i, size_t j); std::vector states; std::map symbols;