Restructure parse state merging logic

* Remove remnants of templatized remove_duplicate_states function
* Rename recovery_tokens function to get_compatible_tokens and augment it
  to also compute pairs of tokens which could potentially be incompatible
This commit is contained in:
Max Brunsfeld 2017-02-26 12:23:35 -08:00
parent 8d3b72e1d9
commit 3c8e6f9987
13 changed files with 274 additions and 252 deletions

View file

@ -14,7 +14,7 @@
'src/compiler/build_tables/build_lex_table.cc',
'src/compiler/build_tables/build_parse_table.cc',
'src/compiler/build_tables/build_tables.cc',
'src/compiler/build_tables/recovery_tokens.cc',
'src/compiler/build_tables/compatible_tokens.cc',
'src/compiler/build_tables/lex_item.cc',
'src/compiler/build_tables/lex_item_transitions.cc',
'src/compiler/build_tables/lex_conflict_manager.cc',

View file

@ -1,6 +1,6 @@
#include "spec_helper.h"
#include "compiler/rules/character_set.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/build_tables/compatible_tokens.h"
#include "compiler/lexical_grammar.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
@ -27,7 +27,7 @@ describe("recovery_tokens(rule)", []() {
})),
};
AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
});
});

View file

@ -7,7 +7,6 @@
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
@ -143,13 +142,64 @@ class LexTableBuilder {
state.accept_action.precedence = 0;
}
auto replacements =
remove_duplicate_states<LexTable>(&lex_table);
map<LexStateId, LexStateId> replacements;
while (true) {
map<LexStateId, LexStateId> duplicates;
for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
for (LexStateId j = 0; j < i; j++) {
if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
duplicates.insert({ i, j });
break;
}
}
}
if (duplicates.empty()) break;
map<size_t, size_t> new_replacements;
for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
LexStateId new_state_index = i;
auto duplicate = duplicates.find(i);
if (duplicate != duplicates.end()) {
new_state_index = duplicate->second;
}
size_t prior_removed = 0;
for (const auto &duplicate : duplicates) {
if (duplicate.first >= new_state_index) break;
prior_removed++;
}
new_state_index -= prior_removed;
new_replacements.insert({ i, new_state_index });
replacements.insert({ i, new_state_index });
for (auto &replacement : replacements) {
if (replacement.second == i) {
replacement.second = new_state_index;
}
}
}
for (auto &state : lex_table.states) {
for (auto &entry : state.advance_actions) {
auto new_replacement = new_replacements.find(entry.second.state_index);
if (new_replacement != new_replacements.end()) {
entry.second.state_index = new_replacement->second;
}
}
}
for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
lex_table.states.erase(lex_table.states.begin() + i->first);
}
}
for (ParseState &parse_state : parse_table->states) {
auto replacement = replacements.find(parse_state.lex_state_id);
if (replacement != replacements.end())
if (replacement != replacements.end()) {
parse_state.lex_state_id = replacement->second;
}
}
}

View file

@ -6,14 +6,13 @@
#include <unordered_map>
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/build_tables/compatible_tokens.h"
namespace tree_sitter {
namespace build_tables {
@ -41,6 +40,7 @@ class ParseTableBuilder {
set<string> conflicts;
ParseItemSetBuilder item_set_builder;
set<const Production *> fragile_productions;
CompatibleTokensResult compatible_tokens;
bool allow_any_conflict;
public:
@ -49,6 +49,7 @@ class ParseTableBuilder {
: grammar(grammar),
lexical_grammar(lex_grammar),
item_set_builder(grammar, lex_grammar),
compatible_tokens(get_compatible_tokens(lex_grammar)),
allow_any_conflict(false) {}
pair<ParseTable, CompileError> build() {
@ -74,7 +75,7 @@ class ParseTableBuilder {
if (error.type != TSCompileErrorTypeNone)
return { parse_table, error };
parse_table.mergeable_symbols = recovery_tokens(lexical_grammar);
parse_table.mergeable_symbols = compatible_tokens.recovery_tokens;
build_error_parse_state();
@ -302,7 +303,7 @@ class ParseTableBuilder {
set<ParseStateId> deleted_states;
while (true) {
std::map<ParseStateId, ParseStateId> state_replacements;
map<ParseStateId, ParseStateId> state_replacements;
for (auto &pair : state_indices_by_signature) {
auto &state_group = pair.second;
@ -310,7 +311,7 @@ class ParseTableBuilder {
for (ParseStateId i : state_group) {
for (ParseStateId j : state_group) {
if (j == i) break;
if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
if (!state_replacements.count(j) && merge_parse_state(j, i)) {
state_replacements.insert({ i, j });
deleted_states.insert(i);
break;
@ -364,6 +365,60 @@ class ParseTableBuilder {
}
}
// Reports whether `state` holds, under any lookahead symbol, a terminal entry
// that compares equal to `entry`.
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
  auto candidate = state.terminal_entries.begin();
  const auto last = state.terminal_entries.end();
  while (candidate != last) {
    if (candidate->second == entry) {
      return true;
    }
    ++candidate;
  }
  return false;
}
bool merge_parse_state(size_t i, size_t j) {
ParseState &state = parse_table.states[i];
ParseState &other = parse_table.states[j];
if (state.nonterminal_entries != other.nonterminal_entries)
return false;
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(other, entry.second))
return false;
} else if (entry.second != other_entry->second) {
return false;
}
}
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(lookahead)) {
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}
string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];

View file

@ -0,0 +1,132 @@
#include "compiler/build_tables/compatible_tokens.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/metadata.h"
namespace tree_sitter {
namespace build_tables {
using rules::Symbol;
using std::set;
// Rule visitor that unions every CharacterSet it reaches into `result`.
// The `left` and `right` template flags select which operand(s) of a Seq rule
// are descended into; Choice, Repeat and Metadata rules are always traversed.
template <bool left, bool right>
class CharacterAggregator : public rules::RuleFn<void> {
 public:
  // Union of all character sets visited so far.
  rules::CharacterSet result;

 private:
  void apply_to(const rules::Seq *sequence) {
    if (left) {
      apply(sequence->left);
    }
    if (right) {
      apply(sequence->right);
    }
  }

  void apply_to(const rules::Choice *choice) {
    for (auto it = choice->elements.begin(); it != choice->elements.end(); ++it) {
      apply(*it);
    }
  }

  void apply_to(const rules::Repeat *repeat) {
    apply(repeat->content);
  }

  void apply_to(const rules::Metadata *metadata) {
    apply(metadata->rule);
  }

  void apply_to(const rules::CharacterSet *characters) {
    result.add_set(*characters);
  }
};
// Rule visitor that reports whether any CharacterSet it reaches intersects
// the set pointed to by `character_set`. The `left` and `right` template
// flags select which operand(s) of a Seq rule are descended into; traversal
// stops at the first intersecting set.
template <bool left, bool right>
class CharacterIntersector : public rules::RuleFn<bool> {
  bool apply_to(const rules::Seq *rule) {
    if (left && apply(rule->left)) return true;
    if (right && apply(rule->right)) return true;
    return false;
  }

  bool apply_to(const rules::Choice *rule) {
    for (const rule_ptr &element : rule->elements) {
      if (apply(element)) return true;
    }
    return false;
  }

  bool apply_to(const rules::Repeat *rule) {
    return apply(rule->content);
  }

  bool apply_to(const rules::Metadata *rule) {
    return apply(rule->rule);
  }

  bool apply_to(const rules::CharacterSet *rule) {
    return character_set->intersects(*rule);
  }

 public:
  // Set that visited character sets are tested against. Not owned; it must
  // outlive this visitor.
  rules::CharacterSet *character_set;

  // `explicit` prevents a CharacterSet pointer from silently converting into
  // a visitor object.
  explicit CharacterIntersector(rules::CharacterSet *characters)
    : character_set{characters} {}
};
// Aggregator configurations. The two template flags choose which operand(s)
// of each Seq rule are traversed: left only, right only, or both.
using FirstCharacters = CharacterAggregator<true, false>;
using LastCharacters = CharacterAggregator<false, true>;
using AllCharacters = CharacterAggregator<true, true>;
// Intersector that follows only the left operand of each Seq rule.
using FirstCharactersIntersector = CharacterIntersector<true, false>;
// Analyzes the tokens of `grammar` and returns:
//   * recovery_tokens: terminals whose first and last characters are both
//     bounded (not `includes_all`) and disjoint from the separator
//     characters, or which contain no separator characters at all;
//   * unmergeable_pairs: for each terminal, the other terminals whose rule
//     (following only the left operand of each Seq) reaches a character set
//     intersecting this terminal's first characters.
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
  CompatibleTokensResult result;

  // Every character that can appear anywhere in any separator rule.
  AllCharacters all_separator_characters;
  for (const rule_ptr &separator : grammar.separators)
    all_separator_characters.apply(separator);

  const size_t variable_count = grammar.variables.size();
  for (size_t i = 0; i < variable_count; i++) {
    Symbol symbol(i, Symbol::Terminal);
    // Bind by reference: copying the rule_ptr is not needed here.
    const rule_ptr &rule = grammar.variables[i].rule;

    FirstCharacters first_characters;
    first_characters.apply(rule);

    LastCharacters last_characters;
    last_characters.apply(rule);

    AllCharacters all_characters;
    all_characters.apply(rule);

    bool has_distinct_start =
      !first_characters.result.includes_all &&
      !first_characters.result.intersects(all_separator_characters.result);

    bool has_distinct_end =
      !last_characters.result.includes_all &&
      !last_characters.result.intersects(all_separator_characters.result);

    bool has_no_separators =
      !all_characters.result.intersects(all_separator_characters.result);

    if ((has_distinct_start && has_distinct_end) || has_no_separators)
      result.recovery_tokens.insert(symbol);

    // The intersector depends only on this token's first characters, so it is
    // built once per token rather than once per (token, other token) pair.
    FirstCharactersIntersector intersector(&first_characters.result);
    for (size_t j = 0; j < variable_count; j++) {
      if (j == i) continue;
      if (intersector.apply(grammar.variables[j].rule)) {
        result.unmergeable_pairs[symbol].insert(Symbol(j, Symbol::Terminal));
      }
    }
  }

  return result;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -0,0 +1,25 @@
#ifndef COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_
#define COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
#include <map>
#include <set>
namespace tree_sitter {
struct LexicalGrammar;
namespace build_tables {
// Token analysis results produced by get_compatible_tokens.
struct CompatibleTokensResult {
// Terminals whose first and last characters are distinct from the grammar's
// separator characters, or which contain no separator characters at all.
std::set<rules::Symbol> recovery_tokens;
// For each terminal, the other terminals whose leading characters intersect
// this terminal's first characters.
std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
};
// Computes the CompatibleTokensResult for the given lexical grammar.
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_COMPATIBLE_TOKENS_H_

View file

@ -1,89 +0,0 @@
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/metadata.h"
namespace tree_sitter {
namespace build_tables {
using rules::Symbol;
using std::set;
// Rule visitor that collects, in `result`, the union of every CharacterSet it
// reaches. The `left` / `right` template flags decide which operand(s) of a
// Seq rule get visited.
template <bool left, bool right>
class CharacterAggregator : public rules::RuleFn<void> {
 public:
  // Union of all character sets visited so far.
  rules::CharacterSet result;

 private:
  void apply_to(const rules::Seq *sequence) {
    if (left) {
      apply(sequence->left);
    }
    if (right) {
      apply(sequence->right);
    }
  }

  void apply_to(const rules::Choice *choice) {
    auto member = choice->elements.begin();
    while (member != choice->elements.end()) {
      apply(*member);
      ++member;
    }
  }

  void apply_to(const rules::Repeat *repeat) {
    apply(repeat->content);
  }

  void apply_to(const rules::Metadata *metadata) {
    apply(metadata->rule);
  }

  void apply_to(const rules::CharacterSet *characters) {
    result.add_set(*characters);
  }
};
// Aggregator configurations. The two template flags choose which operand(s)
// of each Seq rule are traversed: left only, right only, or both.
class FirstCharacters : public CharacterAggregator<true, false> {};
class LastCharacters : public CharacterAggregator<false, true> {};
class AllCharacters : public CharacterAggregator<true, true> {};
// Returns the terminals of `grammar` whose first and last characters are both
// bounded (not `includes_all`) and disjoint from the separator characters, or
// which contain no separator characters at all.
set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
  set<Symbol> result;

  // Every character that can appear anywhere in any separator rule.
  AllCharacters all_separator_characters;
  for (const rule_ptr &separator : grammar.separators)
    all_separator_characters.apply(separator);

  for (size_t i = 0; i < grammar.variables.size(); i++) {
    const Variable &variable = grammar.variables[i];

    FirstCharacters first_characters;
    first_characters.apply(variable.rule);

    LastCharacters last_characters;
    last_characters.apply(variable.rule);

    AllCharacters all_characters;
    all_characters.apply(variable.rule);

    bool has_distinct_start =
      !first_characters.result.includes_all &&
      !first_characters.result.intersects(all_separator_characters.result);

    bool has_distinct_end =
      !last_characters.result.includes_all &&
      !last_characters.result.intersects(all_separator_characters.result);

    bool has_no_separators =
      !all_characters.result.intersects(all_separator_characters.result);

    if ((has_distinct_start && has_distinct_end) || has_no_separators)
      result.insert(Symbol(i, Symbol::Terminal));
  }

  return result;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,19 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_RECOVERY_TOKENS_H_
#define COMPILER_BUILD_TABLES_RECOVERY_TOKENS_H_
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
#include <set>
namespace tree_sitter {
struct LexicalGrammar;
namespace build_tables {
// Returns the set of terminal symbols of the given lexical grammar that are
// treated as recovery tokens.
std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
}  // namespace build_tables
}  // namespace tree_sitter
#endif  // COMPILER_BUILD_TABLES_RECOVERY_TOKENS_H_

View file

@ -1,65 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#include <map>
#include <vector>
namespace tree_sitter {
namespace build_tables {
// Repeatedly removes states that `table->merge_state` reports as mergeable
// with an earlier state, rewriting state references as it goes.
//
// Returns a map from state index to the index that state ends up at once all
// removals are done. If no duplicates are ever found the map is empty.
template <typename TableType>
std::map<size_t, size_t> remove_duplicate_states(TableType *table) {
std::map<size_t, size_t> replacements;
// Each pass can expose new duplicates (references get rewritten), so loop
// until a pass finds none.
while (true) {
// Maps a duplicate state's index to the earlier state that replaces it.
std::map<size_t, size_t> duplicates;
for (size_t i = 0, size = table->states.size(); i < size; i++)
for (size_t j = 0; j < i; j++)
// Skip any j that is itself already marked as a duplicate in this pass.
if (!duplicates.count(j) && table->merge_state(j, i)) {
duplicates.insert({ i, j });
break;
}
if (duplicates.empty())
break;
// Index remapping for this pass only: current index -> index after removal.
std::map<size_t, size_t> new_replacements;
for (size_t i = 0, size = table->states.size(); i < size; i++) {
size_t new_state_index = i;
auto duplicate = duplicates.find(i);
if (duplicate != duplicates.end())
new_state_index = duplicate->second;
// Count the duplicates located before the target index; removing them
// shifts the target down by that amount. `duplicates` is ordered by key,
// so the scan can stop at the first key that is not smaller.
size_t prior_removed = 0;
for (const auto &duplicate : duplicates) {
if (duplicate.first >= new_state_index)
break;
prior_removed++;
}
new_state_index -= prior_removed;
new_replacements.insert({ i, new_state_index });
// insert() leaves an existing key untouched; mappings from earlier passes
// are instead redirected by the loop below.
replacements.insert({ i, new_state_index });
for (auto &replacement : replacements)
if (replacement.second == i)
replacement.second = new_state_index;
}
// Rewrite every state index referenced from within the remaining states.
for (auto &state : table->states)
state.each_referenced_state([&new_replacements](int64_t *state_index) {
auto new_replacement = new_replacements.find(*state_index);
if (new_replacement != new_replacements.end())
*state_index = new_replacement->second;
});
// Erase from the highest index down so the lower indices stay valid.
for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i)
table->states.erase(table->states.begin() + i->first);
}
return replacements;
}
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_

View file

@ -57,11 +57,6 @@ bool LexState::operator==(const LexState &other) const {
is_token_start == other.is_token_start;
}
// Invokes `fn` with a pointer to the target state index of every advance
// action, so the callback can read or rewrite it in place.
void LexState::each_referenced_state(function<void(LexStateId *)> fn) {
  for (auto action = advance_actions.begin(); action != advance_actions.end(); ++action) {
    fn(&action->second.state_index);
  }
}
LexStateId LexTable::add_state() {
states.push_back(LexState());
return states.size() - 1;
@ -71,8 +66,4 @@ LexState &LexTable::state(LexStateId id) {
return states[id];
}
bool LexTable::merge_state(size_t i, size_t j) {
return states[i] == states[j];
}
} // namespace tree_sitter

View file

@ -54,7 +54,6 @@ class LexState {
LexState();
std::set<rules::CharacterSet> expected_inputs() const;
bool operator==(const LexState &) const;
void each_referenced_state(std::function<void(LexStateId *)>);
std::map<rules::CharacterSet, AdvanceAction> advance_actions;
AcceptTokenAction accept_action;
@ -66,8 +65,6 @@ class LexTable {
LexStateId add_state();
LexState &state(LexStateId state_id);
std::vector<LexState> states;
bool merge_state(size_t i, size_t j);
};
} // namespace tree_sitter

View file

@ -201,58 +201,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id,
states[state_id].nonterminal_entries[lookahead] = next_state_id;
}
// Reports whether `state` holds, under any lookahead symbol, a terminal entry
// that compares equal to `entry`.
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
  for (auto it = state.terminal_entries.begin(); it != state.terminal_entries.end(); ++it) {
    if (it->second == entry) {
      return true;
    }
  }
  return false;
}
bool ParseTable::merge_state(size_t i, size_t j) {
ParseState &state = states[i];
ParseState &other = states[j];
if (state.nonterminal_entries != other.nonterminal_entries)
return false;
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(other, entry.second))
return false;
} else if (entry.second != other_entry->second) {
return false;
}
}
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(lookahead)) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}
} // namespace tree_sitter

View file

@ -93,7 +93,6 @@ class ParseTable {
ParseStateId add_state();
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
bool merge_state(size_t i, size_t j);
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;