Merge pull request #48 from tree-sitter/optimize-state-deduping

Optimize parse state deduping
This commit is contained in:
Max Brunsfeld 2016-11-16 12:02:16 -08:00 committed by GitHub
commit 4c60141b63
12 changed files with 243 additions and 70 deletions

View file

@ -44,7 +44,7 @@ class LexTableBuilder {
const LexicalGrammar lex_grammar;
vector<rule_ptr> separator_rules;
LexConflictManager conflict_manager;
unordered_map<const LexItemSet, LexStateId, LexItemSet::Hash> lex_state_ids;
unordered_map<LexItemSet, LexStateId> lex_state_ids;
public:
LexTableBuilder(ParseTable *parse_table, const LexicalGrammar &lex_grammar)

View file

@ -36,7 +36,7 @@ class ParseTableBuilder {
const LexicalGrammar lexical_grammar;
ParseConflictManager conflict_manager;
unordered_map<Symbol, ParseItemSet> recovery_states;
unordered_map<ParseItemSet, ParseStateId, ParseItemSet::Hash> parse_state_ids;
unordered_map<ParseItemSet, ParseStateId> parse_state_ids;
vector<pair<ParseItemSet, ParseStateId>> item_sets_to_process;
ParseTable parse_table;
set<string> conflicts;
@ -146,8 +146,8 @@ class ParseTableBuilder {
auto pair = parse_state_ids.find(item_set);
if (pair == parse_state_ids.end()) {
ParseStateId state_id = parse_table.add_state();
parse_state_ids[item_set] = state_id;
parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature();
item_sets_to_process.push_back({ std::move(item_set), state_id });
return state_id;
} else {
@ -172,7 +172,8 @@ class ParseTableBuilder {
new_action->state_index = add_parse_state(next_item_set);
}
} else {
parse_table.set_nonterminal_action(state_id, symbol.index, add_parse_state(next_item_set));
ParseStateId next_state = add_parse_state(next_item_set);
parse_table.set_nonterminal_action(state_id, symbol.index, next_state);
}
}
}
@ -250,7 +251,76 @@ class ParseTableBuilder {
}
// Folds together parse states that parse_table.merge_state() accepts as
// mergeable, then erases the folded-away states from parse_table.states and
// renumbers every remaining state reference.
// NOTE(review): this listing looks like a diff rendered without +/- markers,
// so the remove_duplicate_states<ParseTable>() call directly below may be the
// pre-change line rather than part of the new body — confirm against the file.
void remove_duplicate_parse_states() {
remove_duplicate_states<ParseTable>(&parse_table);
// Bucket state ids by shift_actions_signature; the merge loop below only ever
// compares states that sit in the same bucket.
map<size_t, set<ParseStateId>> state_indices_by_signature;
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
ParseState &state = parse_table.states[i];
state_indices_by_signature[state.shift_actions_signature].insert(i);
}
// Ids of every state folded into another one, accumulated across all passes.
set<ParseStateId> deleted_states;
// Repeat until a full pass records no replacement.
while (true) {
// Replacements found in this pass only: replaced id -> surviving id.
std::map<ParseStateId, ParseStateId> state_replacements;
for (auto &pair : state_indices_by_signature) {
auto &state_group = pair.second;
for (ParseStateId i : state_group) {
for (ParseStateId j : state_group) {
// Only candidates that precede i in the group's iteration order are tried.
if (j == i) break;
// Skip a j that was itself replaced earlier in this pass.
// merge_state(j, i) presumably reports whether i could be folded into j —
// its definition is not visible here.
if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
state_replacements.insert({ i, j });
deleted_states.insert(i);
break;
}
}
}
}
if (state_replacements.empty()) break;
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
ParseState &state = parse_table.states[i];
if (state_replacements.count(i)) {
// A replaced state leaves its bucket so later passes no longer consider it.
state_indices_by_signature[state.shift_actions_signature].erase(i);
} else {
// Redirect references that point at a replaced state to its survivor.
state.each_referenced_state([&state_replacements](int64_t *state_index) {
auto replacement = state_replacements.find(*state_index);
if (replacement != state_replacements.end()) {
*state_index = replacement->second;
}
});
}
}
}
// new_state_ids[i] = i minus the number of deleted ids strictly below i,
// i.e. the index state i will have once the deleted states are erased.
vector<ParseStateId> new_state_ids(parse_table.states.size());
size_t deleted_state_count = 0;
auto deleted_state_iter = deleted_states.begin();
for (size_t i = 0; i < new_state_ids.size(); i++) {
while (deleted_state_iter != deleted_states.end() && *deleted_state_iter < i) {
deleted_state_count++;
deleted_state_iter++;
}
new_state_ids[i] = i - deleted_state_count;
}
// Walk the states once: erase the deleted ones and renumber the references
// held by the survivors. original_state_index tracks the pre-erase index,
// because erase() shifts the positions of the elements that follow.
ParseStateId original_state_index = 0;
auto iter = parse_table.states.begin();
while (iter != parse_table.states.end()) {
if (deleted_states.count(original_state_index)) {
iter = parse_table.states.erase(iter);
} else {
ParseState &state = *iter;
state.each_referenced_state([&new_state_ids](int64_t *state_index) {
*state_index = new_state_ids[*state_index];
});
++iter;
}
original_state_index++;
}
}
ParseAction *add_terminal_action(ParseStateId state_id, Symbol::Index lookahead,

View file

@ -8,11 +8,11 @@
#include "compiler/rules/symbol.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/visitor.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace build_tables {
using std::hash;
using std::map;
using std::string;
using std::unordered_set;
@ -69,20 +69,9 @@ LexItem::CompletionStatus LexItem::completion_status() const {
return GetCompletionStatus().apply(rule);
}
// Hash of a lex item: XOR of the hash of its left-hand-side symbol and the
// hash of its rule pointer.
size_t LexItem::Hash::operator()(const LexItem &item) const {
  const size_t lhs_hash = hash<Symbol>()(item.lhs);
  const size_t rule_hash = hash<rule_ptr>()(item.rule);
  return lhs_hash ^ rule_hash;
}
// Hash of a lex item set: starts from the hash of the entry count and XORs in
// the hash of every entry, so the result does not depend on iteration order.
size_t LexItemSet::Hash::operator()(const LexItemSet &item_set) const {
  const auto &items = item_set.entries;
  size_t digest = hash<size_t>()(items.size());
  for (auto it = items.begin(); it != items.end(); ++it) {
    digest ^= LexItem::Hash()(*it);
  }
  return digest;
}
LexItemSet::LexItemSet() {}
LexItemSet::LexItemSet(const unordered_set<LexItem, LexItem::Hash> &entries)
LexItemSet::LexItemSet(const unordered_set<LexItem> &entries)
: entries(entries) {}
bool LexItemSet::operator==(const LexItemSet &other) const {
@ -103,3 +92,27 @@ bool LexItemSet::Transition::operator==(const LexItemSet::Transition &other) con
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::util::hash_combine;
using tree_sitter::util::symmetric_hash_combine;
using tree_sitter::build_tables::LexItem;
using tree_sitter::build_tables::LexItemSet;
// std::hash for LexItem: feeds the left-hand-side symbol index and then the
// rule into hash_combine, in that order, starting from a zero seed.
size_t hash<LexItem>::operator()(const LexItem &item) const {
  size_t seed = 0;
  const auto &lhs_index = item.lhs.index;
  const auto &rule = item.rule;
  hash_combine(&seed, lhs_index);
  hash_combine(&seed, rule);
  return seed;
}
// std::hash for LexItemSet: mixes in the entry count with hash_combine, then
// folds every entry in with symmetric_hash_combine.
size_t hash<LexItemSet>::operator()(const LexItemSet &item_set) const {
  const auto &items = item_set.entries;
  size_t seed = 0;
  hash_combine(&seed, items.size());
  for (auto it = items.begin(), last = items.end(); it != last; ++it) {
    symmetric_hash_combine(&seed, *it);
  }
  return seed;
}
} // namespace std

View file

@ -22,10 +22,6 @@ class LexItem {
bool is_string;
};
struct Hash {
size_t operator()(const LexItem &) const;
};
bool operator==(const LexItem &other) const;
CompletionStatus completion_status() const;
@ -33,23 +29,34 @@ class LexItem {
rule_ptr rule;
};
} // namespace build_tables
} // namespace tree_sitter
namespace std {
// std::hash specialization for LexItem, so containers such as
// std::unordered_set<LexItem> work without an explicit hasher argument.
// Only the call operator is declared here; its definition lives out of line.
template <>
struct hash<tree_sitter::build_tables::LexItem> {
size_t operator()(const tree_sitter::build_tables::LexItem &) const;
};
} // namespace std
namespace tree_sitter {
namespace build_tables {
class LexItemSet {
public:
LexItemSet();
explicit LexItemSet(const std::unordered_set<LexItem, LexItem::Hash> &);
explicit LexItemSet(const std::unordered_set<LexItem> &);
bool operator==(const LexItemSet &) const;
struct Hash {
size_t operator()(const LexItemSet &) const;
};
struct Transition;
typedef std::map<rules::CharacterSet, Transition> TransitionMap;
TransitionMap transitions() const;
std::unordered_set<LexItem, LexItem::Hash> entries;
std::unordered_set<LexItem> entries;
};
struct LexItemSet::Transition {
@ -63,4 +70,13 @@ struct LexItemSet::Transition {
} // namespace build_tables
} // namespace tree_sitter
namespace std {
// std::hash specialization for LexItemSet, so it can be used as an
// unordered-container key without an explicit hasher argument.
// Only the call operator is declared here; its definition lives out of line.
template <>
struct hash<tree_sitter::build_tables::LexItemSet> {
size_t operator()(const tree_sitter::build_tables::LexItemSet &) const;
};
} // namespace std
#endif // COMPILER_BUILD_TABLES_LEX_ITEM_H_

View file

@ -2,6 +2,7 @@
#include <string>
#include "compiler/syntax_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace build_tables {
@ -10,8 +11,10 @@ using std::map;
using std::pair;
using std::string;
using std::to_string;
using std::hash;
using rules::Symbol;
using util::hash_combine;
ParseItem::ParseItem() : variable_index(-1), production(nullptr), step_index(0) {}
ParseItem::ParseItem(const Symbol &lhs, const Production &production,
unsigned int step_index)
@ -78,13 +81,6 @@ rules::Associativity ParseItem::associativity() const {
return production->at(step_index).associativity;
}
// Hash of a parse item: XOR of the hashes of its variable index, its step
// index and the address of its production.
size_t ParseItem::Hash::operator()(const ParseItem &item) const {
  const size_t variable_hash = hash<int>()(item.variable_index);
  const size_t step_hash = hash<unsigned int>()(item.step_index);
  const size_t production_hash =
    hash<const void *>()(static_cast<const void *>(item.production));
  return variable_hash ^ step_hash ^ production_hash;
}
ParseItemSet::ParseItemSet() {}
ParseItemSet::ParseItemSet(const map<ParseItem, LookaheadSet> &entries)
@ -94,16 +90,19 @@ bool ParseItemSet::operator==(const ParseItemSet &other) const {
return entries == other.entries;
}
size_t ParseItemSet::Hash::operator()(const ParseItemSet &item_set) const {
size_t result = hash<size_t>()(item_set.entries.size());
for (auto &pair : item_set.entries) {
// Builds a size_t signature from the unfinished items of this set (those with
// step_index < production->size()), feeding variable_index and step_index of
// the selected items into hash_combine.
// NOTE(review): this hunk looks like a diff rendered without +/- markers; the
// `result ^= ...` lines and the inner lookahead loop below appear to be
// leftovers of the removed ParseItemSet::Hash body — confirm against the file.
size_t ParseItemSet::unfinished_item_signature() const {
size_t result = 0;
// Last item that contributed to the signature; starts default-constructed.
ParseItem previous_item;
for (auto &pair : entries) {
const ParseItem &item = pair.first;
result ^= ParseItem::Hash()(item);
const LookaheadSet &lookahead_set = pair.second;
result ^= hash<size_t>()(lookahead_set.entries->size());
for (Symbol::Index index : *pair.second.entries)
result ^= hash<Symbol::Index>()(index);
if (item.step_index < item.production->size()) {
// An item contributes only when BOTH its variable_index and its step_index
// differ from the last contributing item.
// NOTE(review): if the aim is merely to skip repeats of the same
// (variable_index, step_index) pair, `||` may have been intended — confirm.
if (item.variable_index != previous_item.variable_index &&
item.step_index != previous_item.step_index) {
hash_combine(&result, item.variable_index);
hash_combine(&result, item.step_index);
previous_item = item;
}
}
}
return result;
}
@ -135,3 +134,37 @@ void ParseItemSet::add(const ParseItemSet &other) {
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::build_tables::ParseItem;
using tree_sitter::build_tables::ParseItemSet;
using tree_sitter::util::hash_combine;
// std::hash specialization for ParseItem: combines, in order, the variable
// index, the step index and the production pointer, starting from a zero seed.
template <>
struct hash<ParseItem> {
  size_t operator()(const ParseItem &item) const {
    size_t seed = 0;
    const auto &variable = item.variable_index;
    const auto &step = item.step_index;
    const auto &production = item.production;
    hash_combine(&seed, variable);
    hash_combine(&seed, step);
    hash_combine(&seed, production);
    return seed;
  }
};
// std::hash for ParseItemSet: combines the entry count, then for every entry
// (in the map's iteration order) the item, the size of its lookahead set and
// each lookahead index.
size_t hash<ParseItemSet>::operator()(const ParseItemSet &item_set) const {
  size_t seed = 0;
  hash_combine(&seed, item_set.entries.size());
  for (auto entry = item_set.entries.begin(); entry != item_set.entries.end(); ++entry) {
    hash_combine(&seed, entry->first);
    const auto &lookaheads = entry->second;
    hash_combine(&seed, lookaheads.entries->size());
    for (auto symbol_index : *lookaheads.entries) {
      hash_combine(&seed, symbol_index);
    }
  }
  return seed;
}
} // namespace std

View file

@ -14,6 +14,7 @@ namespace build_tables {
class ParseItem {
public:
ParseItem();
ParseItem(const rules::Symbol &, const Production &, unsigned int);
struct CompletionStatus {
@ -22,10 +23,6 @@ class ParseItem {
rules::Associativity associativity;
};
struct Hash {
size_t operator()(const ParseItem &) const;
};
bool operator==(const ParseItem &other) const;
bool operator<(const ParseItem &other) const;
rules::Symbol lhs() const;
@ -47,13 +44,10 @@ class ParseItemSet {
typedef std::map<rules::Symbol, std::pair<ParseItemSet, PrecedenceRange>>
TransitionMap;
struct Hash {
size_t operator()(const ParseItemSet &) const;
};
TransitionMap transitions() const;
bool operator==(const ParseItemSet &) const;
void add(const ParseItemSet &);
size_t unfinished_item_signature() const;
std::map<ParseItem, LookaheadSet> entries;
};
@ -61,4 +55,15 @@ class ParseItemSet {
} // namespace build_tables
} // namespace tree_sitter
namespace std {
using tree_sitter::build_tables::ParseItemSet;
// std::hash specialization for ParseItemSet, so it can be used as an
// unordered-container key without an explicit hasher argument.
// Only the call operator is declared here; its definition lives out of line.
template <>
struct hash<tree_sitter::build_tables::ParseItemSet> {
size_t operator()(const ParseItemSet &item_set) const;
};
} // namespace std
#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_H_

View file

@ -47,6 +47,7 @@ class ParseAction {
rules::Symbol symbol;
ParseStateId state_index;
size_t consumed_symbol_count;
PrecedenceRange precedence_range;
rules::Associativity associativity;
const Production *production;
@ -78,6 +79,7 @@ class ParseState {
std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
LexStateId lex_state_id;
size_t shift_actions_signature;
};
struct ParseTableSymbolMetadata {

View file

@ -3,14 +3,15 @@
#include <utility>
#include <vector>
#include "compiler/rules/visitor.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace rules {
using std::string;
using std::hash;
using std::set;
using std::vector;
using util::hash_combine;
static void add_range(set<uint32_t> *characters, uint32_t min, uint32_t max) {
for (uint32_t c = min; c <= max; c++)
@ -83,14 +84,14 @@ bool CharacterSet::operator<(const CharacterSet &other) const {
}
// Hash of this character set, built from includes_all, the size and members of
// included_chars, and the size and members of excluded_chars.
// NOTE(review): this hunk looks like a diff rendered without +/- markers — an
// XOR-based body and a hash_combine-based body both appear (note the two
// `size_t result` declarations); presumably only the second one is current.
size_t CharacterSet::hash_code() const {
size_t result = hash<bool>()(includes_all);
result ^= hash<size_t>()(included_chars.size());
for (auto &c : included_chars)
result ^= hash<uint32_t>()(c);
result <<= 1;
result ^= hash<size_t>()(excluded_chars.size());
for (auto &c : excluded_chars)
result ^= hash<uint32_t>()(c);
// hash_combine-based variant: same inputs, fed in the same order.
size_t result = 0;
hash_combine(&result, includes_all);
hash_combine(&result, included_chars.size());
for (uint32_t c : included_chars)
hash_combine(&result, c);
hash_combine(&result, excluded_chars.size());
for (uint32_t c : excluded_chars)
hash_combine(&result, c);
return result;
}

View file

@ -2,6 +2,7 @@
#include <string>
#include <set>
#include "compiler/rules/visitor.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace rules {
@ -10,6 +11,7 @@ using std::string;
using std::make_shared;
using std::vector;
using std::set;
using util::symmetric_hash_combine;
Choice::Choice(const vector<rule_ptr> &elements) : elements(elements) {}
@ -50,9 +52,10 @@ bool Choice::operator==(const Rule &rule) const {
}
// Hash of a choice rule, built from the number of elements and from each
// element.
// NOTE(review): this hunk looks like a diff rendered without +/- markers — the
// `std::hash<size_t>` initialisation and the `result ^= element->hash_code()`
// line appear to be the pre-change code; confirm against the actual file.
size_t Choice::hash_code() const {
size_t result = std::hash<size_t>()(elements.size());
size_t result = 0;
symmetric_hash_combine(&result, elements.size());
for (const auto &element : elements)
result ^= element->hash_code();
symmetric_hash_combine(&result, element);
return result;
}

View file

@ -3,14 +3,15 @@
#include <map>
#include "compiler/rules/visitor.h"
#include "compiler/rules/blank.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace rules {
using std::hash;
using std::make_shared;
using std::map;
using std::pair;
using util::hash_combine;
Metadata::Metadata(rule_ptr rule, map<MetadataKey, int> values)
: rule(rule), value(values) {}
@ -25,10 +26,11 @@ bool Metadata::operator==(const Rule &rule) const {
}
// Hash of a metadata rule, built from the size of `value` and from every
// key/value pair in it. The wrapped `rule` is not referenced in this body.
// NOTE(review): this hunk looks like a diff rendered without +/- markers — the
// `hash<size_t>` initialisation and the two `result ^= hash<int>` lines appear
// to be the pre-change code; confirm against the actual file.
size_t Metadata::hash_code() const {
size_t result = hash<size_t>()(value.size());
size_t result = 0;
hash_combine(&result, value.size());
for (auto &pair : value) {
result ^= hash<int>()(pair.first);
result ^= hash<int>()(pair.second);
// The key is hashed as an int via the explicit template argument.
hash_combine<int>(&result, pair.first);
hash_combine(&result, pair.second);
}
return result;
}

View file

@ -2,13 +2,14 @@
#include <string>
#include <map>
#include "compiler/rules/visitor.h"
#include "compiler/util/hash_combine.h"
namespace tree_sitter {
namespace rules {
using std::string;
using std::to_string;
using std::hash;
using util::hash_combine;
Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {}
@ -24,7 +25,10 @@ bool Symbol::operator==(const Rule &rule) const {
}
// Hash of a symbol, built from its index and its is_token flag.
// NOTE(review): this hunk looks like a diff rendered without +/- markers — the
// single-line XOR `return` appears to be the pre-change code, which would make
// the hash_combine lines after it unreachable as shown; confirm against the
// actual file.
size_t Symbol::hash_code() const {
return hash<Symbol::Index>()(index) ^ hash<bool>()(is_token);
size_t result = 0;
hash_combine(&result, index);
hash_combine(&result, is_token);
return result;
}
rule_ptr Symbol::copy() const {

View file

@ -0,0 +1,24 @@
#ifndef COMPILER_UTIL_HASH_COMBINE_H_
#define COMPILER_UTIL_HASH_COMBINE_H_
#include <functional>
namespace tree_sitter {
namespace util {
// Mixes the std::hash of `new_value` into `*seed`. The result depends on the
// previous seed, so combining the same values in a different order generally
// yields a different seed.
template <class T>
inline void hash_combine(std::size_t *seed, const T &new_value) {
  const std::size_t previous = *seed;
  const std::size_t value_hash = std::hash<T>()(new_value);
  const std::size_t mixed = value_hash + 0x9e3779b9 + (previous << 6) + (previous >> 2);
  *seed = previous ^ mixed;
}
// XORs the std::hash of `new_value` into `*seed`. Because XOR is commutative,
// the final seed does not depend on the order in which values are combined.
template <class T>
inline void symmetric_hash_combine(std::size_t *seed, const T &new_value) {
  const std::size_t value_hash = std::hash<T>()(new_value);
  *seed = *seed ^ value_hash;
}
} // namespace util
} // namespace tree_sitter
#endif // COMPILER_UTIL_HASH_COMBINE_H_