Merge pull request #48 from tree-sitter/optimize-state-deduping
Optimize parse state deduping
This commit is contained in:
commit
4c60141b63
12 changed files with 243 additions and 70 deletions
|
|
@ -44,7 +44,7 @@ class LexTableBuilder {
|
|||
const LexicalGrammar lex_grammar;
|
||||
vector<rule_ptr> separator_rules;
|
||||
LexConflictManager conflict_manager;
|
||||
unordered_map<const LexItemSet, LexStateId, LexItemSet::Hash> lex_state_ids;
|
||||
unordered_map<LexItemSet, LexStateId> lex_state_ids;
|
||||
|
||||
public:
|
||||
LexTableBuilder(ParseTable *parse_table, const LexicalGrammar &lex_grammar)
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ class ParseTableBuilder {
|
|||
const LexicalGrammar lexical_grammar;
|
||||
ParseConflictManager conflict_manager;
|
||||
unordered_map<Symbol, ParseItemSet> recovery_states;
|
||||
unordered_map<ParseItemSet, ParseStateId, ParseItemSet::Hash> parse_state_ids;
|
||||
unordered_map<ParseItemSet, ParseStateId> parse_state_ids;
|
||||
vector<pair<ParseItemSet, ParseStateId>> item_sets_to_process;
|
||||
ParseTable parse_table;
|
||||
set<string> conflicts;
|
||||
|
|
@ -146,8 +146,8 @@ class ParseTableBuilder {
|
|||
auto pair = parse_state_ids.find(item_set);
|
||||
if (pair == parse_state_ids.end()) {
|
||||
ParseStateId state_id = parse_table.add_state();
|
||||
|
||||
parse_state_ids[item_set] = state_id;
|
||||
parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature();
|
||||
item_sets_to_process.push_back({ std::move(item_set), state_id });
|
||||
return state_id;
|
||||
} else {
|
||||
|
|
@ -172,7 +172,8 @@ class ParseTableBuilder {
|
|||
new_action->state_index = add_parse_state(next_item_set);
|
||||
}
|
||||
} else {
|
||||
parse_table.set_nonterminal_action(state_id, symbol.index, add_parse_state(next_item_set));
|
||||
ParseStateId next_state = add_parse_state(next_item_set);
|
||||
parse_table.set_nonterminal_action(state_id, symbol.index, next_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -250,7 +251,76 @@ class ParseTableBuilder {
|
|||
}
|
||||
|
||||
void remove_duplicate_parse_states() {
|
||||
remove_duplicate_states<ParseTable>(&parse_table);
|
||||
map<size_t, set<ParseStateId>> state_indices_by_signature;
|
||||
|
||||
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
|
||||
ParseState &state = parse_table.states[i];
|
||||
state_indices_by_signature[state.shift_actions_signature].insert(i);
|
||||
}
|
||||
|
||||
set<ParseStateId> deleted_states;
|
||||
|
||||
while (true) {
|
||||
std::map<ParseStateId, ParseStateId> state_replacements;
|
||||
|
||||
for (auto &pair : state_indices_by_signature) {
|
||||
auto &state_group = pair.second;
|
||||
|
||||
for (ParseStateId i : state_group) {
|
||||
for (ParseStateId j : state_group) {
|
||||
if (j == i) break;
|
||||
if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
|
||||
state_replacements.insert({ i, j });
|
||||
deleted_states.insert(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (state_replacements.empty()) break;
|
||||
|
||||
for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) {
|
||||
ParseState &state = parse_table.states[i];
|
||||
|
||||
if (state_replacements.count(i)) {
|
||||
state_indices_by_signature[state.shift_actions_signature].erase(i);
|
||||
} else {
|
||||
state.each_referenced_state([&state_replacements](int64_t *state_index) {
|
||||
auto replacement = state_replacements.find(*state_index);
|
||||
if (replacement != state_replacements.end()) {
|
||||
*state_index = replacement->second;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vector<ParseStateId> new_state_ids(parse_table.states.size());
|
||||
size_t deleted_state_count = 0;
|
||||
auto deleted_state_iter = deleted_states.begin();
|
||||
for (size_t i = 0; i < new_state_ids.size(); i++) {
|
||||
while (deleted_state_iter != deleted_states.end() && *deleted_state_iter < i) {
|
||||
deleted_state_count++;
|
||||
deleted_state_iter++;
|
||||
}
|
||||
new_state_ids[i] = i - deleted_state_count;
|
||||
}
|
||||
|
||||
ParseStateId original_state_index = 0;
|
||||
auto iter = parse_table.states.begin();
|
||||
while (iter != parse_table.states.end()) {
|
||||
if (deleted_states.count(original_state_index)) {
|
||||
iter = parse_table.states.erase(iter);
|
||||
} else {
|
||||
ParseState &state = *iter;
|
||||
state.each_referenced_state([&new_state_ids](int64_t *state_index) {
|
||||
*state_index = new_state_ids[*state_index];
|
||||
});
|
||||
++iter;
|
||||
}
|
||||
original_state_index++;
|
||||
}
|
||||
}
|
||||
|
||||
ParseAction *add_terminal_action(ParseStateId state_id, Symbol::Index lookahead,
|
||||
|
|
|
|||
|
|
@ -8,11 +8,11 @@
|
|||
#include "compiler/rules/symbol.h"
|
||||
#include "compiler/rules/repeat.h"
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace build_tables {
|
||||
|
||||
using std::hash;
|
||||
using std::map;
|
||||
using std::string;
|
||||
using std::unordered_set;
|
||||
|
|
@ -69,20 +69,9 @@ LexItem::CompletionStatus LexItem::completion_status() const {
|
|||
return GetCompletionStatus().apply(rule);
|
||||
}
|
||||
|
||||
size_t LexItem::Hash::operator()(const LexItem &item) const {
|
||||
return hash<Symbol>()(item.lhs) ^ hash<rule_ptr>()(item.rule);
|
||||
}
|
||||
|
||||
size_t LexItemSet::Hash::operator()(const LexItemSet &item_set) const {
|
||||
size_t result = hash<size_t>()(item_set.entries.size());
|
||||
for (const auto &item : item_set.entries)
|
||||
result ^= LexItem::Hash()(item);
|
||||
return result;
|
||||
}
|
||||
|
||||
LexItemSet::LexItemSet() {}
|
||||
|
||||
// Builds an item set that holds a copy of the given entries.
LexItemSet::LexItemSet(const unordered_set<LexItem> &entries)
    : entries(entries) {}
|
||||
|
||||
bool LexItemSet::operator==(const LexItemSet &other) const {
|
||||
|
|
@ -103,3 +92,27 @@ bool LexItemSet::Transition::operator==(const LexItemSet::Transition &other) con
|
|||
|
||||
} // namespace build_tables
|
||||
} // namespace tree_sitter
|
||||
|
||||
namespace std {
|
||||
|
||||
using tree_sitter::util::hash_combine;
|
||||
using tree_sitter::util::symmetric_hash_combine;
|
||||
using tree_sitter::build_tables::LexItem;
|
||||
using tree_sitter::build_tables::LexItemSet;
|
||||
|
||||
// Hashes a lex item from the index of its left-hand-side symbol and its rule.
size_t hash<LexItem>::operator()(const LexItem &item) const {
  size_t result = 0;
  hash_combine(&result, item.lhs.index);
  hash_combine(&result, item.rule);
  return result;
}
|
||||
|
||||
// Hashes a lex item set from its size and the hashes of its items.
//
// The items are folded in with the symmetric (XOR) combiner, so the result
// does not depend on the order in which the entries are visited.
size_t hash<LexItemSet>::operator()(const LexItemSet &item_set) const {
  size_t result = 0;
  hash_combine(&result, item_set.entries.size());
  for (const auto &item : item_set.entries)
    symmetric_hash_combine(&result, item);
  return result;
}
|
||||
|
||||
} // namespace std
|
||||
|
|
|
|||
|
|
@ -22,10 +22,6 @@ class LexItem {
|
|||
bool is_string;
|
||||
};
|
||||
|
||||
struct Hash {
|
||||
size_t operator()(const LexItem &) const;
|
||||
};
|
||||
|
||||
bool operator==(const LexItem &other) const;
|
||||
CompletionStatus completion_status() const;
|
||||
|
||||
|
|
@ -33,23 +29,34 @@ class LexItem {
|
|||
rule_ptr rule;
|
||||
};
|
||||
|
||||
} // namespace build_tables
|
||||
} // namespace tree_sitter
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
struct hash<tree_sitter::build_tables::LexItem> {
|
||||
size_t operator()(const tree_sitter::build_tables::LexItem &) const;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace build_tables {
|
||||
|
||||
class LexItemSet {
|
||||
public:
|
||||
LexItemSet();
|
||||
explicit LexItemSet(const std::unordered_set<LexItem, LexItem::Hash> &);
|
||||
explicit LexItemSet(const std::unordered_set<LexItem> &);
|
||||
|
||||
bool operator==(const LexItemSet &) const;
|
||||
|
||||
struct Hash {
|
||||
size_t operator()(const LexItemSet &) const;
|
||||
};
|
||||
|
||||
struct Transition;
|
||||
typedef std::map<rules::CharacterSet, Transition> TransitionMap;
|
||||
|
||||
TransitionMap transitions() const;
|
||||
|
||||
std::unordered_set<LexItem, LexItem::Hash> entries;
|
||||
std::unordered_set<LexItem> entries;
|
||||
};
|
||||
|
||||
struct LexItemSet::Transition {
|
||||
|
|
@ -63,4 +70,13 @@ struct LexItemSet::Transition {
|
|||
} // namespace build_tables
|
||||
} // namespace tree_sitter
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
struct hash<tree_sitter::build_tables::LexItemSet> {
|
||||
size_t operator()(const tree_sitter::build_tables::LexItemSet &) const;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
#endif // COMPILER_BUILD_TABLES_LEX_ITEM_H_
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#include <string>
|
||||
#include "compiler/syntax_grammar.h"
|
||||
#include "compiler/rules/built_in_symbols.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace build_tables {
|
||||
|
|
@ -10,8 +11,10 @@ using std::map;
|
|||
using std::pair;
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::hash;
|
||||
using rules::Symbol;
|
||||
using util::hash_combine;
|
||||
|
||||
ParseItem::ParseItem() : variable_index(-1), production(nullptr), step_index(0) {}
|
||||
|
||||
ParseItem::ParseItem(const Symbol &lhs, const Production &production,
|
||||
unsigned int step_index)
|
||||
|
|
@ -78,13 +81,6 @@ rules::Associativity ParseItem::associativity() const {
|
|||
return production->at(step_index).associativity;
|
||||
}
|
||||
|
||||
size_t ParseItem::Hash::operator()(const ParseItem &item) const {
|
||||
size_t result = hash<int>()(item.variable_index);
|
||||
result ^= hash<unsigned int>()(item.step_index);
|
||||
result ^= hash<const void *>()(static_cast<const void *>(item.production));
|
||||
return result;
|
||||
}
|
||||
|
||||
ParseItemSet::ParseItemSet() {}
|
||||
|
||||
ParseItemSet::ParseItemSet(const map<ParseItem, LookaheadSet> &entries)
|
||||
|
|
@ -94,16 +90,19 @@ bool ParseItemSet::operator==(const ParseItemSet &other) const {
|
|||
return entries == other.entries;
|
||||
}
|
||||
|
||||
size_t ParseItemSet::Hash::operator()(const ParseItemSet &item_set) const {
|
||||
size_t result = hash<size_t>()(item_set.entries.size());
|
||||
for (auto &pair : item_set.entries) {
|
||||
size_t ParseItemSet::unfinished_item_signature() const {
|
||||
size_t result = 0;
|
||||
ParseItem previous_item;
|
||||
for (auto &pair : entries) {
|
||||
const ParseItem &item = pair.first;
|
||||
result ^= ParseItem::Hash()(item);
|
||||
|
||||
const LookaheadSet &lookahead_set = pair.second;
|
||||
result ^= hash<size_t>()(lookahead_set.entries->size());
|
||||
for (Symbol::Index index : *pair.second.entries)
|
||||
result ^= hash<Symbol::Index>()(index);
|
||||
if (item.step_index < item.production->size()) {
|
||||
if (item.variable_index != previous_item.variable_index &&
|
||||
item.step_index != previous_item.step_index) {
|
||||
hash_combine(&result, item.variable_index);
|
||||
hash_combine(&result, item.step_index);
|
||||
previous_item = item;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
@ -135,3 +134,37 @@ void ParseItemSet::add(const ParseItemSet &other) {
|
|||
|
||||
} // namespace build_tables
|
||||
} // namespace tree_sitter
|
||||
|
||||
namespace std {
|
||||
|
||||
using tree_sitter::build_tables::ParseItem;
|
||||
using tree_sitter::build_tables::ParseItemSet;
|
||||
using tree_sitter::util::hash_combine;
|
||||
|
||||
template <>
|
||||
struct hash<ParseItem> {
|
||||
size_t operator()(const ParseItem &item) const {
|
||||
size_t result = 0;
|
||||
hash_combine(&result, item.variable_index);
|
||||
hash_combine(&result, item.step_index);
|
||||
hash_combine(&result, item.production);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
// Hashes a parse item set from its size and, for every entry in map order,
// the item, the size of its lookahead set and each lookahead symbol index.
size_t hash<ParseItemSet>::operator()(const ParseItemSet &item_set) const {
  size_t result = 0;
  hash_combine(&result, item_set.entries.size());
  for (auto &pair : item_set.entries) {
    const ParseItem &item = pair.first;
    const auto &lookahead_set = pair.second;

    hash_combine(&result, item);
    // `entries` is dereferenced without a null check, as in the original.
    hash_combine(&result, lookahead_set.entries->size());
    for (auto index : *lookahead_set.entries)
      hash_combine(&result, index);
  }
  return result;
}
|
||||
|
||||
} // namespace std
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ namespace build_tables {
|
|||
|
||||
class ParseItem {
|
||||
public:
|
||||
ParseItem();
|
||||
ParseItem(const rules::Symbol &, const Production &, unsigned int);
|
||||
|
||||
struct CompletionStatus {
|
||||
|
|
@ -22,10 +23,6 @@ class ParseItem {
|
|||
rules::Associativity associativity;
|
||||
};
|
||||
|
||||
struct Hash {
|
||||
size_t operator()(const ParseItem &) const;
|
||||
};
|
||||
|
||||
bool operator==(const ParseItem &other) const;
|
||||
bool operator<(const ParseItem &other) const;
|
||||
rules::Symbol lhs() const;
|
||||
|
|
@ -47,13 +44,10 @@ class ParseItemSet {
|
|||
typedef std::map<rules::Symbol, std::pair<ParseItemSet, PrecedenceRange>>
|
||||
TransitionMap;
|
||||
|
||||
struct Hash {
|
||||
size_t operator()(const ParseItemSet &) const;
|
||||
};
|
||||
|
||||
TransitionMap transitions() const;
|
||||
bool operator==(const ParseItemSet &) const;
|
||||
void add(const ParseItemSet &);
|
||||
size_t unfinished_item_signature() const;
|
||||
|
||||
std::map<ParseItem, LookaheadSet> entries;
|
||||
};
|
||||
|
|
@ -61,4 +55,15 @@ class ParseItemSet {
|
|||
} // namespace build_tables
|
||||
} // namespace tree_sitter
|
||||
|
||||
namespace std {
|
||||
|
||||
using tree_sitter::build_tables::ParseItemSet;
|
||||
|
||||
template <>
|
||||
struct hash<tree_sitter::build_tables::ParseItemSet> {
|
||||
size_t operator()(const ParseItemSet &item_set) const;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
#endif // COMPILER_BUILD_TABLES_PARSE_ITEM_H_
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ class ParseAction {
|
|||
rules::Symbol symbol;
|
||||
ParseStateId state_index;
|
||||
size_t consumed_symbol_count;
|
||||
|
||||
PrecedenceRange precedence_range;
|
||||
rules::Associativity associativity;
|
||||
const Production *production;
|
||||
|
|
@ -78,6 +79,7 @@ class ParseState {
|
|||
std::map<rules::Symbol::Index, ParseTableEntry> terminal_entries;
|
||||
std::map<rules::Symbol::Index, ParseStateId> nonterminal_entries;
|
||||
LexStateId lex_state_id;
|
||||
size_t shift_actions_signature;
|
||||
};
|
||||
|
||||
struct ParseTableSymbolMetadata {
|
||||
|
|
|
|||
|
|
@ -3,14 +3,15 @@
|
|||
#include <utility>
|
||||
#include <vector>
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
|
||||
using std::string;
|
||||
using std::hash;
|
||||
using std::set;
|
||||
using std::vector;
|
||||
using util::hash_combine;
|
||||
|
||||
static void add_range(set<uint32_t> *characters, uint32_t min, uint32_t max) {
|
||||
for (uint32_t c = min; c <= max; c++)
|
||||
|
|
@ -83,14 +84,14 @@ bool CharacterSet::operator<(const CharacterSet &other) const {
|
|||
}
|
||||
|
||||
// Hashes the character set from `includes_all`, then the size and contents of
// `included_chars`, then the size and contents of `excluded_chars`.
//
// The combiner is order-sensitive, so the included and excluded characters
// contribute differently even when they hold the same values.
size_t CharacterSet::hash_code() const {
  size_t result = 0;
  hash_combine(&result, includes_all);
  hash_combine(&result, included_chars.size());
  for (uint32_t c : included_chars)
    hash_combine(&result, c);
  hash_combine(&result, excluded_chars.size());
  for (uint32_t c : excluded_chars)
    hash_combine(&result, c);
  return result;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#include <string>
|
||||
#include <set>
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
|
|
@ -10,6 +11,7 @@ using std::string;
|
|||
using std::make_shared;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using util::symmetric_hash_combine;
|
||||
|
||||
Choice::Choice(const vector<rule_ptr> &elements) : elements(elements) {}
|
||||
|
||||
|
|
@ -50,9 +52,10 @@ bool Choice::operator==(const Rule &rule) const {
|
|||
}
|
||||
|
||||
// Hashes the choice from its element count and its elements.
//
// Everything is folded in with the symmetric (XOR) combiner, so the result
// does not depend on the order of `elements`.
size_t Choice::hash_code() const {
  size_t result = 0;
  symmetric_hash_combine(&result, elements.size());
  for (const auto &element : elements)
    symmetric_hash_combine(&result, element);
  return result;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,14 +3,15 @@
|
|||
#include <map>
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
|
||||
using std::hash;
|
||||
using std::make_shared;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
using util::hash_combine;
|
||||
|
||||
Metadata::Metadata(rule_ptr rule, map<MetadataKey, int> values)
|
||||
: rule(rule), value(values) {}
|
||||
|
|
@ -25,10 +26,11 @@ bool Metadata::operator==(const Rule &rule) const {
|
|||
}
|
||||
|
||||
size_t Metadata::hash_code() const {
|
||||
size_t result = hash<size_t>()(value.size());
|
||||
size_t result = 0;
|
||||
hash_combine(&result, value.size());
|
||||
for (auto &pair : value) {
|
||||
result ^= hash<int>()(pair.first);
|
||||
result ^= hash<int>()(pair.second);
|
||||
hash_combine<int>(&result, pair.first);
|
||||
hash_combine(&result, pair.second);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,13 +2,14 @@
|
|||
#include <string>
|
||||
#include <map>
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/util/hash_combine.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::hash;
|
||||
using util::hash_combine;
|
||||
|
||||
Symbol::Symbol(Symbol::Index index) : index(index), is_token(false) {}
|
||||
|
||||
|
|
@ -24,7 +25,10 @@ bool Symbol::operator==(const Rule &rule) const {
|
|||
}
|
||||
|
||||
// Hashes the symbol from its index and its `is_token` flag, in that order.
size_t Symbol::hash_code() const {
  size_t result = 0;
  hash_combine(&result, index);
  hash_combine(&result, is_token);
  return result;
}
|
||||
|
||||
rule_ptr Symbol::copy() const {
|
||||
|
|
|
|||
24
src/compiler/util/hash_combine.h
Normal file
24
src/compiler/util/hash_combine.h
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
#ifndef COMPILER_UTIL_HASH_COMBINE_H_
|
||||
#define COMPILER_UTIL_HASH_COMBINE_H_
|
||||
|
||||
#include <functional>
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace util {
|
||||
|
||||
// Folds the std::hash of `new_value` into `*seed`.
//
// The current seed takes part in the mix (shifted left and right), so the
// result depends on the order in which values are combined. This is the same
// mixing formula as Boost's `hash_combine`.
//
// seed:      accumulator to update in place; must not be null.
// new_value: value to fold in; `std::hash<T>` must be usable for it.
template <class T>
inline void hash_combine(std::size_t *seed, const T &new_value) {
  std::hash<T> hasher;
  *seed ^= hasher(new_value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}
|
||||
|
||||
// Folds the std::hash of `new_value` into `*seed` with a plain XOR.
//
// XOR is commutative, so the result does not depend on the order in which
// values are combined. This suits containers without a stable iteration
// order. Note that combining the same value twice cancels it out.
//
// seed:      accumulator to update in place; must not be null.
// new_value: value to fold in; `std::hash<T>` must be usable for it.
template <class T>
inline void symmetric_hash_combine(std::size_t *seed, const T &new_value) {
  std::hash<T> hasher;
  *seed ^= hasher(new_value);
}
|
||||
|
||||
} // namespace util
|
||||
} // namespace tree_sitter
|
||||
|
||||
#endif // COMPILER_UTIL_HASH_COMBINE_H_
|
||||
Loading…
Add table
Add a link
Reference in a new issue