Consolidate the logic for detecting conflicting tokens

This commit is contained in:
Max Brunsfeld 2018-03-28 10:02:57 -07:00
parent a8bc67ac42
commit 186f70649c
10 changed files with 139 additions and 256 deletions

View file

@ -13,7 +13,6 @@
'sources': [
'src/compiler/build_tables/lex_item.cc',
'src/compiler/build_tables/lex_item_transitions.cc',
'src/compiler/build_tables/lex_conflict_manager.cc',
'src/compiler/build_tables/lex_table_builder.cc',
'src/compiler/build_tables/lookahead_set.cc',
'src/compiler/build_tables/parse_item.cc',

View file

@ -1,53 +0,0 @@
#include "compiler/build_tables/lex_conflict_manager.h"
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/rule.h"
#include "compiler/build_tables/lex_item.h"
namespace tree_sitter {
namespace build_tables {
// Resolve a conflict between an advance action (continue lexing toward a
// longer match) and an accept action already recorded for
// `old_action.symbol`. Returns true if advancing should win.
bool LexConflictManager::resolve(const LexItemSet &item_set,
const AdvanceAction &new_action,
const AcceptTokenAction &old_action) {
// Advancing wins when the best precedence still reachable by continuing
// is at least as high as the precedence of the token being accepted.
if (new_action.precedence_range.max >= old_action.precedence) {
// Each in-progress token may match a longer string that begins with a
// match of `old_action.symbol`; record it as a possible extension.
for (const LexItem &item : item_set.entries) {
possible_extensions[old_action.symbol.index].insert(item.lhs.index);
}
return true;
} else {
// The accepted token wins; record it as a possible homonym of each
// token that was still in progress.
for (const LexItem &item : item_set.entries) {
possible_homonyms[item.lhs.index].insert(old_action.symbol.index);
}
return false;
}
}
// Resolve a conflict between two accept actions in the same lex state.
// Returns true if `new_action` should replace `old_action`.
bool LexConflictManager::resolve(const AcceptTokenAction &new_action,
const AcceptTokenAction &old_action) {
bool result;
// Tie-breaking order: higher precedence wins; then string-based tokens
// (is_string) beat non-string tokens; then the token declared earlier in
// the grammar (lower symbol index) wins.
if (new_action.precedence > old_action.precedence) {
result = true;
} else if (new_action.precedence < old_action.precedence) {
result = false;
} else if (new_action.is_string && !old_action.is_string) {
result = true;
} else if (old_action.is_string && !new_action.is_string) {
result = false;
} else if (new_action.symbol.index < old_action.symbol.index) {
result = true;
} else {
result = false;
}
// Either way the two tokens matched the same string here: record the
// winner as a possible homonym of the loser.
if (result) {
possible_homonyms[old_action.symbol.index].insert(new_action.symbol.index);
} else {
possible_homonyms[new_action.symbol.index].insert(old_action.symbol.index);
}
return result;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,31 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#include <map>
#include <set>
#include "compiler/lexical_grammar.h"
#include "compiler/rule.h"
namespace tree_sitter {
struct AdvanceAction;
struct AcceptTokenAction;
namespace build_tables {
class LexItemSet;
// Records how lexical conflicts between tokens were resolved while building
// the lex table, so later passes can see which tokens can match the same
// string (homonyms) or a longer string (extensions).
class LexConflictManager {
public:
// Advance-vs-accept conflict; returns true if advancing should win.
bool resolve(const LexItemSet &, const AdvanceAction &,
const AcceptTokenAction &);
// Accept-vs-accept conflict; returns true if the new action should win.
bool resolve(const AcceptTokenAction &, const AcceptTokenAction &);
// token index -> indices of tokens that can match the same string.
std::map<rules::Symbol::Index, std::set<rules::Symbol::Index>> possible_homonyms;
// token index -> indices of tokens that can match a longer string
// beginning with a match of this token.
std::map<rules::Symbol::Index, std::set<rules::Symbol::Index>> possible_extensions;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_

View file

@ -13,6 +13,7 @@ using std::string;
using std::unordered_set;
using rules::CharacterSet;
using rules::Symbol;
using rules::Metadata;
LexItem::LexItem(const rules::Symbol &lhs, const rules::Rule &rule)
: lhs(lhs), rule(rule) {}
@ -82,6 +83,19 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
return entries == other.entries;
}
// True if this item's rule is a Metadata-wrapped rule whose params mark it
// as *not* part of a main token — i.e. the item is currently lexing
// separator content (presumably whitespace/extras; confirm against grammar).
bool LexItem::is_in_separators() const {
if (!rule.is<Metadata>()) return false;
auto &metadata = rule.get_unchecked<Metadata>();
return !metadata.params.is_main_token;
}
// True if any item in this set is currently within a separator rule.
bool LexItemSet::has_items_in_separators() const {
for (const LexItem &item : entries) {
if (item.is_in_separators()) return true;
}
return false;
}
LexItemSet::TransitionMap LexItemSet::transitions() const {
TransitionMap result;
for (const LexItem &item : entries) {

View file

@ -22,6 +22,7 @@ class LexItem {
bool operator==(const LexItem &other) const;
CompletionStatus completion_status() const;
bool is_in_separators() const;
rules::Symbol lhs;
rules::Rule rule;
@ -47,12 +48,12 @@ class LexItemSet {
LexItemSet();
explicit LexItemSet(const std::unordered_set<LexItem> &);
bool operator==(const LexItemSet &) const;
struct Transition;
typedef std::map<rules::CharacterSet, Transition> TransitionMap;
bool operator==(const LexItemSet &) const;
TransitionMap transitions() const;
bool has_items_in_separators() const;
std::unordered_set<LexItem> entries;
};

View file

@ -7,7 +7,6 @@
#include <utility>
#include <cwctype>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/parse_table.h"
@ -69,15 +68,13 @@ class LexTableBuilderImpl : public LexTableBuilder {
LexTable keyword_lex_table;
const LexicalGrammar grammar;
vector<Rule> separator_rules;
LexConflictManager conflict_manager;
unordered_map<LexItemSet, LexStateId> main_lex_state_ids;
unordered_map<LexItemSet, LexStateId> keyword_lex_state_ids;
CharacterSet separator_start_characters;
vector<CharacterSet> starting_characters_by_token;
vector<CharacterSet> following_characters_by_token;
vector<set<Symbol>> shadowed_tokens_by_token;
const vector<LookaheadSet> &coincident_tokens_by_token;
vector<bool> conflict_status_by_token;
vector<ConflictStatus> conflict_matrix;
bool conflict_detection_mode;
LookaheadSet keyword_symbols;
Symbol keyword_capture_token;
@ -90,8 +87,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
: grammar(lexical_grammar),
starting_characters_by_token(lexical_grammar.variables.size()),
following_characters_by_token(lexical_grammar.variables.size()),
shadowed_tokens_by_token(lexical_grammar.variables.size()),
coincident_tokens_by_token(coincident_tokens),
conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch),
conflict_detection_mode(false),
keyword_capture_token(rules::NONE()) {
@ -165,8 +162,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
Symbol::terminal(i),
Symbol::terminal(j)
}), true));
if (conflict_status_by_token[i]) shadowed_tokens_by_token[j].insert(Symbol::terminal(i));
if (conflict_status_by_token[j]) shadowed_tokens_by_token[i].insert(Symbol::terminal(j));
}
}
}
@ -176,7 +171,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
Symbol symbol = Symbol::terminal(i);
bool matches_all_keywords = true;
keyword_symbols.for_each([&](Symbol keyword_symbol) {
if (!conflict_manager.possible_homonyms[symbol.index].count(keyword_symbol.index)) {
if (!(get_conflict_status(symbol, keyword_symbol) & MatchesSameString)) {
matches_all_keywords = false;
return false;
}
@ -193,9 +188,11 @@ class LexTableBuilderImpl : public LexTableBuilder {
// Don't use a token to capture keywords if it conflicts with other tokens
// that occur in the same state as a keyword.
bool shadows_other_tokens = false;
for (auto shadowed_token : shadowed_tokens_by_token[i]) {
if (!keyword_symbols.contains(shadowed_token) &&
keyword_symbols.intersects(coincident_tokens_by_token[shadowed_token.index])) {
for (Symbol::Index j = 0; j < n; j++) {
Symbol other_symbol = Symbol::terminal(j);
if ((get_conflict_status(other_symbol, symbol) & CannotDistinguish) &&
!keyword_symbols.contains(other_symbol) &&
keyword_symbols.intersects(coincident_tokens_by_token[j])) {
shadows_other_tokens = true;
break;
}
@ -254,11 +251,21 @@ class LexTableBuilderImpl : public LexTableBuilder {
return {main_lex_table, keyword_lex_table, keyword_capture_token};
}
const set<Symbol> &get_incompatible_tokens(Symbol::Index index) const {
return shadowed_tokens_by_token[index];
// Look up the recorded conflict status describing how `other_token` can
// interfere with `shadowed_token`. Only user-defined terminal tokens are
// tracked; built-in or non-terminal symbols report DoesNotMatch.
ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const {
if (shadowed_token.is_built_in() ||
other_token.is_built_in() ||
!shadowed_token.is_terminal() ||
!other_token.is_terminal()) return DoesNotMatch;
// Conflict matrix is row-major: one row per shadowed token.
unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index;
return conflict_matrix[index];
}
private:
// OR an additional conflict status into the matrix entry for this ordered
// (shadowed_token, other_token) pair.
void record_conflict(Symbol shadowed_token, Symbol other_token, ConflictStatus status) {
unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index;
conflict_matrix[index] = static_cast<ConflictStatus>(conflict_matrix[index] | status);
}
LexStateId add_lex_state(LexTable &lex_table, const LexItemSet &item_set) {
auto &lex_state_ids = &lex_table == &main_lex_table ?
main_lex_state_ids :
@ -284,27 +291,27 @@ class LexTableBuilderImpl : public LexTableBuilder {
AdvanceAction action(-1, transition.precedence, transition.in_main_token);
AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
if (accept_action.is_present()) {
bool prefer_advancing = conflict_manager.resolve(
transition.destination,
action,
accept_action
);
bool prefer_advancing = action.precedence_range.max >= accept_action.precedence;
if (conflict_detection_mode) {
bool next_item_set_can_yield_this_token = false;
for (const LexItem &item : transition.destination.entries) {
if (item.lhs == accept_action.symbol) {
next_item_set_can_yield_this_token = true;
} else if (!prefer_advancing && !transition.in_main_token) {
conflict_status_by_token[item.lhs.index] = true;
} else if (!prefer_advancing && item_set.has_items_in_separators()) {
record_conflict(item.lhs, accept_action.symbol, MatchesShorterStringWithinSeparators);
}
}
if (prefer_advancing &&
!next_item_set_can_yield_this_token &&
(characters.intersects(following_characters_by_token[accept_action.symbol.index]) ||
characters.intersects(separator_start_characters))) {
conflict_status_by_token[accept_action.symbol.index] = true;
if (prefer_advancing && !next_item_set_can_yield_this_token) {
auto advance_symbol = transition.destination.entries.begin()->lhs;
if (characters.intersects(following_characters_by_token[accept_action.symbol.index]) ||
characters.intersects(separator_start_characters)) {
record_conflict(accept_action.symbol, advance_symbol, MatchesLongerStringWithValidNextChar);
} else {
record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString);
}
return;
}
}
@ -325,10 +332,10 @@ class LexTableBuilderImpl : public LexTableBuilder {
grammar.variables[item.lhs.index].is_string);
AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
if (existing_action.is_present()) {
if (conflict_manager.resolve(action, existing_action)) {
conflict_status_by_token[existing_action.symbol.index] = true;
if (should_replace_accept_action(existing_action, action)) {
record_conflict(existing_action.symbol, action.symbol, MatchesSameString);
} else {
conflict_status_by_token[action.symbol.index] = true;
record_conflict(action.symbol, existing_action.symbol, MatchesSameString);
continue;
}
}
@ -340,26 +347,16 @@ class LexTableBuilderImpl : public LexTableBuilder {
void mark_fragile_tokens(ParseTable *parse_table) {
for (ParseState &state : parse_table->states) {
for (auto &entry : state.terminal_entries) {
Symbol symbol = entry.first;
if (symbol.is_terminal()) {
auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
if (homonyms != conflict_manager.possible_homonyms.end())
for (Symbol::Index homonym : homonyms->second)
if (state.terminal_entries.count(Symbol::terminal(homonym))) {
entry.second.reusable = false;
break;
}
if (!entry.second.reusable)
continue;
auto extensions = conflict_manager.possible_extensions.find(symbol.index);
if (extensions != conflict_manager.possible_extensions.end())
for (Symbol::Index extension : extensions->second)
if (state.terminal_entries.count(Symbol::terminal(extension))) {
entry.second.depends_on_lookahead = true;
break;
}
Symbol token = entry.first;
if (token.is_external() || token.is_built_in()) continue;
for (Symbol::Index i = 0; i < grammar.variables.size(); i++) {
Symbol other_token = Symbol::terminal(i);
ConflictStatus status = get_conflict_status(token, other_token);
if (status != ConflictStatus::DoesNotMatch &&
state.terminal_entries.count(other_token)) {
entry.second.reusable = false;
break;
}
}
}
}
@ -370,25 +367,16 @@ class LexTableBuilderImpl : public LexTableBuilder {
left->for_each_difference(right, [&](bool in_left, Symbol different_symbol) {
if (!different_symbol.is_external() && !different_symbol.is_built_in()) {
if (in_left) {
right.for_each([&](Symbol right_symbol) {
if (shadowed_tokens_by_token[different_symbol.index].count(right_symbol) ||
!coincident_tokens_by_token[different_symbol.index].contains(right_symbol)) {
is_compatible = false;
return;
}
});
if (!is_compatible) return false;
} else {
left->for_each([&](Symbol left_symbol) {
if (shadowed_tokens_by_token[different_symbol.index].count(left_symbol) ||
!coincident_tokens_by_token[different_symbol.index].contains(left_symbol)) {
is_compatible = false;
return;
}
});
if (!is_compatible) return false;
}
const LookaheadSet &existing_set = in_left ? right : *left;
existing_set.for_each([&](Symbol existing_symbol) {
if ((get_conflict_status(existing_symbol, different_symbol) & CannotDistinguish) ||
!coincident_tokens_by_token[different_symbol.index].contains(existing_symbol)) {
is_compatible = false;
return false;
}
return true;
});
if (!is_compatible) return false;
}
return true;
@ -508,10 +496,22 @@ class LexTableBuilderImpl : public LexTableBuilder {
);
}
// Decide whether `new_action` should replace an existing accept action in a
// lex state. Higher precedence wins; on a tie, string-based tokens beat
// non-string tokens; on a further tie, the token declared earlier in the
// grammar (lower symbol index) wins.
bool should_replace_accept_action(const AcceptTokenAction &old_action,
const AcceptTokenAction &new_action) {
if (new_action.precedence > old_action.precedence) return true;
if (new_action.precedence < old_action.precedence) return false;
if (new_action.is_string && !old_action.is_string) return true;
if (old_action.is_string && !new_action.is_string) return false;
return new_action.symbol.index < old_action.symbol.index;
}
// Reset the per-build state so the main lex table can be rebuilt from
// scratch. (NOTE(review): diff context — conflict_status_by_token is the
// pre-commit field; the new code replaces it with conflict_matrix.)
void clear() {
main_lex_table.states.clear();
main_lex_state_ids.clear();
conflict_status_by_token = vector<bool>(grammar.variables.size(), false);
}
// Convenience accessor: the grammar name of the variable for `symbol`.
const string &token_name(rules::Symbol &symbol) {
return grammar.variables[symbol.index].name;
}
};
@ -531,8 +531,8 @@ LexTableBuilder::BuildResult LexTableBuilder::build(ParseTable *parse_table) {
return static_cast<LexTableBuilderImpl *>(this)->build(parse_table);
}
const set<Symbol> &LexTableBuilder::get_incompatible_tokens(Symbol::Index token) const {
return static_cast<const LexTableBuilderImpl *>(this)->get_incompatible_tokens(token);
ConflictStatus LexTableBuilder::get_conflict_status(Symbol a, Symbol b) const {
return static_cast<const LexTableBuilderImpl *>(this)->get_conflict_status(a, b);
}
} // namespace build_tables

View file

@ -17,6 +17,19 @@ namespace build_tables {
class LookaheadSet;
// Bit flags describing how one token can conflict with another during
// lexing; values are OR-ed together into the builder's conflict matrix.
enum ConflictStatus {
DoesNotMatch = 0,
// The other token can match a shorter prefix when separators are involved.
MatchesShorterStringWithinSeparators = 1 << 0,
// Both tokens can match exactly the same string.
MatchesSameString = 1 << 1,
// The other token can match a longer string starting with this token.
MatchesLongerString = 1 << 2,
// As above, and the extra characters are valid continuations (they appear
// in this token's following characters or start a separator).
MatchesLongerStringWithValidNextChar = 1 << 3,
// Statuses that make two tokens impossible to tell apart in one lex state.
CannotDistinguish = (
MatchesShorterStringWithinSeparators |
MatchesSameString |
MatchesLongerStringWithValidNextChar
),
};
class LexTableBuilder {
public:
static std::unique_ptr<LexTableBuilder> create(const SyntaxGrammar &,
@ -31,7 +44,8 @@ class LexTableBuilder {
};
BuildResult build(ParseTable *);
const std::set<rules::Symbol> &get_incompatible_tokens(rules::Symbol::Index) const;
ConflictStatus get_conflict_status(rules::Symbol, rules::Symbol) const;
protected:
LexTableBuilder() = default;

View file

@ -145,21 +145,44 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
}
void build_error_parse_state(ParseStateId state_id) {
unsigned CannotMerge = (
MatchesShorterStringWithinSeparators |
MatchesLongerStringWithValidNextChar
);
// Add all the tokens that have no conflict with other tokens.
LookaheadSet non_conflicting_tokens;
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
const LexicalVariable &variable = lexical_grammar.variables[i];
bool exclude_from_recovery_state = false;
for (Symbol incompatible_token : lex_table_builder->get_incompatible_tokens(i)) {
if (!coincident_tokens_by_token[i].contains(incompatible_token) &&
((lexical_grammar.variables[incompatible_token.index].is_string && !variable.is_string) ||
!lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token))) {
exclude_from_recovery_state = true;
bool conflicts_with_other_tokens = false;
for (unsigned j = 0; j < lexical_grammar.variables.size(); j++) {
Symbol other_token = Symbol::terminal(j);
if (j != i &&
!coincident_tokens_by_token[token.index].contains(other_token) &&
(lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) {
conflicts_with_other_tokens = true;
break;
}
}
if (!exclude_from_recovery_state) {
parse_table.add_terminal_action(state_id, Symbol::terminal(i), ParseAction::Recover());
if (!conflicts_with_other_tokens) non_conflicting_tokens.insert(token);
}
LookaheadSet tokens;
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
bool conflicts_with_other_tokens = false;
if (!non_conflicting_tokens.contains(token)) {
non_conflicting_tokens.for_each([&](Symbol other_token) {
if (!coincident_tokens_by_token[token.index].contains(other_token) &&
(lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) {
conflicts_with_other_tokens = true;
return false;
}
return true;
});
}
if (!conflicts_with_other_tokens) {
parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
}
}
@ -492,8 +515,10 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
// Do not add a token if it conflicts with an existing token.
if (!new_token.is_built_in()) {
for (Symbol incompatible_token : lex_table_builder->get_incompatible_tokens(new_token.index)) {
if (state.terminal_entries.count(incompatible_token)) return false;
for (const auto &entry : state.terminal_entries) {
if (lex_table_builder->get_conflict_status(entry.first, new_token) & CannotDistinguish) {
return false;
}
}
}

View file

@ -1,85 +0,0 @@
#include "test_helper.h"
#include "helpers/stream_methods.h"
#include "compiler/rule.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
using namespace rules;
using namespace build_tables;
// Spec for LexConflictManager::resolve — covers advance-vs-accept and
// accept-vs-accept conflict resolution (deleted by this commit along with
// the LexConflictManager class itself).
START_TEST
describe("LexConflictManager::resolve(new_action, old_action)", []() {
LexConflictManager conflict_manager;
bool update;
// Symbols ordered by grammar declaration order (index 0..3).
Symbol sym1 = Symbol::terminal(0);
Symbol sym2 = Symbol::terminal(1);
Symbol sym3 = Symbol::terminal(2);
Symbol sym4 = Symbol::terminal(3);
LexItemSet item_set({ LexItem(sym4, Blank{} )});
// Fresh manager per example so recorded homonyms/extensions don't leak
// between tests.
before_each([&]() {
conflict_manager = LexConflictManager();
});
it("favors advance actions over empty accept token actions", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction());
AssertThat(update, IsTrue());
});
// Two accept actions competing for the same lex state.
describe("accept-token/accept-token conflicts", [&]() {
describe("when the tokens' precedence values differ", [&]() {
it("favors the token with higher precedence", [&]() {
update = conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym1, 2, false), AcceptTokenAction(sym2, 1, false));
AssertThat(update, IsTrue());
});
it("adds the preferred token as a possible homonym for the discarded one", [&]() {
conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false));
AssertThat(conflict_manager.possible_homonyms[sym2.index], Contains(sym1.index));
});
});
describe("when one token is string-based and the other is regexp-based", [&]() {
it("favors the string-based token", [&]() {
update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, true));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, true), AcceptTokenAction(sym1, 0, false));
AssertThat(update, IsTrue());
});
});
describe("when the tokens have equal precedence", [&]() {
it("favors the token listed earlier in the grammar", [&]() {
update = conflict_manager.resolve(AcceptTokenAction(sym2, 0, false), AcceptTokenAction(sym1, 0, false));
AssertThat(update, IsFalse());
update = conflict_manager.resolve(AcceptTokenAction(sym1, 0, false), AcceptTokenAction(sym2, 0, false));
AssertThat(update, IsTrue());
});
});
});
// An advance action competing with an existing accept action.
describe("advance/accept-token conflicts", [&]() {
describe("when the token to accept has higher precedence", [&]() {
it("prefers the accept-token action", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(update, IsFalse());
});
});
describe("when the token to accept does not have a higher precedence", [&]() {
it("favors the advance action", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
AssertThat(update, IsTrue());
});
});
});
});
END_TEST

View file

@ -37,7 +37,6 @@
'externals/crypto-algorithms',
],
'sources': [
'test/compiler/build_tables/lex_conflict_manager_test.cc',
'test/compiler/build_tables/lex_item_test.cc',
'test/compiler/build_tables/parse_item_set_builder_test.cc',
'test/compiler/build_tables/rule_can_be_blank_test.cc',