Merge compatible starting token states before constructing lex table

This commit is contained in:
Max Brunsfeld 2017-09-01 14:22:50 -07:00
parent 9d668c5004
commit 4c9c05806a
5 changed files with 219 additions and 141 deletions

View file

@@ -34,12 +34,13 @@ using rules::Symbol;
using rules::Metadata;
using rules::Seq;
class StartingCharacterAggregator {
template <bool is_start>
class StartOrEndCharacterAggregator {
public:
void apply(const Rule &rule) {
rule.match(
[this](const Seq &sequence) {
apply(*sequence.left);
apply(is_start ? *sequence.left : *sequence.right);
},
[this](const rules::Choice &rule) {
@@ -48,20 +49,9 @@ class StartingCharacterAggregator {
}
},
[this](const rules::Repeat &rule) {
apply(*rule.rule);
},
[this](const rules::Metadata &rule) {
apply(*rule.rule);
},
[this](const rules::CharacterSet &rule) {
result.add_set(rule);
},
[this](const rules::Blank) {},
[this](const rules::Repeat &rule) { apply(*rule.rule); },
[this](const rules::Metadata &rule) { apply(*rule.rule); },
[this](const rules::CharacterSet &rule) { result.add_set(rule); },
[](auto) {}
);
}
@@ -69,26 +59,37 @@ class StartingCharacterAggregator {
CharacterSet result;
};
using StartingCharacterAggregator = StartOrEndCharacterAggregator<true>;
using EndingCharacterAggregator = StartOrEndCharacterAggregator<false>;
class LexTableBuilderImpl : public LexTableBuilder {
LexTable lex_table;
const LexicalGrammar grammar;
vector<Rule> separator_rules;
LexConflictManager conflict_manager;
unordered_map<LexItemSet, LexStateId> lex_state_ids;
map<Symbol::Index, CharacterSet> following_characters_by_token_index;
vector<set<Symbol>> incompatible_tokens_by_token_index;
CharacterSet separator_start_characters;
CharacterSet current_conflict_detection_following_characters;
Symbol::Index current_conflict_detection_token_index;
bool current_conflict_value;
vector<CharacterSet> starting_characters_by_token;
vector<CharacterSet> following_characters_by_token;
vector<set<Symbol>> shadowed_tokens_by_token;
const vector<LookaheadSet> &coincident_tokens_by_token;
vector<bool> conflict_status_by_token;
bool conflict_detection_mode;
public:
LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar,
const LexicalGrammar &lexical_grammar,
const vector<set<Symbol::Index>> &following_tokens_by_token_index) :
grammar(lexical_grammar),
incompatible_tokens_by_token_index(lexical_grammar.variables.size()) {
const vector<LookaheadSet> &following_tokens_by_token,
const vector<LookaheadSet> &coincident_tokens)
: grammar(lexical_grammar),
starting_characters_by_token(lexical_grammar.variables.size()),
following_characters_by_token(lexical_grammar.variables.size()),
shadowed_tokens_by_token(lexical_grammar.variables.size()),
coincident_tokens_by_token(coincident_tokens),
conflict_detection_mode(false) {
// Compute the possible separator rules and the set of separator characters that can occur
// immediately after any token.
StartingCharacterAggregator separator_character_aggregator;
for (const auto &rule : grammar.separators) {
separator_rules.push_back(Repeat{rule});
@@ -96,34 +97,84 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
separator_rules.push_back(Blank{});
separator_start_characters = separator_character_aggregator.result;
clear();
// Compute the set of characters that each token can start with and the set of non-separator
// characters that can follow each token.
for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) {
Symbol token = Symbol::terminal(i);
auto &incompatible_indices = incompatible_tokens_by_token_index[i];
StartingCharacterAggregator starting_character_aggregator;
starting_character_aggregator.apply(grammar.variables[i].rule);
starting_characters_by_token[i] = starting_character_aggregator.result;
for (unsigned j = 0; j < n; j++) {
if (i == j) continue;
if (detect_conflict(i, j, following_tokens_by_token_index)) {
incompatible_indices.insert(Symbol::terminal(j));
}
StartingCharacterAggregator following_character_aggregator;
following_tokens_by_token[i].for_each([&](Symbol following_token) {
following_character_aggregator.apply(grammar.variables[following_token.index].rule);
});
// TODO - Refactor this. In general, a keyword token cannot be followed immediately by
// another alphanumeric character. But this requirement is currently not expressed anywhere in
// the grammar. So without this hack, we would be overly conservative about merging parse
// states because we would often consider `identifier` tokens to *conflict* with keyword
// tokens.
if (is_keyword(grammar.variables[i])) {
following_character_aggregator.result
.exclude('a', 'z')
.exclude('A', 'Z')
.exclude('0', '9')
.exclude('_')
.exclude('$');
}
for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
if (external_token.corresponding_internal_token == token) {
for (unsigned j = 0; j < syntax_grammar.external_tokens.size(); j++) {
incompatible_indices.insert(Symbol::external(j));
}
following_characters_by_token[i] = following_character_aggregator.result;
}
// For each pair of tokens, generate a lex table for just those two tokens and record what
// conflicts arise.
conflict_detection_mode = true;
for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
for (Symbol::Index j = 0; j < i; j++) {
if (starting_characters_by_token[i].intersects(starting_characters_by_token[j]) ||
starting_characters_by_token[i].intersects(separator_start_characters) ||
starting_characters_by_token[j].intersects(separator_start_characters)) {
clear();
add_lex_state(item_set_for_terminals(LookaheadSet({
Symbol::terminal(i),
Symbol::terminal(j)
})));
if (conflict_status_by_token[i]) shadowed_tokens_by_token[j].insert(Symbol::terminal(i));
if (conflict_status_by_token[j]) shadowed_tokens_by_token[i].insert(Symbol::terminal(j));
}
}
}
}
LexTable build(ParseTable *parse_table) {
clear();
conflict_detection_mode = false;
vector<pair<LookaheadSet, vector<ParseState *>>> starting_token_sets;
for (ParseState &parse_state : parse_table->states) {
parse_state.lex_state_id = add_lex_state(
item_set_for_terminals(parse_state.terminal_entries)
);
LookaheadSet token_set;
for (auto &entry : parse_state.terminal_entries) {
token_set.insert(entry.first);
}
bool did_merge = false;
for (auto &pair : starting_token_sets) {
if (merge_token_set(&pair.first, token_set)) {
did_merge = true;
pair.second.push_back(&parse_state);
break;
}
}
if (!did_merge) starting_token_sets.push_back({token_set, {&parse_state}});
}
for (auto &pair : starting_token_sets) {
LexStateId state_id = add_lex_state(item_set_for_terminals(pair.first));
for (ParseState *parse_state : pair.second) {
parse_state->lex_state_id = state_id;
}
}
mark_fragile_tokens(parse_table);
remove_duplicate_lex_states(parse_table);
@@ -131,64 +182,17 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
const set<Symbol> &get_incompatible_tokens(Symbol::Index index) const {
return incompatible_tokens_by_token_index[index];
}
bool detect_conflict(Symbol::Index left, Symbol::Index right,
const vector<set<Symbol::Index>> &following_tokens_by_token_index) {
StartingCharacterAggregator left_starting_characters;
StartingCharacterAggregator right_starting_characters;
left_starting_characters.apply(grammar.variables[left].rule);
right_starting_characters.apply(grammar.variables[right].rule);
if (!left_starting_characters.result.intersects(right_starting_characters.result) &&
!left_starting_characters.result.intersects(separator_start_characters) &&
!right_starting_characters.result.intersects(separator_start_characters)) {
return false;
}
auto following_characters_entry = following_characters_by_token_index.find(right);
if (following_characters_entry == following_characters_by_token_index.end()) {
StartingCharacterAggregator aggregator;
for (auto following_token_index : following_tokens_by_token_index[right]) {
aggregator.apply(grammar.variables[following_token_index].rule);
}
following_characters_entry =
following_characters_by_token_index.insert({right, aggregator.result}).first;
// TODO - Refactor this. In general, a keyword token cannot be followed immediately by
// another alphanumeric character. But this requirement is currently not expressed anywhere in
// the grammar. So without this hack, we would be overly conservative about merging parse
// states because we would often consider `identifier` tokens to *conflict* with keyword
// tokens.
if (is_keyword(grammar.variables[right])) {
following_characters_entry->second
.exclude('a', 'z')
.exclude('A', 'Z')
.exclude('0', '9')
.exclude('_')
.exclude('$');
}
}
current_conflict_detection_token_index = right;
current_conflict_detection_following_characters = following_characters_entry->second;
add_lex_state(item_set_for_terminals({{Symbol::terminal(left), {}}, {Symbol::terminal(right), {}}}));
bool result = current_conflict_value;
clear();
return result;
return shadowed_tokens_by_token[index];
}
private:
bool is_keyword(const LexicalVariable &variable) {
return variable.is_string && iswalpha(get_last_character(variable.rule));
}
static uint32_t get_last_character(const Rule &rule) {
return rule.match(
[](const Seq &sequence) { return get_last_character(*sequence.right); },
[](const rules::CharacterSet &rule) { return *rule.included_chars.begin(); },
[](const rules::Metadata &rule) { return get_last_character(*rule.rule); },
[](auto) { return 0; }
);
EndingCharacterAggregator aggregator;
aggregator.apply(variable.rule);
return
!aggregator.result.includes_all &&
aggregator.result.included_chars.size() == 1 &&
iswalpha(*aggregator.result.included_chars.begin());
}
LexStateId add_lex_state(const LexItemSet &item_set) {
@@ -208,11 +212,9 @@ class LexTableBuilderImpl : public LexTableBuilder {
void clear() {
lex_table.states.clear();
lex_state_ids.clear();
current_conflict_detection_following_characters = CharacterSet();
current_conflict_value = false;
conflict_status_by_token = vector<bool>(grammar.variables.size(), false);
}
private:
void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
for (const auto &pair : item_set.transitions()) {
const CharacterSet &characters = pair.first;
@@ -221,23 +223,28 @@ class LexTableBuilderImpl : public LexTableBuilder {
AdvanceAction action(-1, transition.precedence, transition.in_main_token);
AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
if (accept_action.is_present()) {
bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action);
bool can_advance_for_accepted_token = false;
for (const LexItem &item : transition.destination.entries) {
if (item.lhs == accept_action.symbol) {
can_advance_for_accepted_token = true;
} else if (item.lhs.index == current_conflict_detection_token_index &&
!prefer_advancing && !transition.in_main_token) {
current_conflict_value = true;
}
}
bool prefer_advancing = conflict_manager.resolve(
transition.destination,
action,
accept_action
);
if (accept_action.symbol.index == current_conflict_detection_token_index &&
!can_advance_for_accepted_token &&
(characters.intersects(separator_start_characters) ||
(characters.intersects(current_conflict_detection_following_characters) &&
grammar.variables[accept_action.symbol.index].is_string))) {
current_conflict_value = true;
if (conflict_detection_mode) {
bool next_item_set_can_yield_this_token = false;
for (const LexItem &item : transition.destination.entries) {
if (item.lhs == accept_action.symbol) {
next_item_set_can_yield_this_token = true;
} else if (!prefer_advancing && !transition.in_main_token) {
conflict_status_by_token[item.lhs.index] = true;
}
}
if (prefer_advancing &&
!next_item_set_can_yield_this_token &&
(characters.intersects(following_characters_by_token[accept_action.symbol.index]) ||
characters.intersects(separator_start_characters))) {
conflict_status_by_token[accept_action.symbol.index] = true;
}
}
if (!prefer_advancing) continue;
@@ -256,10 +263,15 @@ class LexTableBuilderImpl : public LexTableBuilder {
item.lhs.is_built_in() ||
grammar.variables[item.lhs.index].is_string);
AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
if (!existing_action.is_present() ||
conflict_manager.resolve(action, existing_action)) {
lex_table.states[state_id].accept_action = action;
if (existing_action.is_present()) {
if (conflict_manager.resolve(action, existing_action)) {
conflict_status_by_token[existing_action.symbol.index] = true;
} else {
conflict_status_by_token[action.symbol.index] = true;
continue;
}
}
lex_table.states[state_id].accept_action = action;
}
}
}
@@ -292,6 +304,39 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
}
bool merge_token_set(LookaheadSet *left, const LookaheadSet &right) const {
bool is_compatible = true;
left->for_each([&](Symbol left_symbol) {
if (left_symbol.is_terminal() && !left_symbol.is_built_in() && !right.contains(left_symbol)) {
right.for_each([&](Symbol right_symbol) {
if (shadowed_tokens_by_token[left_symbol.index].count(right_symbol) ||
!coincident_tokens_by_token[left_symbol.index].contains(right_symbol)) {
is_compatible = false;
return;
}
});
}
if (!is_compatible) return;
});
right.for_each([&](Symbol right_symbol) {
if (right_symbol.is_terminal() && !right_symbol.is_built_in() && !left->contains(right_symbol)) {
left->for_each([&](Symbol left_symbol) {
if (shadowed_tokens_by_token[right_symbol.index].count(left_symbol) ||
!coincident_tokens_by_token[right_symbol.index].contains(left_symbol)) {
is_compatible = false;
return;
}
});
}
if (!is_compatible) return;
});
if (is_compatible) left->insert_all(right);
return is_compatible;
}
void remove_duplicate_lex_states(ParseTable *parse_table) {
for (LexState &state : lex_table.states) {
state.accept_action.is_string = false;
@@ -359,10 +404,9 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
}
LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
LexItemSet item_set_for_terminals(const LookaheadSet &terminals) {
LexItemSet result;
for (const auto &pair : terminals) {
Symbol symbol = pair.first;
terminals.for_each([&](Symbol symbol) {
if (symbol.is_terminal()) {
for (const auto &rule : rules_for_symbol(symbol)) {
for (const auto &separator_rule : separator_rules) {
@@ -378,7 +422,7 @@
}
}
}
}
});
return result;
}
@@ -401,11 +445,13 @@
unique_ptr<LexTableBuilder> LexTableBuilder::create(const SyntaxGrammar &syntax_grammar,
const LexicalGrammar &lexical_grammar,
const vector<set<Symbol::Index>> &following_tokens) {
const vector<LookaheadSet> &following_tokens,
const vector<LookaheadSet> &coincident_tokens) {
return unique_ptr<LexTableBuilder>(new LexTableBuilderImpl(
syntax_grammar,
lexical_grammar,
following_tokens
following_tokens,
coincident_tokens
));
}

View file

@@ -14,11 +14,14 @@ struct LexicalGrammar;
namespace build_tables {
class LookaheadSet;
class LexTableBuilder {
public:
static std::unique_ptr<LexTableBuilder> create(const SyntaxGrammar &,
const LexicalGrammar &,
const std::vector<std::set<rules::Symbol::Index>> &);
const std::vector<LookaheadSet> &,
const std::vector<LookaheadSet> &);
LexTable build(ParseTable *);
const std::set<rules::Symbol> &get_incompatible_tokens(rules::Symbol::Index) const;

View file

@@ -52,7 +52,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
ParseItemSetBuilder item_set_builder;
unique_ptr<LexTableBuilder> lex_table_builder;
set<ParseAction> fragile_reductions;
vector<set<Symbol::Index>> following_tokens_by_token_index;
vector<LookaheadSet> following_tokens_by_token;
vector<LookaheadSet> coincident_tokens_by_token;
bool processing_recovery_states;
public:
@@ -60,8 +61,22 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
: grammar(syntax_grammar),
lexical_grammar(lexical_grammar),
item_set_builder(syntax_grammar, lexical_grammar),
following_tokens_by_token_index(lexical_grammar.variables.size()),
processing_recovery_states(false) {}
following_tokens_by_token(lexical_grammar.variables.size()),
coincident_tokens_by_token(lexical_grammar.variables.size()),
processing_recovery_states(false) {
for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
coincident_tokens_by_token[i].insert(rules::END_OF_INPUT());
if (lexical_grammar.variables[i].is_string) {
for (unsigned j = 0; j < i; j++) {
if (lexical_grammar.variables[j].is_string) {
coincident_tokens_by_token[i].insert(Symbol::terminal(j));
coincident_tokens_by_token[j].insert(Symbol::terminal(i));
}
}
}
}
}
tuple<ParseTable, LexTable, CompileError> build() {
// Ensure that the empty rename sequence has index 0.
@@ -90,7 +105,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
lex_table_builder = LexTableBuilder::create(
grammar,
lexical_grammar,
following_tokens_by_token_index
following_tokens_by_token,
coincident_tokens_by_token
);
processing_recovery_states = true;
@@ -130,17 +146,18 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
bool has_non_reciprocal_conflict = false;
const LexicalVariable &variable = lexical_grammar.variables[i];
bool exclude_from_recovery_state = false;
for (Symbol incompatible_token : lex_table_builder->get_incompatible_tokens(i)) {
if (incompatible_token.is_terminal() &&
!lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token)) {
has_non_reciprocal_conflict = true;
if (!coincident_tokens_by_token[i].contains(incompatible_token) &&
((lexical_grammar.variables[incompatible_token.index].is_string && !variable.is_string) ||
!lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token))) {
exclude_from_recovery_state = true;
break;
}
}
if (!has_non_reciprocal_conflict) {
if (!exclude_from_recovery_state) {
add_out_of_context_parse_state(&error_state, Symbol::terminal(i));
}
}
@@ -163,8 +180,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
parse_table.states[state_id] = error_state;
}
void add_out_of_context_parse_state(ParseState *error_state,
const rules::Symbol &symbol) {
void add_out_of_context_parse_state(ParseState *error_state, const rules::Symbol &symbol) {
const ParseItemSet &item_set = recovery_item_sets_by_lookahead[symbol];
if (!item_set.entries.empty()) {
ParseStateId state = add_parse_state({}, item_set);
@@ -300,6 +316,16 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
}
}
auto &terminals = state.terminal_entries;
for (auto iter = terminals.begin(), end = terminals.end(); iter != end; ++iter) {
if (iter->first.is_built_in() || iter->first.is_external()) continue;
for (auto other_iter = terminals.begin(); other_iter != iter; ++other_iter) {
if (other_iter->first.is_built_in() || other_iter->first.is_external()) continue;
coincident_tokens_by_token[iter->first.index].insert(other_iter->first);
coincident_tokens_by_token[other_iter->first.index].insert(iter->first);
}
}
return "";
}
@@ -767,7 +793,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
if (left_symbol.is_terminal() && !left_symbol.is_built_in()) {
right_tokens.for_each([&](Symbol right_symbol) {
if (right_symbol.is_terminal() && !right_symbol.is_built_in()) {
following_tokens_by_token_index[left_symbol.index].insert(right_symbol.index);
following_tokens_by_token[left_symbol.index].insert(right_symbol);
}
});
}

View file

@@ -16,6 +16,7 @@ struct AdvanceAction {
AdvanceAction();
AdvanceAction(size_t, PrecedenceRange, bool);
bool operator==(const AdvanceAction &other) const;
inline bool operator!=(const AdvanceAction &other) const { return !operator==(other); }
LexStateId state_index;
PrecedenceRange precedence_range;
@@ -26,7 +27,8 @@ struct AcceptTokenAction {
AcceptTokenAction();
AcceptTokenAction(rules::Symbol, int, bool);
bool is_present() const;
bool operator==(const AcceptTokenAction &action) const;
bool operator==(const AcceptTokenAction &other) const;
inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); }
rules::Symbol symbol;
int precedence;

View file

@@ -234,6 +234,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
bool found_external_token = false;
bool skipped_error = false;
bool error_mode = parse_state == ERROR_STATE;
int32_t first_error_character = 0;
Length error_start_position, error_end_position;
uint32_t last_byte_scanned = start_position.bytes;
@@ -260,8 +261,7 @@
self->lexer.token_end_position = self->lexer.current_position;
}
if (lex_mode.lex_state == ERROR_STATE &&
self->lexer.token_end_position.bytes <= current_position.bytes) {
if (error_mode && self->lexer.token_end_position.bytes <= current_position.bytes) {
LOG("disregard_empty_token");
} else {
found_external_token = true;
@@ -291,6 +291,7 @@
if (lex_mode.lex_state != self->language->lex_modes[ERROR_STATE].lex_state) {
LOG("retry_in_error_mode");
error_mode = true;
lex_mode = self->language->lex_modes[ERROR_STATE];
valid_external_tokens = ts_language_enabled_external_tokens(
self->language,