diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc
index ca9405a0..c5519555 100644
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@@ -413,26 +413,22 @@ class ParseTableBuilder {
 
     ParseState &state = parse_table.states[i];
     ParseState &other = parse_table.states[j];
 
-    if (state.nonterminal_entries != other.nonterminal_entries)
-      return false;
+    if (state.nonterminal_entries != other.nonterminal_entries) return false;
 
     for (auto &entry : state.terminal_entries) {
       Symbol lookahead = entry.first;
-      const vector<ParseAction> &actions = entry.second.actions;
-      auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index];
-
       const auto &other_entry = other.terminal_entries.find(lookahead);
+
       if (other_entry == other.terminal_entries.end()) {
+        if (entry.second.actions.back().type != ParseActionTypeReduce) return false;
+        if (!has_entry(other, entry.second)) return false;
+        if (lookahead.is_external()) return false;
         if (!lookahead.is_built_in()) {
-          for (const Symbol &incompatible_token : incompatible_tokens) {
+          for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) {
             if (other.terminal_entries.count(incompatible_token)) return false;
           }
         }
-        if (actions.back().type != ParseActionTypeReduce)
-          return false;
-        if (!has_entry(other, entry.second))
-          return false;
       } else if (entry.second != other_entry->second) {
         return false;
       }
@@ -442,26 +438,25 @@ class ParseTableBuilder {
 
     for (auto &entry : other.terminal_entries) {
       Symbol lookahead = entry.first;
-      const vector<ParseAction> &actions = entry.second.actions;
-      auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index];
 
       if (!state.terminal_entries.count(lookahead)) {
+        if (entry.second.actions.back().type != ParseActionTypeReduce) return false;
+        if (!has_entry(state, entry.second)) return false;
+        if (lookahead.is_external()) return false;
         if (!lookahead.is_built_in()) {
-          for (const Symbol &incompatible_token : incompatible_tokens) {
+          for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) {
             if (state.terminal_entries.count(incompatible_token)) return false;
           }
         }
 
-        if (actions.back().type != ParseActionTypeReduce)
-          return false;
-        if (!has_entry(state, entry.second))
-          return false;
+        symbols_to_merge.insert(lookahead);
       }
     }
 
-    for (const Symbol &lookahead : symbols_to_merge)
+    for (const Symbol &lookahead : symbols_to_merge) {
       state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
+    }
 
     return true;
   }
diff --git a/src/compiler/build_tables/lex_conflict_manager.cc b/src/compiler/build_tables/lex_conflict_manager.cc
index 82b5efaf..ccac60ff 100644
--- a/src/compiler/build_tables/lex_conflict_manager.cc
+++ b/src/compiler/build_tables/lex_conflict_manager.cc
@@ -16,6 +16,9 @@ bool LexConflictManager::resolve(const LexItemSet &item_set,
     }
     return true;
   } else {
+    for (const LexItem &item : item_set.entries) {
+      possible_homonyms[item.lhs.index].insert(old_action.symbol.index);
+    }
     return false;
   }
 }
diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index f9068d42..4101f854 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -70,7 +70,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
   LexTable lex_table;
   const LexicalGrammar grammar;
   vector<rules::Rule> separator_rules;
-  CharacterSet first_separator_characters;
+  CharacterSet separator_start_characters;
+  CharacterSet token_start_characters;
   LexConflictManager conflict_manager;
   unordered_map lex_state_ids;
@@ -78,13 +79,26 @@ class LexTableBuilderImpl : public LexTableBuilder {
   vector<bool> shadowed_token_indices;
 
   LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) {
-    StartingCharacterAggregator starting_character_aggregator;
+    StartingCharacterAggregator separator_character_aggregator;
     for (const auto &rule : grammar.separators) {
       separator_rules.push_back(Repeat{rule});
-      starting_character_aggregator.apply(rule);
+      separator_character_aggregator.apply(rule);
     }
     separator_rules.push_back(Blank{});
-    first_separator_characters = starting_character_aggregator.result;
+    separator_start_characters = separator_character_aggregator.result;
+
+    StartingCharacterAggregator token_start_character_aggregator;
+    for (const auto &variable : grammar.variables) {
+      token_start_character_aggregator.apply(variable.rule);
+    }
+    token_start_characters = token_start_character_aggregator.result;
+    token_start_characters
+      .exclude('a', 'z')
+      .exclude('A', 'Z')
+      .exclude('0', '9')
+      .exclude('_')
+      .exclude('$');
+    shadowed_token_indices.resize(grammar.variables.size());
   }
@@ -100,24 +114,21 @@ class LexTableBuilderImpl : public LexTableBuilder {
   }
 
   bool detect_conflict(Symbol::Index left, Symbol::Index right) {
-    clear();
+    StartingCharacterAggregator left_starting_characters;
+    StartingCharacterAggregator right_starting_characters;
+    left_starting_characters.apply(grammar.variables[left].rule);
+    right_starting_characters.apply(grammar.variables[right].rule);
+    if (!left_starting_characters.result.intersects(right_starting_characters.result) &&
+        !left_starting_characters.result.intersects(separator_start_characters) &&
+        !right_starting_characters.result.intersects(separator_start_characters)) {
+      return false;
+    }
+    clear();
 
     map terminals;
     terminals[Symbol::terminal(left)];
     terminals[Symbol::terminal(right)];
-
-    if (grammar.variables[left].is_string && grammar.variables[right].is_string) {
-      StartingCharacterAggregator left_starting_characters;
-      left_starting_characters.apply(grammar.variables[left].rule);
-      StartingCharacterAggregator right_starting_characters;
-      right_starting_characters.apply(grammar.variables[right].rule);
-      if (!(left_starting_characters.result == right_starting_characters.result)) {
-        return false;
-      }
-    }
-
     add_lex_state(item_set_for_terminals(terminals));
-
     return shadowed_token_indices[right];
   }
@@ -148,25 +159,27 @@ class LexTableBuilderImpl : public LexTableBuilder {
       const LexItemSet::Transition &transition = pair.second;
       AdvanceAction action(-1, transition.precedence, transition.in_main_token);
 
-      auto current_action = lex_table.states[state_id].accept_action;
-      if (current_action.is_present()) {
-        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action);
-        bool matches_accepted_token = false;
+      AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
+      if (accept_action.is_present()) {
+        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action);
+        bool can_advance_for_accepted_token = false;
         for (const LexItem &item : transition.destination.entries) {
-          if (item.lhs == current_action.symbol) {
-            matches_accepted_token = true;
-          } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) {
+          if (item.lhs == accept_action.symbol) {
+            can_advance_for_accepted_token = true;
+          } else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) {
             shadowed_token_indices[item.lhs.index] = true;
           }
         }
 
-        if (!matches_accepted_token && characters.intersects(first_separator_characters)) {
-          shadowed_token_indices[current_action.symbol.index] = true;
+        if (!can_advance_for_accepted_token) {
+          if (characters.intersects(separator_start_characters) ||
+              (grammar.variables[accept_action.symbol.index].is_string &&
+               characters.intersects(token_start_characters))) {
+            shadowed_token_indices[accept_action.symbol.index] = true;
+          }
         }
 
-        if (!prefer_advancing) {
-          continue;
-        }
+        if (!prefer_advancing) continue;
       }
 
       action.state_index = add_lex_state(transition.destination);
@@ -181,15 +194,11 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
         AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                  item.lhs.is_built_in() || grammar.variables[item.lhs.index].is_string);
-
-        auto current_action = lex_table.states[state_id].accept_action;
-        if (current_action.is_present()) {
-          if (!conflict_manager.resolve(action, current_action)) {
-            continue;
-          }
+        AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
+        if (!existing_action.is_present() ||
+            conflict_manager.resolve(action, existing_action)) {
+          lex_table.states[state_id].accept_action = action;
         }
-
-        lex_table.states[state_id].accept_action = action;
       }
     }
   }
diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc
new file mode 100644
index 00000000..7376bd2c
--- /dev/null
+++ b/test/compiler/build_tables/lex_table_builder_test.cc
@@ -0,0 +1,106 @@
+#include "test_helper.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/build_tables/lex_table_builder.h"
+
+using namespace build_tables;
+using namespace rules;
+
+START_TEST
+
+describe("LexTableBuilder::detect_conflict", []() {
+  vector<rules::Rule> separators({
+    CharacterSet({ ' ', '\t' }),
+  });
+
+  it("returns false for tokens that don't match the same string", [&]() {
+    auto builder = LexTableBuilder::create(LexicalGrammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ 'a' }),
+            CharacterSet({ 'b' }),
+            CharacterSet({ 'c' }),
+          }),
+          false
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ 'b' }),
+            CharacterSet({ 'c' }),
+            CharacterSet({ 'd' }),
+          }),
+          false
+        },
+      },
+      separators
+    });
+
+    AssertThat(builder->detect_conflict(0, 1), IsFalse());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  it("returns true when one token matches a string that the other matches, "
+     "plus some additional content that begins with a separator character", [&]() {
+    LexicalGrammar grammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::repeat(CharacterSet().include_all().exclude('\n')),  // regex: /.+/
+          false
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }),  // string: 'abc'
+          true
+        },
+      },
+      separators
+    };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+
+    grammar.variables[1].is_string = false;
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  it("returns true when one token matches a string that the other matches, "
+     "plus some additional content that matches another one-character token", [&]() {
+    LexicalGrammar grammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ '>' }),
+            CharacterSet({ '>' }),
+          }),
+          true
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ '>' }),
+          }),
+          true
+        },
+      },
+      separators
+    };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+});
+
+END_TEST
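
Note (not part of the patch): the early exit added to detect_conflict is a start-set intersection test. Two tokens only need the expensive lex-state simulation when the characters that can start one of them overlap the characters that can start the other, or overlap the characters that can start a separator. The following is a minimal standalone sketch of that heuristic, using std::set<char> as a stand-in for the compiler's CharacterSet and StartingCharacterAggregator; the names here are illustrative, not the real API.

// Illustrative sketch only; std::set<char> stands in for CharacterSet.
#include <set>

using CharSet = std::set<char>;

static bool intersects(const CharSet &a, const CharSet &b) {
  for (char c : a)
    if (b.count(c)) return true;
  return false;
}

// Mirrors the early exit in detect_conflict: if neither token's possible
// first characters overlap each other or the separators' first characters,
// no shadowing is possible and the lex-state simulation can be skipped.
static bool might_conflict(const CharSet &left_starts,
                           const CharSet &right_starts,
                           const CharSet &separator_starts) {
  return intersects(left_starts, right_starts) ||
         intersects(left_starts, separator_starts) ||
         intersects(right_starts, separator_starts);
}

int main() {
  CharSet abc_starts = {'a'};              // e.g. the string token 'abc'
  CharSet arrow_starts = {'>'};            // e.g. the token '>>'
  CharSet separator_starts = {' ', '\t'};  // whitespace separators

  // 'abc' and '>>' begin with disjoint characters that are not separator
  // characters either, so a builder could return false without lexing.
  return might_conflict(abc_starts, arrow_starts, separator_starts) ? 1 : 0;
}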