From 2c043803f1968d6e05e61a84c93fbc0478e6aa06 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 22 Jun 2017 15:32:13 -0700 Subject: [PATCH] Be more conservative about avoiding lexing conflicts when merging states This fixes a bug in the C++ grammar where the `>>` token was merged into a state where it was previously not valid, but the `>` token *was* valid. This caused nested templates like - std::vector> to not parse correctly. --- .../build_tables/lex_table_builder.cc | 48 +++++--- .../build_tables/lex_table_builder_test.cc | 106 ++++++++++++++++++ 2 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 test/compiler/build_tables/lex_table_builder_test.cc diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index f9068d42..6eb4f3cc 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -70,7 +70,8 @@ class LexTableBuilderImpl : public LexTableBuilder { LexTable lex_table; const LexicalGrammar grammar; vector separator_rules; - CharacterSet first_separator_characters; + CharacterSet separator_start_characters; + CharacterSet token_start_characters; LexConflictManager conflict_manager; unordered_map lex_state_ids; @@ -78,13 +79,26 @@ class LexTableBuilderImpl : public LexTableBuilder { vector shadowed_token_indices; LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) { - StartingCharacterAggregator starting_character_aggregator; + StartingCharacterAggregator separator_character_aggregator; for (const auto &rule : grammar.separators) { separator_rules.push_back(Repeat{rule}); - starting_character_aggregator.apply(rule); + separator_character_aggregator.apply(rule); } separator_rules.push_back(Blank{}); - first_separator_characters = starting_character_aggregator.result; + separator_start_characters = separator_character_aggregator.result; + + StartingCharacterAggregator token_start_character_aggregator; + for (const auto &variable : grammar.variables) { + token_start_character_aggregator.apply(variable.rule); + } + token_start_characters = token_start_character_aggregator.result; + token_start_characters + .exclude('a', 'z') + .exclude('A', 'Z') + .exclude('0', '9') + .exclude('_') + .exclude('$'); + shadowed_token_indices.resize(grammar.variables.size()); } @@ -148,25 +162,27 @@ class LexTableBuilderImpl : public LexTableBuilder { const LexItemSet::Transition &transition = pair.second; AdvanceAction action(-1, transition.precedence, transition.in_main_token); - auto current_action = lex_table.states[state_id].accept_action; - if (current_action.is_present()) { - bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action); - bool matches_accepted_token = false; + AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action; + if (accept_action.is_present()) { + bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action); + bool can_advance_for_accepted_token = false; for (const LexItem &item : transition.destination.entries) { - if (item.lhs == current_action.symbol) { - matches_accepted_token = true; - } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) { + if (item.lhs == accept_action.symbol) { + can_advance_for_accepted_token = true; + } else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) { shadowed_token_indices[item.lhs.index] = true; } } - if (!matches_accepted_token && characters.intersects(first_separator_characters)) { - shadowed_token_indices[current_action.symbol.index] = true; + if (!can_advance_for_accepted_token) { + if (characters.intersects(separator_start_characters) || + (grammar.variables[accept_action.symbol.index].is_string && + characters.intersects(token_start_characters))) { + shadowed_token_indices[accept_action.symbol.index] = true; + } } - if (!prefer_advancing) { - continue; - } + if (!prefer_advancing) continue; } action.state_index = add_lex_state(transition.destination); diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc new file mode 100644 index 00000000..7376bd2c --- /dev/null +++ b/test/compiler/build_tables/lex_table_builder_test.cc @@ -0,0 +1,106 @@ +#include "test_helper.h" +#include "compiler/lexical_grammar.h" +#include "compiler/build_tables/lex_table_builder.h" + +using namespace build_tables; +using namespace rules; + +START_TEST + +describe("LexTableBuilder::detect_conflict", []() { + vector separators({ + CharacterSet({ ' ', '\t' }), + }); + + it("returns false for tokens that don't match the same string", [&]() { + auto builder = LexTableBuilder::create(LexicalGrammar{ + { + LexicalVariable{ + "token_1", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ 'a' }), + CharacterSet({ 'b' }), + CharacterSet({ 'c' }), + }), + false + }, + LexicalVariable{ + "token_2", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ 'b' }), + CharacterSet({ 'c' }), + CharacterSet({ 'd' }), + }), + false + }, + }, + separators + }); + + AssertThat(builder->detect_conflict(0, 1), IsFalse()); + AssertThat(builder->detect_conflict(1, 0), IsFalse()); + }); + + it("returns true when one token matches a string that the other matches, " + "plus some addition content that begins with a separator character", [&]() { + LexicalGrammar grammar{ + { + LexicalVariable{ + "token_1", + VariableTypeNamed, + Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/ + false + }, + LexicalVariable{ + "token_2", + VariableTypeNamed, + Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc' + true + }, + }, + separators + }; + + auto builder = LexTableBuilder::create(grammar); + AssertThat(builder->detect_conflict(0, 1), IsTrue()); + AssertThat(builder->detect_conflict(1, 0), IsFalse()); + + grammar.variables[1].is_string = false; + AssertThat(builder->detect_conflict(0, 1), IsTrue()); + AssertThat(builder->detect_conflict(1, 0), IsFalse()); + }); + + it("returns true when one token matches a string that the other matches, " + "plus some addition content that matches another one-character token", [&]() { + LexicalGrammar grammar{ + { + LexicalVariable{ + "token_1", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ '>' }), + CharacterSet({ '>' }), + }), + true + }, + LexicalVariable{ + "token_2", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ '>' }), + }), + true + }, + }, + separators + }; + + auto builder = LexTableBuilder::create(grammar); + AssertThat(builder->detect_conflict(0, 1), IsTrue()); + AssertThat(builder->detect_conflict(1, 0), IsFalse()); + }); +}); + +END_TEST