Be more conservative about avoiding lexing conflicts when merging states

This fixes a bug in the C++ grammar where the `>>` token was merged into
a state in which it was previously not valid, although the `>` token *was*
valid there. As a result, nested templates such as

std::vector<std::pair<int, int>>

were not parsed correctly.
This commit is contained in:
Max Brunsfeld 2017-06-22 15:32:13 -07:00
parent 6db12ab44e
commit 2c043803f1
2 changed files with 138 additions and 16 deletions

View file

@ -70,7 +70,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
LexTable lex_table;
const LexicalGrammar grammar;
vector<Rule> separator_rules;
CharacterSet first_separator_characters;
CharacterSet separator_start_characters;
CharacterSet token_start_characters;
LexConflictManager conflict_manager;
unordered_map<LexItemSet, LexStateId> lex_state_ids;
@ -78,13 +79,26 @@ class LexTableBuilderImpl : public LexTableBuilder {
vector<bool> shadowed_token_indices;
LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) {
StartingCharacterAggregator starting_character_aggregator;
StartingCharacterAggregator separator_character_aggregator;
for (const auto &rule : grammar.separators) {
separator_rules.push_back(Repeat{rule});
starting_character_aggregator.apply(rule);
separator_character_aggregator.apply(rule);
}
separator_rules.push_back(Blank{});
first_separator_characters = starting_character_aggregator.result;
separator_start_characters = separator_character_aggregator.result;
StartingCharacterAggregator token_start_character_aggregator;
for (const auto &variable : grammar.variables) {
token_start_character_aggregator.apply(variable.rule);
}
token_start_characters = token_start_character_aggregator.result;
token_start_characters
.exclude('a', 'z')
.exclude('A', 'Z')
.exclude('0', '9')
.exclude('_')
.exclude('$');
shadowed_token_indices.resize(grammar.variables.size());
}
@ -148,25 +162,27 @@ class LexTableBuilderImpl : public LexTableBuilder {
const LexItemSet::Transition &transition = pair.second;
AdvanceAction action(-1, transition.precedence, transition.in_main_token);
auto current_action = lex_table.states[state_id].accept_action;
if (current_action.is_present()) {
bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action);
bool matches_accepted_token = false;
AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
if (accept_action.is_present()) {
bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action);
bool can_advance_for_accepted_token = false;
for (const LexItem &item : transition.destination.entries) {
if (item.lhs == current_action.symbol) {
matches_accepted_token = true;
} else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) {
if (item.lhs == accept_action.symbol) {
can_advance_for_accepted_token = true;
} else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) {
shadowed_token_indices[item.lhs.index] = true;
}
}
if (!matches_accepted_token && characters.intersects(first_separator_characters)) {
shadowed_token_indices[current_action.symbol.index] = true;
if (!can_advance_for_accepted_token) {
if (characters.intersects(separator_start_characters) ||
(grammar.variables[accept_action.symbol.index].is_string &&
characters.intersects(token_start_characters))) {
shadowed_token_indices[accept_action.symbol.index] = true;
}
}
if (!prefer_advancing) {
continue;
}
if (!prefer_advancing) continue;
}
action.state_index = add_lex_state(transition.destination);

View file

@ -0,0 +1,106 @@
#include "test_helper.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/lex_table_builder.h"
using namespace build_tables;
using namespace rules;
START_TEST

// Specs for LexTableBuilder::detect_conflict(a, b), called here with the
// indices of two variables in the lexical grammar. Every grammar below uses
// the same separators: a space or a tab.
describe("LexTableBuilder::detect_conflict", []() {
  vector<Rule> separators({
    CharacterSet({ ' ', '\t' }),
  });

  // 'abc' and 'bcd' never match the same string, so neither direction
  // reports a conflict.
  it("returns false for tokens that don't match the same string", [&]() {
    auto builder = LexTableBuilder::create(LexicalGrammar{
      {
        LexicalVariable{
          "token_1",
          VariableTypeNamed,
          Rule::seq({
            CharacterSet({ 'a' }),
            CharacterSet({ 'b' }),
            CharacterSet({ 'c' }),
          }),
          false
        },
        LexicalVariable{
          "token_2",
          VariableTypeNamed,
          Rule::seq({
            CharacterSet({ 'b' }),
            CharacterSet({ 'c' }),
            CharacterSet({ 'd' }),
          }),
          false
        },
      },
      separators
    });

    AssertThat(builder->detect_conflict(0, 1), IsFalse());
    AssertThat(builder->detect_conflict(1, 0), IsFalse());
  });

  it("returns true when one token matches a string that the other matches, "
     "plus some addition content that begins with a separator character", [&]() {
    LexicalGrammar grammar{
      {
        LexicalVariable{
          "token_1",
          VariableTypeNamed,
          Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/
          false
        },
        LexicalVariable{
          "token_2",
          VariableTypeNamed,
          Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc'
          true
        },
      },
      separators
    };

    auto builder = LexTableBuilder::create(grammar);
    AssertThat(builder->detect_conflict(0, 1), IsTrue());
    AssertThat(builder->detect_conflict(1, 0), IsFalse());

    // The same expectations must hold when token_2 is not flagged as a
    // string. The builder implementation stores the grammar by value, so a
    // new builder has to be created for the modified grammar to be used;
    // re-using the old one would only repeat the assertions above.
    grammar.variables[1].is_string = false;
    builder = LexTableBuilder::create(grammar);
    AssertThat(builder->detect_conflict(0, 1), IsTrue());
    AssertThat(builder->detect_conflict(1, 0), IsFalse());
  });

  // Mirrors the `>>` vs `>` situation: after '>' has been accepted, the
  // next character could start another one-character token.
  it("returns true when one token matches a string that the other matches, "
     "plus some addition content that matches another one-character token", [&]() {
    LexicalGrammar grammar{
      {
        LexicalVariable{
          "token_1",
          VariableTypeNamed,
          Rule::seq({
            CharacterSet({ '>' }),
            CharacterSet({ '>' }),
          }),
          true
        },
        LexicalVariable{
          "token_2",
          VariableTypeNamed,
          Rule::seq({
            CharacterSet({ '>' }),
          }),
          true
        },
      },
      separators
    };

    auto builder = LexTableBuilder::create(grammar);
    AssertThat(builder->detect_conflict(0, 1), IsTrue());
    AssertThat(builder->detect_conflict(1, 0), IsFalse());
  });
});

END_TEST