Be more conservative about avoiding lexing conflicts when merging states

This fixes a bug in the C++ grammar where the `>>` token was merged into a state where it was previously not valid, but the `>` token *was* valid. This caused nested templates such as `std::vector<std::pair<int, int>>` to not parse correctly.
2017-06-22 15:32:13 -07:00 · 2017-06-22 15:32:13 -07:00 · 2c043803f1
commit 2c043803f1
parent 6db12ab44e
2 changed files with 138 additions and 16 deletions
--- a/test/compiler/build_tables/lex_table_builder_test.cc
+++ b/test/compiler/build_tables/lex_table_builder_test.cc
@@ -0,0 +1,106 @@
+#include "test_helper.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/build_tables/lex_table_builder.h"
+
+using namespace build_tables;
+using namespace rules;
+
+START_TEST
+
+describe("LexTableBuilder::detect_conflict", []() {
+  vector<Rule> separators({
+    CharacterSet({ ' ', '\t' }),
+  });
+
+  // Baseline case: token_1 matches only "abc" and token_2 matches only "bcd",
+  // so no string is matched by both and neither ordering of the pair is
+  // expected to report a conflict.
+  it("returns false for tokens that don't match the same string", [&]() {
+    LexicalVariable abc_token{
+      "token_1",
+      VariableTypeNamed,
+      Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // matches: 'abc'
+      false
+    };
+    LexicalVariable bcd_token{
+      "token_2",
+      VariableTypeNamed,
+      Rule::seq({ CharacterSet({ 'b' }), CharacterSet({ 'c' }), CharacterSet({ 'd' }) }), // matches: 'bcd'
+      false
+    };
+    LexicalGrammar grammar{ { abc_token, bcd_token }, separators };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsFalse());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  // token_1 (/.+/) matches every string token_2 ('abc') matches and can keep
+  // consuming afterwards, including the separator characters ' ' and '\t'.
+  // The test expects token_1 to conflict with token_2 (arguments 0, 1) but not
+  // the reverse (arguments 1, 0), both when token_2 is flagged as a string and
+  // when it is not.
+  it("returns true when one token matches a string that the other matches, "
+     "plus some addition content that begins with a separator character", [&]() {
+    LexicalGrammar grammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/
+          false
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc'
+          true
+        },
+      },
+      separators
+    };
+
+    // Case 1: token_2 is marked as a string.
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+
+    // Case 2: token_2 is no longer marked as a string. The first builder was
+    // created before this mutation and may not observe it (whether `create`
+    // copies the grammar is not visible here), so build a fresh one to make
+    // sure the assertions below exercise the modified grammar.
+    grammar.variables[1].is_string = false;
+    auto non_string_builder = LexTableBuilder::create(grammar);
+    AssertThat(non_string_builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(non_string_builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  // token_1 ('>>') starts with the entire text of token_2 ('>'), and the extra
+  // character it consumes is itself a valid one-character token. The test
+  // expects a conflict for arguments (0, 1) and none for (1, 0).
+  it("returns true when one token matches a string that the other matches, "
+     "plus some addition content that matches another one-character token", [&]() {
+    LexicalVariable double_gt_token{
+      "token_1",
+      VariableTypeNamed,
+      Rule::seq({ CharacterSet({ '>' }), CharacterSet({ '>' }) }), // string: '>>'
+      true
+    };
+    LexicalVariable single_gt_token{
+      "token_2",
+      VariableTypeNamed,
+      Rule::seq({ CharacterSet({ '>' }) }), // string: '>'
+      true
+    };
+    LexicalGrammar grammar{ { double_gt_token, single_gt_token }, separators };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+});
+
+END_TEST