From 2c043803f1968d6e05e61a84c93fbc0478e6aa06 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Thu, 22 Jun 2017 15:32:13 -0700
Subject: [PATCH] Be more conservative about avoiding lexing conflicts when
 merging states

This fixes a bug in the C++ grammar where the `>>` token was merged into
a state where it was previously not valid, but the `>` token *was*
valid. This caused nested templates like -

std::vector<std::pair<int, int>>

to not parse correctly.
---
 .../build_tables/lex_table_builder.cc         |  48 +++++---
 .../build_tables/lex_table_builder_test.cc    | 106 ++++++++++++++++++
 2 files changed, 138 insertions(+), 16 deletions(-)
 create mode 100644 test/compiler/build_tables/lex_table_builder_test.cc
diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index f9068d42..6eb4f3cc 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -70,7 +70,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
   LexTable lex_table;
   const LexicalGrammar grammar;
   vector<Rule> separator_rules;
-  CharacterSet first_separator_characters;
+  CharacterSet separator_start_characters;
+  CharacterSet token_start_characters;
   LexConflictManager conflict_manager;
   unordered_map<LexItemSet, LexStateId> lex_state_ids;
 
@@ -78,13 +79,26 @@ class LexTableBuilderImpl : public LexTableBuilder {
   vector<bool> shadowed_token_indices;
 
   LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) {
-    StartingCharacterAggregator starting_character_aggregator;
+    StartingCharacterAggregator separator_character_aggregator;
     for (const auto &rule : grammar.separators) {
       separator_rules.push_back(Repeat{rule});
-      starting_character_aggregator.apply(rule);
+      separator_character_aggregator.apply(rule);
     }
     separator_rules.push_back(Blank{});
-    first_separator_characters = starting_character_aggregator.result;
+    separator_start_characters = separator_character_aggregator.result;
+
+    StartingCharacterAggregator token_start_character_aggregator;
+    for (const auto &variable : grammar.variables) {
+      token_start_character_aggregator.apply(variable.rule);
+    }
+    token_start_characters = token_start_character_aggregator.result;
+    token_start_characters
+      .exclude('a', 'z')
+      .exclude('A', 'Z')
+      .exclude('0', '9')
+      .exclude('_')
+      .exclude('$');
+
     shadowed_token_indices.resize(grammar.variables.size());
   }
 
@@ -148,25 +162,27 @@ class LexTableBuilderImpl : public LexTableBuilder {
       const LexItemSet::Transition &transition = pair.second;
 
       AdvanceAction action(-1, transition.precedence, transition.in_main_token);
-      auto current_action = lex_table.states[state_id].accept_action;
-      if (current_action.is_present()) {
-        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action);
-        bool matches_accepted_token = false;
+      AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
+      if (accept_action.is_present()) {
+        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action);
+        bool can_advance_for_accepted_token = false;
         for (const LexItem &item : transition.destination.entries) {
-          if (item.lhs == current_action.symbol) {
-            matches_accepted_token = true;
-          } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) {
+          if (item.lhs == accept_action.symbol) {
+            can_advance_for_accepted_token = true;
+          } else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) {
             shadowed_token_indices[item.lhs.index] = true;
           }
         }
 
-        if (!matches_accepted_token && characters.intersects(first_separator_characters)) {
-          shadowed_token_indices[current_action.symbol.index] = true;
+        if (!can_advance_for_accepted_token) {
+          if (characters.intersects(separator_start_characters) ||
+              (grammar.variables[accept_action.symbol.index].is_string &&
+               characters.intersects(token_start_characters))) {
+            shadowed_token_indices[accept_action.symbol.index] = true;
+          }
         }
 
-        if (!prefer_advancing) {
-          continue;
-        }
+        if (!prefer_advancing) continue;
       }
 
       action.state_index = add_lex_state(transition.destination);
diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc
new file mode 100644
index 00000000..7376bd2c
--- /dev/null
+++ b/test/compiler/build_tables/lex_table_builder_test.cc
@@ -0,0 +1,106 @@
+#include "test_helper.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/build_tables/lex_table_builder.h"
+
+using namespace build_tables;
+using namespace rules;
+
+START_TEST
+
+describe("LexTableBuilder::detect_conflict", []() {
+  vector<Rule> separators({
+    CharacterSet({ ' ', '\t' }),
+  });
+
+  it("returns false for tokens that don't match the same string", [&]() {
+    auto builder = LexTableBuilder::create(LexicalGrammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ 'a' }),
+            CharacterSet({ 'b' }),
+            CharacterSet({ 'c' }),
+          }),
+          false
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ 'b' }),
+            CharacterSet({ 'c' }),
+            CharacterSet({ 'd' }),
+          }),
+          false
+        },
+      },
+      separators
+    });
+
+    AssertThat(builder->detect_conflict(0, 1), IsFalse());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  it("returns true when one token matches a string that the other matches, "
+     "plus some addition content that begins with a separator character", [&]() {
+    LexicalGrammar grammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/
+          false
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc'
+          true
+        },
+      },
+      separators
+    };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+
+    grammar.variables[1].is_string = false;
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+
+  it("returns true when one token matches a string that the other matches, "
+     "plus some addition content that matches another one-character token", [&]() {
+    LexicalGrammar grammar{
+      {
+        LexicalVariable{
+          "token_1",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ '>' }),
+            CharacterSet({ '>' }),
+          }),
+          true
+        },
+        LexicalVariable{
+          "token_2",
+          VariableTypeNamed,
+          Rule::seq({
+            CharacterSet({ '>' }),
+          }),
+          true
+        },
+      },
+      separators
+    };
+
+    auto builder = LexTableBuilder::create(grammar);
+    AssertThat(builder->detect_conflict(0, 1), IsTrue());
+    AssertThat(builder->detect_conflict(1, 0), IsFalse());
+  });
+});
+
+END_TEST