Add regression test for avoiding lexical conflicts due to state merging

2017-03-09 12:13:38 -08:00 · 2017-03-09 12:13:38 -08:00 · 56ec45729c
commit 56ec45729c
parent ac4167fdc9
3 changed files with 118 additions and 0 deletions
--- a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt
+++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt
@@ -0,0 +1,33 @@
+========================
+regexes
+========================
+
+/a+/
+
+---
+
+(expression (regex))
+
+========================
+conditionals
+========================
+
+(if (1) /a+/)
+
+---
+
+(expression (parenthesized (expression (conditional
+  (parenthesized (expression (number)))
+  (expression (regex))))))
+
+========================
+quotients
+========================
+
+((1) / 2)
+
+---
+
+(expression (parenthesized (expression (quotient
+  (expression (parenthesized (expression (number))))
+  (expression (number))))))
--- a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json
+++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json
@@ -0,0 +1,65 @@
+{
+  "name": "lexical_conflicts_due_to_state_merging",
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"}
+  ],
+
+  "rules": {
+    "expression": {
+      "type": "CHOICE",
+      "members": [
+        {"type": "SYMBOL", "name": "conditional"},
+        {"type": "SYMBOL", "name": "regex"},
+        {"type": "SYMBOL", "name": "quotient"},
+        {"type": "SYMBOL", "name": "number"},
+        {"type": "SYMBOL", "name": "parenthesized"}
+      ]
+    },
+
+    "conditional": {
+      "type": "PREC_LEFT",
+      "value": 1,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "STRING", "value": "if"},
+          {"type": "SYMBOL", "name": "parenthesized"},
+          {"type": "SYMBOL", "name": "expression"}
+        ]
+      }
+    },
+
+    "quotient": {
+      "type": "PREC_LEFT",
+      "value": 0,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "expression"},
+          {"type": "STRING", "value": "/"},
+          {"type": "SYMBOL", "name": "expression"}
+        ]
+      }
+    },
+
+    "regex": {
+      "type": "PATTERN",
+      "value": "/[^/\n]+/"
+    },
+
+    "number": {
+      "type": "PATTERN",
+      "value": "\\d+"
+    },
+
+    "parenthesized": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "("},
+        {"type": "SYMBOL", "name": "expression"},
+        {"type": "STRING", "value": ")"}
+      ]
+    }
+  }
+}
--- a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md
+++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md
@@ -0,0 +1,20 @@
+This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell whether it is a `/` token or the start of a `regex` by looking ahead only one character. But because these two tokens are never valid in the same position, this conflict doesn't cause any problems.
+
+When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state.
+
+If we weren't careful, this grammar would cause that to happen, because a `regex` is valid in this state:
+
+```
+(if (1) /\w+/)
+       ^
+```
+
+and a `/` is valid in this state:
+
+
+```
+((1) / 2)
+    ^
+```
+
+These two states would otherwise be candidates for merging, because each contains only the action `reduce(parenthesized, 3)`.