From 56ec45729c243ab0efb4b422ab8b28a2c9ceb403 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Mar 2017 12:13:38 -0800 Subject: [PATCH] Add regression test for avoiding lexical conflicts due to state merging --- .../corpus.txt | 33 ++++++++++ .../grammar.json | 65 +++++++++++++++++++ .../readme.md | 20 ++++++ 3 files changed, 118 insertions(+) create mode 100644 spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt create mode 100644 spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json create mode 100644 spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt new file mode 100644 index 00000000..d8b75557 --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt @@ -0,0 +1,33 @@ +======================== +regexes +======================== + +/a+/ + +--- + +(expression (regex)) + +======================== +conditionals +======================== + +(if (1) /a+/) + +--- + +(expression (parenthesized (expression (conditional + (parenthesized (expression (number))) + (expression (regex)))))) + +======================== +quotients +======================== + +((1) / 2) + +--- + +(expression (parenthesized (expression (quotient + (expression (parenthesized (expression (number)))) + (expression (number)))))) diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json new file mode 100644 index 00000000..143d6f2d --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json @@ -0,0 +1,65 @@ +{ + "name": "lexical_conflicts_due_to_state_merging", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + 
"type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "conditional"}, + {"type": "SYMBOL", "name": "regex"}, + {"type": "SYMBOL", "name": "quotient"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "parenthesized"} + ] + }, + + "conditional": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "if"}, + {"type": "SYMBOL", "name": "parenthesized"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "quotient": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "/"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "regex": { + "type": "PATTERN", + "value": "/[^/\n]+/" + }, + + "number": { + "type": "PATTERN", + "value": "\\d+" + }, + + "parenthesized": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ")"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md new file mode 100644 index 00000000..9fc5fd7f --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md @@ -0,0 +1,20 @@ +This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell if it is part of a `/` token or a `regex` by looking ahead only one character. But because these tokens are never valid in the same position, this doesn't cause any problem. + +When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state. 
+ +If we weren't careful, this grammar would cause both tokens to become valid lookahead symbols in a single merged state, because a `regex` is valid in this state: + +``` +(if (1) /a+/) + ^ +``` + +and a `/` is valid in this state: + + +``` +((1) / 2) + ^ +``` + +And these two states would otherwise be candidates for merging, because they both contain only the action `reduce(parenthesized, 3)`. \ No newline at end of file