In lexer, prefer tokens to skipped separator characters

This was causing newlines in Go and JavaScript to be parsed as
meaningless separator characters instead of statement terminators.
This commit is contained in:
Max Brunsfeld 2014-05-30 13:29:54 -07:00
parent 220e081c49
commit e93e254518
26 changed files with 5559 additions and 6650 deletions

View file

@ -8,7 +8,7 @@ using namespace build_tables;
START_TEST
describe("resolving parse conflicts", []() {
bool should_update;
bool update;
PreparedGrammar parse_grammar({
{ "rule1", seq({ sym("rule2"), sym("token2") }) },
@ -37,31 +37,67 @@ describe("resolving parse conflicts", []() {
});
it("favors non-errors over lexical errors", [&]() {
should_update = manager->resolve_lex_action(LexAction::Error(), LexAction::Advance(2));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Error(), LexAction::Advance(2, {0}));
AssertThat(update, IsTrue());
should_update = manager->resolve_lex_action(LexAction::Advance(2), LexAction::Error());
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Advance(2, {0}), LexAction::Error());
AssertThat(update, IsFalse());
});
describe("accept-token/advance conflicts", [&]() {
describe("when the the accept-token has higher precedence", [&]() {
it("prefers the accept", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -1 }));
AssertThat(update, IsFalse());
update = manager->resolve_lex_action(LexAction::Advance(1, { -1 }), LexAction::Accept(sym3, 2));
AssertThat(update, IsTrue());
});
});
describe("when the the actions have the same precedence", [&]() {
it("prefers the advance", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { 0 }));
AssertThat(update, IsTrue());
update = manager->resolve_lex_action(LexAction::Advance(1, { 0 }), LexAction::Accept(sym3, 0));
AssertThat(update, IsFalse());
});
});
describe("when the advance has conflicting precedences compared to the accept", [&]() {
it("prefers the advance", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -2, 2 }));
AssertThat(update, IsTrue());
update = manager->resolve_lex_action(LexAction::Advance(1, { -2, 2 }), LexAction::Accept(sym3, 0));
AssertThat(update, IsFalse());
});
it_skip("records a conflict", [&]() {
manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -2, 2 }));
});
});
});
describe("accept-token/accept-token conflicts", [&]() {
describe("when one token has a higher precedence than the other", [&]() {
it("prefers the token with the higher precedence", [&]() {
should_update = manager->resolve_lex_action(LexAction::Accept(sym3, 2), LexAction::Accept(sym2, 0));
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Accept(sym3, 2), LexAction::Accept(sym2, 0));
AssertThat(update, IsFalse());
should_update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym3, 2));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym3, 2));
AssertThat(update, IsTrue());
});
});
describe("when both tokens have the same precedence", [&]() {
it("prefers the token listed earlier in the grammar", [&]() {
should_update = manager->resolve_lex_action(LexAction::Accept(sym1, 0), LexAction::Accept(sym2, 0));
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Accept(sym1, 0), LexAction::Accept(sym2, 0));
AssertThat(update, IsFalse());
should_update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym1, 0));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym1, 0));
AssertThat(update, IsTrue());
});
});
});
@ -81,11 +117,11 @@ describe("resolving parse conflicts", []() {
});
it("favors non-errors over parse errors", [&]() {
should_update = manager->resolve_parse_action(sym1, ParseAction::Error(), ParseAction::Shift(2, { 0 }));
AssertThat(should_update, IsTrue());
update = manager->resolve_parse_action(sym1, ParseAction::Error(), ParseAction::Shift(2, { 0 }));
AssertThat(update, IsTrue());
should_update = manager->resolve_parse_action(sym1, ParseAction::Shift(2, { 0 }), ParseAction::Error());
AssertThat(should_update, IsFalse());
update = manager->resolve_parse_action(sym1, ParseAction::Shift(2, { 0 }), ParseAction::Error());
AssertThat(update, IsFalse());
});
describe("shift/reduce conflicts", [&]() {

View file

@ -34,7 +34,7 @@ namespace tree_sitter {
rule_ptr i_aux_token(size_t index) {
return make_shared<rules::Symbol>(index, SymbolOption(SymbolOptionAuxiliary|SymbolOptionToken));
}
rule_ptr metadata(rule_ptr rule, map<MetadataKey, int> values) {
return make_shared<Metadata>(rule, values);
}

View file

@ -15,9 +15,9 @@ describe("expanding token rules", []() {
pattern("x*"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", seq({
@ -26,7 +26,7 @@ describe("expanding token rules", []() {
i_sym(11) }) },
}, {})));
});
it("replaces string rules with a sequence of characters", [&]() {
PreparedGrammar grammar({
{ "rule_A", seq({
@ -34,9 +34,9 @@ describe("expanding token rules", []() {
str("xyz"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", seq({
@ -45,7 +45,7 @@ describe("expanding token rules", []() {
i_sym(11) }) },
}, {})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
PreparedGrammar grammar({
{ "rule_A", seq({
@ -53,7 +53,7 @@ describe("expanding token rules", []() {
str("xyz"),
pattern("[") }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));

View file

@ -82,11 +82,11 @@ describe("extracting tokens from a grammar", []() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "'ab'", str("ab") },
})));
@ -99,34 +99,34 @@ describe("extracting tokens from a grammar", []() {
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_A", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
@ -135,24 +135,24 @@ describe("extracting tokens from a grammar", []() {
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
});
it("extracts entire auxiliary rules", [&]() {
auto result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", str("ab") },
{ "rule_B", i_aux_sym(0) },
{ "rule_C", i_aux_sym(1) },
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_B", i_aux_token(0) },
{ "rule_C", i_aux_sym(0) },
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "rule_A", str("ab") },
})));

View file

@ -13,13 +13,13 @@ describe("parsing regex patterns", []() {
"[aAeE]",
character({ 'a', 'A', 'e', 'E' })
},
{
"'.' characters as wildcards",
".",
CharacterSet({'\n'}).complement().copy()
},
{
"character classes",
"\\w-\\d",
@ -28,7 +28,7 @@ describe("parsing regex patterns", []() {
character({ '-' }),
character({ {'0', '9'} }) })
},
{
"choices",
"ab|cd|ef",
@ -47,7 +47,7 @@ describe("parsing regex patterns", []() {
})
})
},
{
"simple sequences",
"abc",
@ -56,25 +56,25 @@ describe("parsing regex patterns", []() {
character({ 'b' }),
character({ 'c' }) })
},
{
"character ranges",
"[12a-dA-D3]",
character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, })
},
{
"negated characters",
"[^a\\d]",
character({ {'a'}, {'0', '9'} }, false)
},
{
"backslashes",
"\\\\",
character({ '\\' })
},
{
"character groups in sequences",
"x([^x]|\\\\x)*x",
@ -87,7 +87,7 @@ describe("parsing regex patterns", []() {
character({ 'x' })
})
},
{
"choices in sequences",
"(a|b)cd",
@ -100,7 +100,7 @@ describe("parsing regex patterns", []() {
character({ 'd' })
})
},
{
"escaped parentheses",
"a\\(b",
@ -110,7 +110,7 @@ describe("parsing regex patterns", []() {
character({ 'b' })
})
},
{
"escaped periods",
"a\\.",
@ -119,7 +119,7 @@ describe("parsing regex patterns", []() {
character({ '.' })
})
},
{
"plus repeats",
"(ab)+(cd)+",
@ -134,7 +134,7 @@ describe("parsing regex patterns", []() {
}),
})
},
{
"asterix repeats",
"(ab)*(cd)*",
@ -143,7 +143,7 @@ describe("parsing regex patterns", []() {
repeat(seq({ character({ 'c' }), character({ 'd' }) })),
})
},
{
"optional rules",
"a(bc)?",
@ -156,7 +156,7 @@ describe("parsing regex patterns", []() {
})
}
};
vector<tuple<string, string, const char *>> invalid_inputs = {
{
"mismatched open parens",
@ -189,23 +189,23 @@ describe("parsing regex patterns", []() {
"unmatched close square bracket",
},
};
for (auto &triple : valid_inputs) {
string description = get<0>(triple);
string regex = get<1>(triple);
rule_ptr rule = get<2>(triple);
it(("parses " + description).c_str(), [&]() {
auto result = parse_regex(regex);
AssertThat(result.first, EqualsPointer(rule));
});
}
for (auto &triple : invalid_inputs) {
string description = get<0>(triple);
string regex = get<1>(triple);
const char *expected_message = get<2>(triple);
it(("handles invalid regexes with " + description).c_str(), [&]() {
auto result = parse_regex(regex);
AssertThat(result.second, !Equals((const GrammarError *)nullptr));