In lexer, prefer tokens to skipped separator characters

This was causing newlines in Go and JavaScript to be parsed as
meaningless separator characters instead of statement terminators.
This commit is contained in:
Max Brunsfeld 2014-05-30 13:29:54 -07:00
parent 220e081c49
commit e93e254518
26 changed files with 5559 additions and 6650 deletions

View file

@ -8,7 +8,7 @@ using namespace build_tables;
START_TEST
describe("resolving parse conflicts", []() {
bool should_update;
bool update;
PreparedGrammar parse_grammar({
{ "rule1", seq({ sym("rule2"), sym("token2") }) },
@ -37,31 +37,67 @@ describe("resolving parse conflicts", []() {
});
it("favors non-errors over lexical errors", [&]() {
should_update = manager->resolve_lex_action(LexAction::Error(), LexAction::Advance(2));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Error(), LexAction::Advance(2, {0}));
AssertThat(update, IsTrue());
should_update = manager->resolve_lex_action(LexAction::Advance(2), LexAction::Error());
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Advance(2, {0}), LexAction::Error());
AssertThat(update, IsFalse());
});
describe("accept-token/advance conflicts", [&]() {
describe("when the the accept-token has higher precedence", [&]() {
it("prefers the accept", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -1 }));
AssertThat(update, IsFalse());
update = manager->resolve_lex_action(LexAction::Advance(1, { -1 }), LexAction::Accept(sym3, 2));
AssertThat(update, IsTrue());
});
});
describe("when the the actions have the same precedence", [&]() {
it("prefers the advance", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { 0 }));
AssertThat(update, IsTrue());
update = manager->resolve_lex_action(LexAction::Advance(1, { 0 }), LexAction::Accept(sym3, 0));
AssertThat(update, IsFalse());
});
});
describe("when the advance has conflicting precedences compared to the accept", [&]() {
it("prefers the advance", [&]() {
update = manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -2, 2 }));
AssertThat(update, IsTrue());
update = manager->resolve_lex_action(LexAction::Advance(1, { -2, 2 }), LexAction::Accept(sym3, 0));
AssertThat(update, IsFalse());
});
it_skip("records a conflict", [&]() {
manager->resolve_lex_action(LexAction::Accept(sym3, 0), LexAction::Advance(1, { -2, 2 }));
});
});
});
describe("accept-token/accept-token conflicts", [&]() {
describe("when one token has a higher precedence than the other", [&]() {
it("prefers the token with the higher precedence", [&]() {
should_update = manager->resolve_lex_action(LexAction::Accept(sym3, 2), LexAction::Accept(sym2, 0));
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Accept(sym3, 2), LexAction::Accept(sym2, 0));
AssertThat(update, IsFalse());
should_update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym3, 2));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym3, 2));
AssertThat(update, IsTrue());
});
});
describe("when both tokens have the same precedence", [&]() {
it("prefers the token listed earlier in the grammar", [&]() {
should_update = manager->resolve_lex_action(LexAction::Accept(sym1, 0), LexAction::Accept(sym2, 0));
AssertThat(should_update, IsFalse());
update = manager->resolve_lex_action(LexAction::Accept(sym1, 0), LexAction::Accept(sym2, 0));
AssertThat(update, IsFalse());
should_update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym1, 0));
AssertThat(should_update, IsTrue());
update = manager->resolve_lex_action(LexAction::Accept(sym2, 0), LexAction::Accept(sym1, 0));
AssertThat(update, IsTrue());
});
});
});
@ -81,11 +117,11 @@ describe("resolving parse conflicts", []() {
});
it("favors non-errors over parse errors", [&]() {
should_update = manager->resolve_parse_action(sym1, ParseAction::Error(), ParseAction::Shift(2, { 0 }));
AssertThat(should_update, IsTrue());
update = manager->resolve_parse_action(sym1, ParseAction::Error(), ParseAction::Shift(2, { 0 }));
AssertThat(update, IsTrue());
should_update = manager->resolve_parse_action(sym1, ParseAction::Shift(2, { 0 }), ParseAction::Error());
AssertThat(should_update, IsFalse());
update = manager->resolve_parse_action(sym1, ParseAction::Shift(2, { 0 }), ParseAction::Error());
AssertThat(update, IsFalse());
});
describe("shift/reduce conflicts", [&]() {

View file

@ -34,7 +34,7 @@ namespace tree_sitter {
rule_ptr i_aux_token(size_t index) {
return make_shared<rules::Symbol>(index, SymbolOption(SymbolOptionAuxiliary|SymbolOptionToken));
}
rule_ptr metadata(rule_ptr rule, map<MetadataKey, int> values) {
return make_shared<Metadata>(rule, values);
}

View file

@ -15,9 +15,9 @@ describe("expanding token rules", []() {
pattern("x*"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", seq({
@ -26,7 +26,7 @@ describe("expanding token rules", []() {
i_sym(11) }) },
}, {})));
});
it("replaces string rules with a sequence of characters", [&]() {
PreparedGrammar grammar({
{ "rule_A", seq({
@ -34,9 +34,9 @@ describe("expanding token rules", []() {
str("xyz"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", seq({
@ -45,7 +45,7 @@ describe("expanding token rules", []() {
i_sym(11) }) },
}, {})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
PreparedGrammar grammar({
{ "rule_A", seq({
@ -53,7 +53,7 @@ describe("expanding token rules", []() {
str("xyz"),
pattern("[") }) },
}, {});
auto result = expand_tokens(grammar);
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));

View file

@ -82,11 +82,11 @@ describe("extracting tokens from a grammar", []() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "'ab'", str("ab") },
})));
@ -99,34 +99,34 @@ describe("extracting tokens from a grammar", []() {
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_A", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
@ -135,24 +135,24 @@ describe("extracting tokens from a grammar", []() {
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
});
it("extracts entire auxiliary rules", [&]() {
auto result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", str("ab") },
{ "rule_B", i_aux_sym(0) },
{ "rule_C", i_aux_sym(1) },
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_B", i_aux_token(0) },
{ "rule_C", i_aux_sym(0) },
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "rule_A", str("ab") },
})));

View file

@ -13,13 +13,13 @@ describe("parsing regex patterns", []() {
"[aAeE]",
character({ 'a', 'A', 'e', 'E' })
},
{
"'.' characters as wildcards",
".",
CharacterSet({'\n'}).complement().copy()
},
{
"character classes",
"\\w-\\d",
@ -28,7 +28,7 @@ describe("parsing regex patterns", []() {
character({ '-' }),
character({ {'0', '9'} }) })
},
{
"choices",
"ab|cd|ef",
@ -47,7 +47,7 @@ describe("parsing regex patterns", []() {
})
})
},
{
"simple sequences",
"abc",
@ -56,25 +56,25 @@ describe("parsing regex patterns", []() {
character({ 'b' }),
character({ 'c' }) })
},
{
"character ranges",
"[12a-dA-D3]",
character({ {'1', '3'}, {'a', 'd'}, { 'A', 'D' }, })
},
{
"negated characters",
"[^a\\d]",
character({ {'a'}, {'0', '9'} }, false)
},
{
"backslashes",
"\\\\",
character({ '\\' })
},
{
"character groups in sequences",
"x([^x]|\\\\x)*x",
@ -87,7 +87,7 @@ describe("parsing regex patterns", []() {
character({ 'x' })
})
},
{
"choices in sequences",
"(a|b)cd",
@ -100,7 +100,7 @@ describe("parsing regex patterns", []() {
character({ 'd' })
})
},
{
"escaped parentheses",
"a\\(b",
@ -110,7 +110,7 @@ describe("parsing regex patterns", []() {
character({ 'b' })
})
},
{
"escaped periods",
"a\\.",
@ -119,7 +119,7 @@ describe("parsing regex patterns", []() {
character({ '.' })
})
},
{
"plus repeats",
"(ab)+(cd)+",
@ -134,7 +134,7 @@ describe("parsing regex patterns", []() {
}),
})
},
{
"asterix repeats",
"(ab)*(cd)*",
@ -143,7 +143,7 @@ describe("parsing regex patterns", []() {
repeat(seq({ character({ 'c' }), character({ 'd' }) })),
})
},
{
"optional rules",
"a(bc)?",
@ -156,7 +156,7 @@ describe("parsing regex patterns", []() {
})
}
};
vector<tuple<string, string, const char *>> invalid_inputs = {
{
"mismatched open parens",
@ -189,23 +189,23 @@ describe("parsing regex patterns", []() {
"unmatched close square bracket",
},
};
for (auto &triple : valid_inputs) {
string description = get<0>(triple);
string regex = get<1>(triple);
rule_ptr rule = get<2>(triple);
it(("parses " + description).c_str(), [&]() {
auto result = parse_regex(regex);
AssertThat(result.first, EqualsPointer(rule));
});
}
for (auto &triple : invalid_inputs) {
string description = get<0>(triple);
string regex = get<1>(triple);
const char *expected_message = get<2>(triple);
it(("handles invalid regexes with " + description).c_str(), [&]() {
auto result = parse_regex(regex);
AssertThat(result.second, !Equals((const GrammarError *)nullptr));