Merge pull request #63 from tree-sitter/avoid-lexing-issues-when-merging-states

Avoid introducing new lexical conflicts when merging parse states
This commit is contained in:
Max Brunsfeld 2017-03-09 12:19:46 -08:00 committed by GitHub
commit 352e678c12
100 changed files with 2517 additions and 2168 deletions

View file

@ -11,13 +11,12 @@
'externals/json-parser',
],
'sources': [
'src/compiler/build_tables/build_lex_table.cc',
'src/compiler/build_tables/build_parse_table.cc',
'src/compiler/build_tables/build_tables.cc',
'src/compiler/build_tables/recovery_tokens.cc',
'src/compiler/build_tables/lex_item.cc',
'src/compiler/build_tables/lex_item_transitions.cc',
'src/compiler/build_tables/lex_conflict_manager.cc',
'src/compiler/build_tables/lex_table_builder.cc',
'src/compiler/build_tables/lookahead_set.cc',
'src/compiler/build_tables/parse_item.cc',
'src/compiler/build_tables/parse_item_set_builder.cc',
@ -41,7 +40,6 @@
'src/compiler/prepare_grammar/token_description.cc',
'src/compiler/rule.cc',
'src/compiler/syntax_grammar.cc',
'src/compiler/variable.cc',
'src/compiler/rules/blank.cc',
'src/compiler/rules/built_in_symbols.cc',
'src/compiler/rules/character_range.cc',

View file

@ -1,34 +0,0 @@
#include "spec_helper.h"
#include "compiler/rules/character_set.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/lexical_grammar.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
#include "compiler/rules.h"
using namespace rules;
using namespace build_tables;
START_TEST
describe("recovery_tokens(rule)", []() {
it("includes rules that can only begin and end with an explicit set of characters", [&]() {
LexicalGrammar grammar;
grammar.separators = {
character({ ' ' }),
};
grammar.variables = {
Variable("var0", VariableTypeNamed, character({}, false)),
Variable("var1", VariableTypeNamed, seq({
character({ 'a', 'b' }),
character({}, false),
character({ 'c', 'd' }),
})),
};
AssertThat(recovery_tokens(grammar), Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
});
});
END_TEST

View file

@ -20,6 +20,10 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
Symbol sym4(3, Symbol::Terminal);
LexItemSet item_set({ LexItem(sym4, blank() )});
before_each([&]() {
conflict_manager = LexConflictManager();
});
it("favors advance actions over empty accept token actions", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction());
AssertThat(update, IsTrue());
@ -65,6 +69,7 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
describe("advance/accept-token conflicts", [&]() {
describe("when the token to accept has higher precedence", [&]() {
it("prefers the accept-token action", [&]() {
AssertThat(conflict_manager.possible_extensions, IsEmpty());
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(update, IsFalse());
AssertThat(conflict_manager.possible_extensions, IsEmpty());
@ -72,13 +77,9 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() {
});
describe("when the token to accept does not have a higher precedence", [&]() {
it("favors the advance action", [&]() {
it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() {
update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true));
AssertThat(update, IsTrue());
});
it("adds the in-progress tokens as possible extensions of the discarded token", [&]() {
conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true));
AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index));
});
});

View file

@ -13,11 +13,10 @@ START_TEST
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
it("indicates whether the item is done and its precedence", [&]() {
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item1.completion_status().is_string, IsFalse());
MetadataParams params;
params.precedence = 3;
@ -30,12 +29,10 @@ describe("LexItem", []() {
AssertThat(item2.completion_status().is_done, IsTrue());
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
AssertThat(item2.completion_status().is_string, IsTrue());
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item3.completion_status().is_string, IsFalse());
});
});
});

View file

@ -12,12 +12,13 @@ using namespace rules;
START_TEST
describe("ParseItemSetBuilder", []() {
vector<Variable> lexical_variables;
vector<LexicalVariable> lexical_variables;
for (size_t i = 0; i < 20; i++) {
lexical_variables.push_back(Variable{
lexical_variables.push_back({
"token_" + to_string(i),
VariableTypeNamed,
blank(),
false
});
}
@ -25,13 +26,13 @@ describe("ParseItemSetBuilder", []() {
it("adds items at the beginnings of referenced rules", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
@ -39,13 +40,13 @@ describe("ParseItemSetBuilder", []() {
Production({
{Symbol(2, Symbol::NonTerminal), 0, AssociativityNone},
})
}),
SyntaxVariable("rule2", VariableTypeNamed, {
}},
SyntaxVariable{"rule2", VariableTypeNamed, {
Production({
{Symbol(14, Symbol::Terminal), 0, AssociativityNone},
{Symbol(15, Symbol::Terminal), 0, AssociativityNone},
})
}),
}},
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {
@ -84,19 +85,19 @@ describe("ParseItemSetBuilder", []() {
it("handles rules with empty productions", [&]() {
SyntaxGrammar grammar{{
SyntaxVariable("rule0", VariableTypeNamed, {
SyntaxVariable{"rule0", VariableTypeNamed, {
Production({
{Symbol(1, Symbol::NonTerminal), 0, AssociativityNone},
{Symbol(11, Symbol::Terminal), 0, AssociativityNone},
}),
}),
SyntaxVariable("rule1", VariableTypeNamed, {
}},
SyntaxVariable{"rule1", VariableTypeNamed, {
Production({
{Symbol(12, Symbol::Terminal), 0, AssociativityNone},
{Symbol(13, Symbol::Terminal), 0, AssociativityNone},
}),
Production({})
}),
}},
}, {}, {}, {}};
auto production = [&](int variable_index, int production_index) -> const Production & {

View file

@ -2,6 +2,7 @@
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "helpers/rule_helpers.h"
#include "helpers/stream_methods.h"
START_TEST
@ -11,141 +12,159 @@ using prepare_grammar::expand_repeats;
describe("expand_repeats", []() {
it("replaces repeat rules with pairs of recursive rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(0))),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(0))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, i_sym(1)),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(1)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(0) }),
i_token(0),
})),
})));
})},
}));
});
it("replaces repeats inside of sequences", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
i_token(10),
repeat1(i_token(11)),
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_token(10),
i_sym(1),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11)
})),
})));
})},
}));
});
it("replaces repeats inside of choices", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
i_token(10),
repeat1(i_token(11))
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
repeat1(i_token(11))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
i_token(10),
i_sym(1),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
it("does not create redundant auxiliary rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat1(i_token(4)) }),
seq({ i_token(2), repeat1(i_token(4)) }),
})),
Variable("rule1", VariableTypeNamed, seq({
i_token(3),
repeat1(i_token(4))
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), repeat1(i_token(4)) }),
seq({ i_token(2), repeat1(i_token(4)) }),
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
repeat1(i_token(4))
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, choice({
seq({ i_token(1), i_sym(2) }),
seq({ i_token(2), i_sym(2) }),
})),
Variable("rule1", VariableTypeNamed, seq({
})},
Variable{"rule1", VariableTypeNamed, seq({
i_token(3),
i_sym(2),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(4) }),
i_token(4),
})),
})));
})},
}));
});
it("can replace multiple repeats in the same rule", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, seq({
repeat1(i_token(10)),
repeat1(i_token(11)),
})),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, seq({
repeat1(i_token(10)),
repeat1(i_token(11)),
})},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, seq({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, seq({
i_sym(1),
i_sym(2),
})),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(1), i_token(10) }),
i_token(10),
})),
Variable("rule0_repeat2", VariableTypeAuxiliary, choice({
})},
Variable{"rule0_repeat2", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
it("can replace repeats in multiple rules", [&]() {
InitialSyntaxGrammar grammar{{
Variable("rule0", VariableTypeNamed, repeat1(i_token(10))),
Variable("rule1", VariableTypeNamed, repeat1(i_token(11))),
}, {}, {}, {}};
InitialSyntaxGrammar grammar{
{
Variable{"rule0", VariableTypeNamed, repeat1(i_token(10))},
Variable{"rule1", VariableTypeNamed, repeat1(i_token(11))},
},
{}, {}, {}
};
auto result = expand_repeats(grammar);
AssertThat(result.variables, Equals(vector<Variable>({
Variable("rule0", VariableTypeNamed, i_sym(2)),
Variable("rule1", VariableTypeNamed, i_sym(3)),
Variable("rule0_repeat1", VariableTypeAuxiliary, choice({
AssertThat(result.variables, Equals(vector<Variable>{
Variable{"rule0", VariableTypeNamed, i_sym(2)},
Variable{"rule1", VariableTypeNamed, i_sym(3)},
Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(2), i_token(10) }),
i_token(10),
})),
Variable("rule1_repeat1", VariableTypeAuxiliary, choice({
})},
Variable{"rule1_repeat1", VariableTypeAuxiliary, choice({
seq({ i_sym(3), i_token(11) }),
i_token(11),
})),
})));
})},
}));
});
});

View file

@ -15,89 +15,149 @@ describe("expand_tokens", []() {
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
str("xyz"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
}),
false
}
}));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
str("\u03B1 \u03B2"),
false
},
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params)),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params),
false
}
}));
});
});
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
pattern("x*"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
false
}
}));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
pattern("[^\u03B1-\u03B4]*"),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable>{
LexicalVariable{
"rule_A",
VariableTypeNamed,
repeat(character({ 945, 946, 947, 948 }, false)),
false
}
}));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
pattern("("),
str("xyz"),
pattern("["),
}))
}, {}};
LexicalGrammar grammar{
{
LexicalVariable{
"rule_A",
VariableTypeNamed,
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
false
},
},
{}
};
auto result = expand_tokens(grammar);

View file

@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar;
describe("extract_tokens", []() {
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))),
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))},
Variable{"rule_B", VariableTypeNamed, pattern("ij+")},
Variable{"rule_C", VariableTypeNamed, choice({ str("kl"), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(3))},
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -37,8 +42,8 @@ describe("extract_tokens", []() {
AssertThat(error, Equals(CompileError::none()));
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat1(seq({
AssertThat(syntax_grammar.variables, Equals(vector<Variable>{
Variable{"rule_A", VariableTypeNamed, repeat1(seq({
// This string is now the first token in the lexical grammar.
i_token(0),
@ -58,83 +63,88 @@ describe("extract_tokens", []() {
// This token rule is now the third rule in the lexical grammar.
i_token(2),
}),
}))),
}))},
Variable("rule_C", VariableTypeNamed, choice({ i_token(4), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))),
})));
Variable{"rule_C", VariableTypeNamed, choice({ i_token(4), blank() })},
Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(2))},
}));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
// Strings become anonymous rules.
Variable("ab", VariableTypeAnonymous, str("ab")),
LexicalVariable{"ab", VariableTypeAnonymous, str("ab"), true},
// Patterns become hidden rules.
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
LexicalVariable{"/cd*/", VariableTypeAuxiliary, pattern("cd*"), false},
// Rules marked as tokens become hidden rules.
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
LexicalVariable{"/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
str("ef"),
str("gh")
}))),
})), false},
// This named rule was moved wholesale to the lexical grammar.
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
LexicalVariable{"rule_B", VariableTypeNamed, pattern("ij+"), false},
// Strings become anonymous rules.
Variable("kl", VariableTypeAnonymous, str("kl")),
LexicalVariable{"kl", VariableTypeAnonymous, str("kl"), true},
})));
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})),
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})},
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
})));
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
}));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
})))
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
}))
});
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })),
Variable("rule_B", VariableTypeNamed, str("cd")),
Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })),
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })},
Variable{"rule_B", VariableTypeNamed, str("cd")},
Variable{"rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })},
}, {}, {}, {}});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })),
Variable("rule_B", VariableTypeNamed, i_token(1)),
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })},
Variable{"rule_B", VariableTypeNamed, i_token(1)},
Variable{"rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })},
})));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
Variable("cd", VariableTypeAnonymous, str("cd")),
Variable("ef", VariableTypeAnonymous, str("ef")),
})));
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
}));
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, str("ok")),
Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))),
Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))),
Variable{"rule_A", VariableTypeNamed, str("ok")},
Variable{"rule_B", VariableTypeNamed, repeat(i_sym(0))},
Variable{"rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))},
},
{
str(" ")
@ -155,12 +165,17 @@ describe("extract_tokens", []() {
describe("handling extra tokens", [&]() {
it("adds inline extra tokens to the lexical grammar's separators", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
}, {
str("y"),
pattern("\\s+"),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, str("x")},
},
{
str("y"),
pattern("\\s+"),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
@ -172,12 +187,17 @@ describe("extract_tokens", []() {
});
it("handles inline extra tokens that match tokens in the grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
}, {
str("y"),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, str("x")},
Variable{"rule_B", VariableTypeNamed, str("y")},
},
{
str("y"),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
@ -185,13 +205,18 @@ describe("extract_tokens", []() {
});
it("updates extra symbols according to the new symbol numbers", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, str("y")),
Variable("rule_C", VariableTypeNamed, str("z")),
}, {
i_sym(2),
}, {}, {}});
auto result = extract_tokens(InternedGrammar{
{
Variable{"rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, str("y")},
Variable{"rule_C", VariableTypeNamed, str("z")},
},
{
i_sym(2),
},
{},
{}
});
AssertThat(get<2>(result), Equals(CompileError::none()));
@ -204,8 +229,8 @@ describe("extract_tokens", []() {
it("returns an error if any extra tokens are non-token symbols", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
}, { i_sym(1) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
@ -216,8 +241,8 @@ describe("extract_tokens", []() {
it("returns an error if any extra tokens are non-token rules", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, str("x")),
Variable("rule_B", VariableTypeNamed, str("y")),
Variable{"rule_A", VariableTypeNamed, str("x")},
Variable{"rule_B", VariableTypeNamed, str("y")},
}, { choice({ i_sym(1), blank() }) }, {}, {}});
AssertThat(get<2>(result), !Equals(CompileError::none()));
@ -231,8 +256,8 @@ describe("extract_tokens", []() {
it("returns an error if an external token has the same name as a non-terminal rule", [&]() {
auto result = extract_tokens(InternedGrammar{
{
Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })),
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })},
Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })},
},
{},
{},

View file

@ -12,7 +12,7 @@ using prepare_grammar::flatten_rule;
describe("flatten_grammar", []() {
it("associates each symbol with the precedence and associativity binding it to its successor", [&]() {
SyntaxVariable result = flatten_rule(Variable(
SyntaxVariable result = flatten_rule(Variable{
"test",
VariableTypeNamed,
seq({
@ -30,7 +30,7 @@ describe("flatten_grammar", []() {
})),
i_sym(7),
})
));
});
AssertThat(result.name, Equals("test"));
AssertThat(result.type, Equals(VariableTypeNamed));
@ -54,14 +54,14 @@ describe("flatten_grammar", []() {
});
it("uses the last assigned precedence", [&]() {
SyntaxVariable result = flatten_rule(Variable(
SyntaxVariable result = flatten_rule(Variable{
"test1",
VariableTypeNamed,
prec_left(101, seq({
i_sym(1),
i_sym(2),
}))
));
});
AssertThat(result.productions, Equals(vector<Production>({
Production({
@ -70,13 +70,13 @@ describe("flatten_grammar", []() {
})
})))
result = flatten_rule(Variable(
result = flatten_rule(Variable{
"test2",
VariableTypeNamed,
prec_left(101, seq({
i_sym(1),
}))
));
});
AssertThat(result.productions, Equals(vector<Production>({
Production({

View file

@ -15,27 +15,32 @@ using prepare_grammar::intern_symbols;
describe("intern_symbols", []() {
it("replaces named symbols with numerically-indexed symbols", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("_z") }) },
{ "y", sym("_z") },
{ "_z", str("stuff") }
}, {}, {}, {}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("_z") })},
{"y", sym("_z")},
{"_z", str("stuff")}
}, {}, {}, {}
};
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })),
Variable("y", VariableTypeNamed, i_sym(2)),
Variable("_z", VariableTypeHidden, str("stuff")),
})));
AssertThat(result.first.variables, Equals(vector<Variable>{
Variable{"x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })},
Variable{"y", VariableTypeNamed, i_sym(2)},
Variable{"_z", VariableTypeHidden, str("stuff")},
}));
});
describe("when there are symbols that reference undefined rules", [&]() {
it("returns an error", []() {
Grammar grammar{{
{ "x", sym("y") },
}, {}, {}, {}};
Grammar grammar{
{
{"x", sym("y")},
},
{}, {}, {}
};
auto result = intern_symbols(grammar);
@ -44,13 +49,17 @@ describe("intern_symbols", []() {
});
it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}, {
sym("z")
}, {}, {}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("z") })},
{"y", sym("z")},
{"z", str("stuff")}
},
{
sym("z")
},
{}, {}
};
auto result = intern_symbols(grammar);
@ -60,29 +69,34 @@ describe("intern_symbols", []() {
});
it("records any rule names that match external token names", [&]() {
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}, {}, {}, {
"w",
"z"
}};
Grammar grammar{
{
{"x", choice({ sym("y"), sym("z") })},
{"y", sym("z")},
{"z", str("stuff")},
},
{},
{},
{
"w",
"z"
}
};
auto result = intern_symbols(grammar);
AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>({
{
AssertThat(result.first.external_tokens, Equals(vector<ExternalToken>{
ExternalToken{
"w",
VariableTypeNamed,
rules::NONE()
},
{
ExternalToken{
"z",
VariableTypeNamed,
Symbol(2, Symbol::NonTerminal)
}
})))
},
}))
});
});

View file

@ -1,42 +0,0 @@
#include <tree_sitter/parser.h>
enum {
COMMENT,
};
void *tree_sitter_extra_external_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) {
}
bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_extra_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
while (lexer->lookahead == ' ') {
lexer->advance(lexer, true);
}
if (lexer->lookahead == '#') {
lexer->advance(lexer, false);
while (lexer->lookahead != '\n') {
lexer->advance(lexer, false);
}
lexer->result_symbol = COMMENT;
return true;
}
return false;
}
void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) {
}

View file

@ -0,0 +1,32 @@
================================================
anonymous tokens defined with character classes
================================================
1234
---
(first_rule)
=================================================
anonymous tokens defined with LF escape sequence
=================================================
---
(first_rule)
=================================================
anonymous tokens defined with CR escape sequence
=================================================
---
(first_rule)
================================================
anonymous tokens with quotes
================================================
'hello'
---
(first_rule)

View file

@ -0,0 +1,14 @@
{
"name": "anonymous_tokens_with_escaped_chars",
"rules": {
"first_rule": {
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "\n"},
{"type": "STRING", "value": "\r"},
{"type": "STRING", "value": "'hello'"},
{"type": "PATTERN", "value": "\\d+"}
]
}
}
}

View file

@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
(expression (math_operation (expression (identifier)) (expression (identifier))))
(expression (identifier))))

View file

@ -0,0 +1,31 @@
{
"name": "associativity_left",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,13 @@
Unresolved conflict for symbol sequence:
expression '+' expression • '+' …
Possible interpretations:
1: (math_operation expression '+' expression) • '+' …
2: expression '+' (math_operation expression • '+' expression)
Possible resolutions:
1: Specify a left or right associativity in `math_operation`
2: Add a conflict for these rules: `math_operation`

View file

@ -0,0 +1,27 @@
{
"name": "associativity_missing",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,8 @@
===================
chained operations
===================
x+y+z
---
(expression (math_operation
(expression (identifier))
(expression (math_operation (expression (identifier)) (expression (identifier))))))

View file

@ -0,0 +1,31 @@
{
"name": "associativity_right",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "PREC_RIGHT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:
expression '+' expression • '*' …
Possible interpretations:
1: (sum expression '+' expression) • '*' …
2: expression '+' (product expression • '*' expression)
3: expression '+' (other_thing expression • '*' '*')
Possible resolutions:
1: Specify a higher precedence in `product` and `other_thing` than in the other rules.
2: Specify a higher precedence in `sum` than in the other rules.
3: Add a conflict for these rules: `sum` `product` `other_thing`

View file

@ -0,0 +1,58 @@
{
"name": "conflicting_precedence",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "other_thing"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"product": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"other_thing": {
"type": "PREC_LEFT",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "STRING", "value": "*"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,2 @@
The rule `rule_2` matches the empty string.
Tree-sitter currently does not support syntactic rules that match the empty string.

View file

@ -0,0 +1,15 @@
{
"name": "epsilon_rules",
"rules": {
"rule_1": {"type": "SYMBOL", "name": "rule_2"},
"rule_2": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "rule_1"},
{"type": "BLANK"}
]
}
}
}

View file

@ -0,0 +1,41 @@
=========================================
single-line statements - internal tokens
=========================================
a b
---
(statement (variable) (variable) (line_break))
=========================================
multi-line statements - internal tokens
=========================================
a
b
---
(statement (variable) (variable) (line_break))
=========================================
single-line statements - external tokens
=========================================
'hello' 'world'
---
(statement (string) (string) (line_break))
=========================================
multi-line statements - external tokens
=========================================
'hello'
'world'
---
(statement (string) (string) (line_break))

View file

@ -0,0 +1,36 @@
{
"name": "external_and_internal_tokens",
"externals": [
"string",
"line_break"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "variable"},
{"type": "SYMBOL", "name": "number"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"},
"number": {"type": "PATTERN", "value": "\\d+"},
"line_break": {"type": "STRING", "value": "\n"}
}
}

View file

@ -0,0 +1 @@
This grammar has an external scanner whose `scan` method needs to be able to check for the validity of an *internal* token. This is done by including the names of that internal token (`_line_break`) in the grammar's `externals` field.

View file

@ -1,4 +1,3 @@
#include <stdbool.h>
#include <tree_sitter/parser.h>
enum {
@ -6,21 +5,17 @@ enum {
LINE_BREAK
};
void *tree_sitter_shared_external_tokens_external_scanner_create() {
return NULL;
}
void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; }
void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) {
}
void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {}
bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {}
void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
bool tree_sitter_shared_external_tokens_external_scanner_scan(
void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
bool tree_sitter_external_and_internal_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
// If a line-break is a valid lookahead token, only skip spaces.
@ -58,6 +53,3 @@ bool tree_sitter_shared_external_tokens_external_scanner_scan(
return false;
}
void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) {
}

View file

@ -0,0 +1,10 @@
========================
extra external tokens
========================
x = # a comment
y
---
(assignment (variable) (comment) (variable))

View file

@ -0,0 +1,25 @@
{
"name": "external_extra_tokens",
"externals": [
"comment"
],
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"assignment": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "variable"},
{"type": "STRING", "value": "="},
{"type": "SYMBOL", "name": "variable"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"}
}
}

View file

@ -0,0 +1,36 @@
#include <tree_sitter/parser.h>
enum {
COMMENT,
};
// Stateless scanner: `create` allocates nothing, so destroy/reset are no-ops.
void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; }
void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {}
void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {}
// There is no state to persist; returning true reports a successful (empty) serialization.
bool tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
// Scans a `#`-to-end-of-line comment, skipping any leading spaces first.
// Returns true (with result_symbol set to COMMENT) when a comment was
// consumed, false otherwise. `payload` and `whitelist` are unused.
bool tree_sitter_external_extra_tokens_external_scanner_scan(
  void *payload, TSLexer *lexer, const bool *whitelist) {
  // Skip leading spaces without including them in the token.
  while (lexer->lookahead == ' ') {
    lexer->advance(lexer, true);
  }

  if (lexer->lookahead == '#') {
    lexer->advance(lexer, false);

    // Consume up to the newline, but also stop at end-of-input
    // (lookahead == 0): the original loop only tested for '\n' and would
    // spin forever on an unterminated comment at the end of the file.
    while (lexer->lookahead != '\n' && lexer->lookahead != 0) {
      lexer->advance(lexer, false);
    }

    lexer->result_symbol = COMMENT;
    return true;
  }

  return false;
}

View file

@ -0,0 +1,22 @@
========================
simple external tokens
=========================
x + %(sup (external) scanner?)
---
(expression (sum (expression (identifier)) (expression (string))))
==================================
external tokens that require state
==================================
%{sup {} #{x + y} {} scanner?}
---
(expression (string
(expression (sum
(expression (identifier))
(expression (identifier))))))

View file

@ -0,0 +1,57 @@
{
"name": "external_tokens",
"externals": [
"_percent_string",
"_percent_string_start",
"_percent_string_end"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"string": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "_percent_string"},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_percent_string_start"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "_percent_string_end"}
]
        }
      ]
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
}

View file

@ -1,4 +1,3 @@
#include <stdbool.h>
#include <tree_sitter/parser.h>
enum {
@ -13,7 +12,7 @@ typedef struct {
uint32_t depth;
} Scanner;
void *tree_sitter_external_scanner_example_external_scanner_create() {
void *tree_sitter_external_tokens_external_scanner_create() {
Scanner *scanner = malloc(sizeof(Scanner));
*scanner = (Scanner){
.open_delimiter = 0,
@ -23,7 +22,17 @@ void *tree_sitter_external_scanner_example_external_scanner_create() {
return scanner;
}
bool tree_sitter_external_scanner_example_external_scanner_scan(
void tree_sitter_external_tokens_external_scanner_destroy(void *payload) {
free(payload);
}
void tree_sitter_external_tokens_external_scanner_reset(void *payload) {}
bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
bool tree_sitter_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
Scanner *scanner = payload;
@ -103,16 +112,3 @@ bool tree_sitter_external_scanner_example_external_scanner_scan(
return false;
}
void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) {
}
bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) {
return true;
}
void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) {
}
void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) {
free(payload);
}

View file

@ -0,0 +1,33 @@
========================
regexes
========================
/a+/
---
(expression (regex))
========================
conditionals
========================
(if (1) /a+/)
---
(expression (parenthesized (expression (conditional
(parenthesized (expression (number)))
(expression (regex))))))
========================
quotients
========================
((1) / 2)
---
(expression (parenthesized (expression (quotient
(expression (parenthesized (expression (number))))
(expression (number))))))

View file

@ -0,0 +1,65 @@
{
"name": "lexical_conflicts_due_to_state_merging",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "conditional"},
{"type": "SYMBOL", "name": "regex"},
{"type": "SYMBOL", "name": "quotient"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "parenthesized"}
]
},
"conditional": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "if"},
{"type": "SYMBOL", "name": "parenthesized"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"quotient": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "/"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"regex": {
"type": "PATTERN",
"value": "/[^/\n]+/"
},
"number": {
"type": "PATTERN",
"value": "\\d+"
},
"parenthesized": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
}
}

View file

@ -0,0 +1,20 @@
This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell if it is part of a `/` token or a `regex` by looking ahead only one character. But because these tokens are never valid in the same position, this doesn't cause any problem.
When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state.
If we weren't careful, this grammar would cause that to happen, because a `regex` is valid in this state:
```
(if (1) /\w+/)
^
```
and a `/` is valid in this state:
```
((1) / 2)
^
```
And these two states would otherwise be candidates for merging, because they both contain only the action `reduce(parenthesized, 3)`.

View file

@ -0,0 +1,15 @@
Unresolved conflict for symbol sequence:
identifier • '{' …
Possible interpretations:
1: (expression identifier) • '{' …
2: (function_call identifier • block)
Possible resolutions:
1: Specify a higher precedence in `function_call` than in the other rules.
2: Specify a higher precedence in `expression` than in the other rules.
3: Specify a left or right associativity in `expression`
4: Add a conflict for these rules: `expression` `function_call`

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_missing",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1,14 @@
This language has function calls similar to Ruby's, with no parentheses required, and optional blocks.
There is a shift/reduce conflict here:
```
foo bar { baz }
^
```
The possible actions are:
1. `reduce(expression, 1)` - `bar` is an expression being passed to the `foo` function.
2. `shift` - `bar` is a function being called with the block `{ baz }`
The grammars `precedence_on_single_child_negative` and `precedence_on_single_child_positive` show possible resolutions to this conflict.

View file

@ -0,0 +1,12 @@
===========================
function calls with blocks
===========================
foo bar { baz }
---
(expression (function_call
(identifier)
(expression (identifier))
(block (expression (identifier)))))

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_negative",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": -1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a negative precedence. This causes reducing the `bar` variable to an expression to be preferred over shifting the `{` token as part of `function_call`.

View file

@ -0,0 +1,13 @@
===========================
function calls with blocks
===========================
foo bar { baz }
---
(expression (function_call
(identifier)
(expression (function_call
(identifier)
(block (expression (identifier)))))))

View file

@ -0,0 +1,63 @@
{
"name": "precedence_on_single_child_positive",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

View file

@ -0,0 +1 @@
This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a positive precedence. This causes shifting the `{` token as part of `function_call` to be preferred over reducing the `bar` variable to an expression.

View file

@ -0,0 +1,24 @@
==========================================
curly brace blocks with high precedence
==========================================
a b {}
---
(expression (function_call
(identifier)
(expression (function_call (identifier) (block)))))
==========================================
do blocks with low precedence
==========================================
a b do end
---
(expression (function_call
(identifier)
(expression (identifier))
(do_block)))

View file

@ -0,0 +1,135 @@
{
"name": "precedence_on_subsequence",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "scope_resolution"}
]
}
},
"function_call": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
}
},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "do_block"}
]
}
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "do_block"}
]
}
}
]
}
]
},
"scope_resolution": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "::"},
              {"type": "SYMBOL", "name": "expression"}
            ]
}
]
}
},
"block": {
"type": "STRING",
"value": "{}"
},
"do_block": {
"type": "STRING",
"value": "do end"
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
}

3
spec/fixtures/test_grammars/readme.md vendored Normal file
View file

@ -0,0 +1,3 @@
These small grammars demonstrate specific features or test for certain specific regressions.
For some of them, compilation is expected to fail with a given error message. For others, the resulting parser is expected to produce certain trees.

View file

@ -0,0 +1,13 @@
==================================
the readme example
==================================
a + b * c
---
(expression (sum
(expression (variable))
(expression (product
(expression (variable))
(expression (variable))))))

View file

@ -0,0 +1,67 @@
{
"name": "readme_grammar",
// Things that can appear anywhere in the language, like comments
// and whitespace, are expressed as 'extras'.
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
// The first rule listed in the grammar becomes the 'start rule'.
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "variable"},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
]
},
// Tokens like '+' and '*' are described directly within the
  // grammar's rules, as opposed to in a separate lexer description.
"sum": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Ambiguities can be resolved at compile time by assigning precedence
// values to rule subtrees.
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Tokens can be specified using ECMAScript regexps.
"number": {"type": "PATTERN", "value": "\\d+"},
"comment": {"type": "PATTERN", "value": "#.*"},
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
}
}

View file

@ -0,0 +1,7 @@
========================
the empty string
=======================
---
(first_rule)

View file

@ -0,0 +1,6 @@
{
"name": "start_rule_is_blank",
"rules": {
"first_rule": {"type": "BLANK"}
}
}

View file

@ -0,0 +1,6 @@
===========================
the single token
==========================
the-value
---
(first_rule)

View file

@ -0,0 +1,6 @@
{
"name": "start_rule_is_token",
"rules": {
"first_rule": {"type": "STRING", "value": "the-value"}
}
}

View file

@ -0,0 +1,61 @@
#include "helpers/file_helpers.h"
#include <sys/stat.h>
#include <errno.h>
#include <fstream>
#include <dirent.h>
using std::string;
using std::ifstream;
using std::istreambuf_iterator;
using std::ofstream;
using std::vector;
// True when `path` names an existing filesystem entry, i.e. stat() succeeds.
bool file_exists(const string &path) {
  struct stat info;
  const int status = stat(path.c_str(), &info);
  return status == 0;
}
// Returns the last-modified time of `path` (as a unix timestamp), or 0 when
// the path cannot be stat'ed. A plain missing file (ENOENT) is silent; any
// other stat() failure is reported on stderr.
int get_modified_time(const string &path) {
  struct stat file_stat;
  if (stat(path.c_str(), &file_stat) != 0) {
    if (errno != ENOENT) {
      // The original passed `+ path.c_str()` here — a stray unary plus left
      // over from string concatenation; the argument is now passed directly.
      fprintf(stderr, "Error in stat() for path: %s\n", path.c_str());
    }
    return 0;
  }
  return file_stat.st_mtime;
}
// Slurps the entire file at `path` into a string (empty if it can't be read).
string read_file(const string &path) {
  ifstream stream(path);
  string text{istreambuf_iterator<char>(stream), istreambuf_iterator<char>()};
  stream.close();
  return text;
}
// Replaces the contents of the file at `path` with `content`, creating the
// file if needed.
void write_file(const string &path, const string &content) {
  ofstream out(path);
  out << content;
  out.close();
}
// Returns the names of the entries in directory `path`, excluding the "."
// and ".." pseudo-entries. On a missing directory, prints a test error and
// returns an empty list.
vector<string> list_directory(const string &path) {
  vector<string> entries;

  DIR *handle = opendir(path.c_str());
  if (handle == nullptr) {
    printf("\nTest error - no such directory '%s'", path.c_str());
    return entries;
  }

  for (struct dirent *entry = readdir(handle); entry != nullptr;
       entry = readdir(handle)) {
    const string name{entry->d_name};
    if (name != "." && name != "..") {
      entries.push_back(name);
    }
  }

  closedir(handle);
  return entries;
}

View file

@ -0,0 +1,14 @@
#ifndef HELPERS_FILE_HELPERS_H_
#define HELPERS_FILE_HELPERS_H_
#include <string>
#include <vector>
#include <sys/stat.h>
// Small filesystem helpers shared by the specs.
// True if `path` exists (stat() succeeds).
bool file_exists(const std::string &path);
// Last-modified time of `path` as a unix timestamp, or 0 if it cannot be stat'ed.
int get_modified_time(const std::string &path);
// Entire contents of the file at `path`.
std::string read_file(const std::string &path);
// Replaces the contents of the file at `path` with `content`.
void write_file(const std::string &path, const std::string &content);
// Entry names in directory `path`, excluding "." and "..".
std::vector<std::string> list_directory(const std::string &path);
#endif  // HELPERS_FILE_HELPERS_H_

View file

@ -1,12 +1,12 @@
#include "spec_helper.h"
#include "helpers/load_language.h"
#include "helpers/file_helpers.h"
#include <unistd.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <map>
#include <string>
#include <sys/stat.h>
#include <fstream>
#include <stdlib.h>
#include "tree_sitter/compiler.h"
@ -54,25 +54,10 @@ static std::string run_command(const char *cmd, const char *args[]) {
}
}
static bool file_exists(const string &path) {
struct stat file_stat;
return stat(path.c_str(), &file_stat) == 0;
}
static int get_modified_time(const string &path) {
struct stat file_stat;
if (stat(path.c_str(), &file_stat) != 0) {
if (errno != ENOENT)
fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str());
return 0;
}
return file_stat.st_mtime;
}
const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name,
string external_scanner_filename = "") {
static const TSLanguage *load_language(const string &source_filename,
const string &lib_filename,
const string &language_name,
string external_scanner_filename = "") {
string language_function_name = "tree_sitter_" + language_name;
string header_dir = getenv("PWD") + string("/include");
int source_mtime = get_modified_time(source_filename);
@ -132,9 +117,9 @@ const TSLanguage *load_language(const string &source_filename,
return reinterpret_cast<TSLanguage *(*)()>(language_function)();
}
const TSLanguage *load_compile_result(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
const TSLanguage *load_test_language(const string &name,
const TSCompileResult &compile_result,
string external_scanner_path) {
if (compile_result.error_type != TSCompileErrorTypeNone) {
Assert::Failure(string("Compilation failed ") + compile_result.error_message);
return nullptr;
@ -155,7 +140,7 @@ const TSLanguage *load_compile_result(const string &name,
return language;
}
const TSLanguage *get_test_language(const string &language_name) {
const TSLanguage *load_real_language(const string &language_name) {
if (loaded_languages[language_name])
return loaded_languages[language_name];
@ -182,20 +167,14 @@ const TSLanguage *get_test_language(const string &language_name) {
if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) {
printf("\n" "Regenerating the %s parser...\n", language_name.c_str());
ifstream grammar_file(grammar_filename);
istreambuf_iterator<char> grammar_file_iterator(grammar_file), end_iterator;
string grammar_json(grammar_file_iterator, end_iterator);
grammar_file.close();
string grammar_json = read_file(grammar_filename);
TSCompileResult result = ts_compile_grammar(grammar_json.c_str());
if (result.error_type != TSCompileErrorTypeNone) {
fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message);
return nullptr;
}
ofstream parser_file(parser_filename);
parser_file << result.code;
parser_file.close();
write_file(parser_filename, result.code);
}
mkdir("out/tmp", 0777);

View file

@ -5,8 +5,10 @@
#include "tree_sitter/runtime.h"
#include <string>
const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &,
std::string external_scanner_path = "");
const TSLanguage *get_test_language(const std::string &language_name);
const TSLanguage *load_real_language(const std::string &name);
const TSLanguage *load_test_language(const std::string &name,
const TSCompileResult &compile_result,
std::string external_scanner_path = "");
#endif // HELPERS_LOAD_LANGUAGE_H_

View file

@ -1,20 +1,18 @@
#include "helpers/read_test_entries.h"
#include <assert.h>
#include <string>
#include <fstream>
#include <streambuf>
#include <dirent.h>
#include <regex>
#include "helpers/file_helpers.h"
using std::regex;
using std::regex_search;
using std::regex_replace;
using std::smatch;
using std::regex_constants::extended;
using std::smatch;
using std::string;
using std::vector;
using std::ifstream;
using std::istreambuf_iterator;
string fixtures_dir = "spec/fixtures/";
static string trim_output(const string &input) {
string result(input);
@ -27,7 +25,7 @@ static string trim_output(const string &input) {
static vector<TestEntry> parse_test_entries(string content) {
regex header_pattern("===+\n" "([^=]+)\n" "===+\n", extended);
regex separator_pattern("---+\n", extended);
regex separator_pattern("---+\r?\n", extended);
vector<string> descriptions;
vector<string> bodies;
@ -55,51 +53,42 @@ static vector<TestEntry> parse_test_entries(string content) {
body.substr(0, matches.position() - 1),
trim_output(body.substr(matches.position() + matches[0].length()))
});
} else {
puts(("Invalid corpus entry with description: " + descriptions[i]).c_str());
abort();
}
}
return result;
}
static vector<string> list_directory(string dir_name) {
vector<string> result;
DIR *dir = opendir(dir_name.c_str());
if (!dir) {
printf("\nTest error - no such directory '%s'", dir_name.c_str());
return result;
}
struct dirent *dir_entry;
while ((dir_entry = readdir(dir))) {
string name(dir_entry->d_name);
if (name != "." && name != "..")
result.push_back(dir_name + "/" + name);
}
closedir(dir);
return result;
}
static string read_file(string filename) {
ifstream file(filename);
string result((istreambuf_iterator<char>(file)), istreambuf_iterator<char>());
return result;
}
vector<TestEntry> read_corpus_entries(string language_name) {
vector<TestEntry> read_real_language_corpus(string language_name) {
vector<TestEntry> result;
string fixtures_dir = "spec/fixtures/";
string test_directory = fixtures_dir + "grammars/" + language_name + "/grammar_test";
for (string &test_filename : list_directory(test_directory))
for (TestEntry &entry : parse_test_entries(read_file(test_filename)))
for (string &test_filename : list_directory(test_directory)) {
for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
result.push_back(entry);
}
}
string error_test_filename = fixtures_dir + "/error_corpus/" + language_name + "_errors.txt";
for (TestEntry &entry : parse_test_entries(read_file(error_test_filename)))
for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) {
result.push_back(entry);
}
return result;
}
vector<TestEntry> read_test_language_corpus(string language_name) {
vector<TestEntry> result;
string test_directory = fixtures_dir + "test_grammars/" + language_name;
for (string &test_filename : list_directory(test_directory)) {
for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) {
result.push_back(entry);
}
}
return result;
}

View file

@ -10,6 +10,7 @@ struct TestEntry {
std::string tree_string;
};
std::vector<TestEntry> read_corpus_entries(std::string directory);
std::vector<TestEntry> read_real_language_corpus(std::string name);
std::vector<TestEntry> read_test_language_corpus(std::string name);
#endif

View file

@ -1,6 +1,8 @@
#include "rule_helpers.h"
#include <memory>
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {
using std::make_shared;
@ -52,4 +54,9 @@ namespace tree_sitter {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type;
}
bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type && left.is_string == right.is_string;
}
}

View file

@ -15,7 +15,11 @@ namespace tree_sitter {
rule_ptr i_token(size_t index);
rule_ptr active_prec(int precedence, rule_ptr);
struct Variable;
struct LexicalVariable;
bool operator==(const Variable &left, const Variable &right);
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
}
#endif // HELPERS_RULE_HELPERS_H_

View file

@ -3,6 +3,7 @@
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/lex_item.h"
@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
}
ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
return stream << "{" << variable.name << ", " << variable.rule << ", " <<
to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
}
std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
return stream << string("#<advance ") + to_string(action.state_index) + ">";
}

View file

@ -93,10 +93,11 @@ using std::string;
using std::to_string;
struct Variable;
struct SyntaxVariable;
struct LexicalVariable;
struct AdvanceAction;
struct AcceptTokenAction;
class ParseAction;
class ParseState;
struct ParseAction;
struct ParseState;
struct ExternalToken;
struct ProductionStep;
struct PrecedenceRange;
@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const Variable &);
ostream &operator<<(ostream &, const SyntaxVariable &);
ostream &operator<<(ostream &, const LexicalVariable &);
ostream &operator<<(ostream &, const AdvanceAction &);
ostream &operator<<(ostream &, const AcceptTokenAction &);
ostream &operator<<(ostream &, const ParseAction &);
@ -119,8 +121,8 @@ namespace build_tables {
class LexItem;
class LexItemSet;
class ParseItem;
class ParseItemSet;
struct ParseItem;
struct ParseItemSet;
class LookaheadSet;
ostream &operator<<(ostream &, const LexItem &);

View file

@ -1,847 +0,0 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/dedent.h"
#include "compiler/util/string_helpers.h"
#include <map>
// Expand a template string: every `{{key}}` placeholder in `input` is
// replaced with the matching value from `parameters`.
static string fill_template(string input, map<string, string> parameters) {
  string expanded = input;
  for (const auto &entry : parameters) {
    const string placeholder = "{{" + entry.first + "}}";
    util::str_replace(&expanded, placeholder, entry.second);
  }
  return expanded;
}
START_TEST
describe("compile_grammar", []() {
TSDocument *document;
before_each([&]() {
document = ts_document_new();
});
after_each([&]() {
ts_document_free(document);
});
auto assert_root_node = [&](const string &expected_string) {
TSNode root_node = ts_document_root_node(document);
char *node_string = ts_node_string(root_node, document);
AssertThat(node_string, Equals(expected_string));
ts_free(node_string);
};
describe("conflicts", [&]() {
it("can resolve shift/reduce conflicts using associativities", [&]() {
string grammar_template = R"JSON({
"name": "associativity_example",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "math_operation"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"math_operation": {
"type": "{{math_operation_prec_type}}",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
// Ambiguity, which '+' applies first?
ts_document_set_input_string(document, "x+y+z");
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC"}
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
expression '+' expression '+'
Possible interpretations:
1: (math_operation expression '+' expression) '+'
2: expression '+' (math_operation expression '+' expression)
Possible resolutions:
1: Specify a left or right associativity in `math_operation`
2: Add a conflict for these rules: `math_operation`
)MESSAGE")));
result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC_LEFT"}
}).c_str());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (math_operation "
"(expression (math_operation (expression (identifier)) (expression (identifier)))) "
"(expression (identifier))))");
result = ts_compile_grammar(fill_template(grammar_template, {
{"math_operation_prec_type", "PREC_RIGHT"}
}).c_str());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (math_operation "
"(expression (identifier)) "
"(expression (math_operation (expression (identifier)) (expression (identifier))))))");
});
it("can resolve shift/reduce conflicts involving single-child rules using precedence", [&]() {
string grammar_template = R"JSON({
"name": "associativity_example",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"function_call": {
"type": "PREC_RIGHT",
"value": {{function_call_precedence}},
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
]
}
},
"block": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "{"},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "}"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
// Ambiguity: is the trailing block associated with `bar` or `foo`?
ts_document_set_input_string(document, "foo bar { baz }");
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "0"}
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
identifier '{'
Possible interpretations:
1: (expression identifier) '{'
2: (function_call identifier block)
Possible resolutions:
1: Specify a higher precedence in `function_call` than in the other rules.
2: Specify a higher precedence in `expression` than in the other rules.
3: Specify a left or right associativity in `expression`
4: Add a conflict for these rules: `expression` `function_call`
)MESSAGE")));
// Giving function calls lower precedence than expressions causes `bar`
// to be treated as an expression passed to `foo`, not as a function
// that's being called with a block.
result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "-1"}
}).c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (identifier)) "
"(block (expression (identifier)))))");
// Giving function calls higher precedence than expressions causes `bar`
// to be treated as a function that's being called with a block, not as
// an expression passed to `foo`.
result = ts_compile_grammar(fill_template(grammar_template, {
{"function_call_precedence", "1"}
}).c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result("associativity_example", result));
ts_document_set_input_string(document, "foo bar { baz }");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (function_call "
"(identifier) "
"(block (expression (identifier)))))))");
});
it("handles precedence applied to specific rule subsequences (regression)", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON({
"name": "precedence_on_subsequence",
"extras": [
{"type": "STRING", "value": " "}
],
"rules": {
"expression": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "function_call"},
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "scope_resolution"}
]
}
},
"function_call": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "block"}
]
}
},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{"type": "SYMBOL", "name": "do_block"}
]
}
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "block"}
]
}
}
]
},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "identifier"},
{
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "do_block"}
]
}
}
]
}
]
},
"scope_resolution": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"}
]
},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "expression"},
]
}
]
}
},
"block": {
"type": "STRING",
"value": "{}"
},
"do_block": {
"type": "STRING",
"value": "do end"
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON");
auto language = load_compile_result("precedence_on_subsequence", result);
ts_document_set_language(document, language);
ts_document_set_input_string(document, "a b {}");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (function_call (identifier) (block)))))");
ts_document_set_input_string(document, "a b do end");
ts_document_parse(document);
assert_root_node("(expression (function_call "
"(identifier) "
"(expression (identifier)) "
"(do_block)))");
});
it("does not allow conflicting precedences", [&]() {
string grammar_template = R"JSON({
"name": "conflicting_precedence_example",
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "other_thing"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"product": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"other_thing": {
"type": "PREC_LEFT",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "STRING", "value": "*"}
]
}
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
}
})JSON";
TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, {
}).c_str());
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
Unresolved conflict for symbol sequence:
expression '+' expression '*'
Possible interpretations:
1: (sum expression '+' expression) '*'
2: expression '+' (product expression '*' expression)
3: expression '+' (other_thing expression '*' '*')
Possible resolutions:
1: Specify a higher precedence in `product` and `other_thing` than in the other rules.
2: Specify a higher precedence in `sum` than in the other rules.
3: Add a conflict for these rules: `sum` `product` `other_thing`
)MESSAGE")));
});
});
describe("when the grammar contains rules that match the empty string", [&]() {
it("reports an error", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "empty_rules",
"rules": {
"rule_1": {"type": "SYMBOL", "name": "rule_2"},
"rule_2": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "rule_1"},
{"type": "BLANK"}
]
}
}
}
)JSON");
AssertThat(result.error_message, Equals(dedent(R"MESSAGE(
The rule `rule_2` matches the empty string.
Tree-sitter currently does not support syntactic rules that match the empty string.
)MESSAGE")));
});
});
describe("external scanners", [&]() {
it("can tokenize using arbitrary user-defined scanner functions", [&]() {
string grammar = R"JSON({
"name": "external_scanner_example",
"externals": [
"_percent_string",
"_percent_string_start",
"_percent_string_end"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"string": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "_percent_string"},
{
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_percent_string_start"},
{"type": "SYMBOL", "name": "expression"},
{"type": "SYMBOL", "name": "_percent_string_end"}
]
},
]
},
"identifier": {
"type": "PATTERN",
"value": "\\a+"
}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"external_scanner_example",
result,
"spec/fixtures/external_scanners/percent_strings.c"
));
ts_document_set_input_string(document, "x + %(sup (external) scanner?)");
ts_document_parse(document);
assert_root_node("(expression (sum (expression (identifier)) (expression (string))))");
ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}");
ts_document_parse(document);
assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))");
});
it("allows external scanners to refer to tokens that are defined internally", [&]() {
string grammar = R"JSON({
"name": "shared_external_tokens",
"externals": [
"string",
"line_break"
],
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "line_break"}
]
},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "string"},
{"type": "SYMBOL", "name": "variable"},
{"type": "SYMBOL", "name": "number"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"},
"number": {"type": "PATTERN", "value": "\\d+"},
"line_break": {"type": "STRING", "value": "\n"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"shared_external_tokens",
result,
"spec/fixtures/external_scanners/shared_external_tokens.c"
));
ts_document_set_input_string(document, "a b\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "a \nb\n");
ts_document_parse(document);
assert_root_node("(statement (variable) (variable) (line_break))");
ts_document_set_input_string(document, "'hello' 'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
ts_document_set_input_string(document, "'hello' \n'world'\n");
ts_document_parse(document);
assert_root_node("(statement (string) (string) (line_break))");
});
it("allows external tokens to be used as extras", [&]() {
string grammar = R"JSON({
"name": "extra_external_tokens",
"externals": [
"comment"
],
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"assignment": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "variable"},
{"type": "STRING", "value": "="},
{"type": "SYMBOL", "name": "variable"}
]
},
"variable": {"type": "PATTERN", "value": "\\a+"}
}
})JSON";
TSCompileResult result = ts_compile_grammar(grammar.c_str());
AssertThat(result.error_message, IsNull());
ts_document_set_language(document, load_compile_result(
"extra_external_tokens",
result,
"spec/fixtures/external_scanners/extra_external_tokens.c"
));
ts_document_set_input_string(document, "x = # a comment\n y");
ts_document_parse(document);
assert_root_node("(assignment (variable) (comment) (variable))");
});
});
describe("when the grammar's start symbol is a token", [&]() {
it("parses the token", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "one_token_language",
"rules": {
"first_rule": {"type": "STRING", "value": "the-value"}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("one_token_language", result));
ts_document_set_input_string(document, "the-value");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("when the grammar's start symbol is blank", [&]() {
it("parses the empty string", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "blank_language",
"rules": {
"first_rule": {"type": "BLANK"}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("blank_language", result));
ts_document_set_input_string(document, "");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("when the grammar contains anonymous tokens with escaped characters", [&]() {
it("escapes the escaped characters properly in the generated parser", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "escaped_char_language",
"rules": {
"first_rule": {
"type": "CHOICE",
"members": [
{"type": "STRING", "value": "\n"},
{"type": "STRING", "value": "\r"},
{"type": "STRING", "value": "'hello'"},
{"type": "PATTERN", "value": "\\d+"}
]
}
}
}
)JSON");
ts_document_set_language(document, load_compile_result("escaped_char_language", result));
ts_document_set_input_string(document, "1234");
ts_document_parse(document);
assert_root_node("(first_rule)");
ts_document_set_input_string(document, "\n");
ts_document_parse(document);
assert_root_node("(first_rule)");
ts_document_set_input_string(document, "'hello'");
ts_document_parse(document);
assert_root_node("(first_rule)");
});
});
describe("the grammar in the README", [&]() {
it("parses the input in the README", [&]() {
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "arithmetic",
// Things that can appear anywhere in the language, like comments
// and whitespace, are expressed as 'extras'.
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
// The first rule listed in the grammar becomes the 'start rule'.
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "variable"},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
]
},
// Tokens like '+' and '*' are described directly within the
// grammar's rules, as opposed to in a seperate lexer description.
"sum": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Ambiguities can be resolved at compile time by assigning precedence
// values to rule subtrees.
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Tokens can be specified using ECMAScript regexps.
"number": {"type": "PATTERN", "value": "\\d+"},
"comment": {"type": "PATTERN", "value": "#.*"},
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
}
}
)JSON");
const TSLanguage *language = load_compile_result("arithmetic", result);
ts_document_set_language(document, language);
ts_document_set_input_string(document, "a + b * c");
ts_document_parse(document);
assert_root_node(
"(expression (sum "
"(expression (variable)) "
"(expression (product "
"(expression (variable)) "
"(expression (variable))))))");
});
});
});
END_TEST

View file

@ -1,185 +0,0 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/read_test_entries.h"
#include "helpers/spy_input.h"
#include "helpers/stderr_logger.h"
#include "helpers/point_helpers.h"
#include "helpers/encoding_helpers.h"
#include "helpers/record_alloc.h"
#include "helpers/random_helpers.h"
#include "helpers/scope_sequence.h"
#include <set>
// Assert that the document's parse tree, rendered as an S-expression
// string, matches the expected `tree_string`.
static void assert_correct_tree_shape(const TSDocument *document, string tree_string) {
TSNode root_node = ts_document_root_node(document);
const char *node_string = ts_node_string(root_node, document);
string result(node_string);
// ts_node_string hands back a heap string the caller must release.
ts_free((void *)node_string);
AssertThat(result, Equals(tree_string));
}
// Recursively verify structural invariants of `node` and its subtree:
// - each node's byte and point range runs forward (start <= end);
// - children appear in order and do not overlap one another;
// - the parent's range contains all of its children;
// - if any child has changes, the parent is also marked as changed.
static void assert_consistent_sizes(TSNode node) {
size_t child_count = ts_node_child_count(node);
size_t start_byte = ts_node_start_byte(node);
size_t end_byte = ts_node_end_byte(node);
TSPoint start_point = ts_node_start_point(node);
TSPoint end_point = ts_node_end_point(node);
bool some_child_has_changes = false;
AssertThat(start_byte, !IsGreaterThan(end_byte));
AssertThat(start_point, !IsGreaterThan(end_point));
size_t last_child_end_byte = start_byte;
TSPoint last_child_end_point = start_point;
for (size_t i = 0; i < child_count; i++) {
TSNode child = ts_node_child(node, i);
size_t child_start_byte = ts_node_start_byte(child);
TSPoint child_start_point = ts_node_start_point(child);
// Each child must start at or after the previous child's end.
AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
AssertThat(child_start_point, !IsLessThan(last_child_end_point));
assert_consistent_sizes(child);
if (ts_node_has_changes(child))
some_child_has_changes = true;
last_child_end_byte = ts_node_end_byte(child);
last_child_end_point = ts_node_end_point(child);
}
if (child_count > 0) {
// The parent must extend at least as far as its last child.
AssertThat(end_byte, !IsLessThan(last_child_end_byte));
AssertThat(end_point, !IsLessThan(last_child_end_point));
}
if (some_child_has_changes) {
AssertThat(ts_node_has_changes(node), IsTrue());
}
}
// Verify that the root node spans the full document content, then check
// the whole tree with assert_consistent_sizes.
static void assert_correct_tree_size(TSDocument *document, string content) {
TSNode root_node = ts_document_root_node(document);
size_t expected_size = content.size();
// In the JSON grammar, the start rule (`_value`) is hidden, so the node
// returned from `ts_document_root_node` (e.g. an `object` node), does not
// actually point to the root of the tree. In this weird case, trailing
// whitespace is not included in the root node's size.
//
// TODO: Fix this inconsistency. Maybe disallow the start rule being hidden?
if (ts_document_language(document) == get_test_language("json") &&
string(ts_node_type(root_node, document)) != "ERROR")
expected_size = content.find_last_not_of("\n ") + 1;
AssertThat(ts_node_end_byte(root_node), Equals(expected_size));
assert_consistent_sizes(root_node);
}
START_TEST
describe("The Corpus", []() {
vector<string> test_languages({
"javascript",
"json",
"c",
"cpp",
"python",
});
for (auto &language_name : test_languages) {
describe(("the " + language_name + " language").c_str(), [&]() {
TSDocument *document;
before_each([&]() {
record_alloc::start();
document = ts_document_new();
ts_document_set_language(document, get_test_language(language_name));
// ts_document_set_logger(document, stderr_logger_new(true));
// ts_document_print_debugging_graphs(document, true);
});
after_each([&]() {
ts_document_free(document);
AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
});
for (auto &entry : read_corpus_entries(language_name)) {
SpyInput *input;
auto it_handles_edit_sequence = [&](string name, std::function<void()> edit_sequence){
it(("parses " + entry.description + ": " + name).c_str(), [&]() {
input = new SpyInput(entry.input, 3);
ts_document_set_input(document, input->input());
edit_sequence();
assert_correct_tree_shape(document, entry.tree_string);
assert_correct_tree_size(document, input->content);
delete input;
});
};
it_handles_edit_sequence("initial parse", [&]() {
ts_document_parse(document);
});
std::set<std::pair<size_t, size_t>> deletions;
std::set<std::pair<size_t, string>> insertions;
for (size_t i = 0; i < 60; i++) {
size_t edit_position = random() % utf8_char_count(entry.input);
size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position);
string inserted_text = random_words(random() % 4 + 1);
if (insertions.insert({edit_position, inserted_text}).second) {
string description = "\"" + inserted_text + "\" at " + to_string(edit_position);
it_handles_edit_sequence("repairing an insertion of " + description, [&]() {
ts_document_edit(document, input->replace(edit_position, 0, inserted_text));
ts_document_parse(document);
assert_correct_tree_size(document, input->content);
ts_document_edit(document, input->undo());
assert_correct_tree_size(document, input->content);
TSRange *ranges;
uint32_t range_count;
ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
verify_changed_ranges(old_scope_sequence, new_scope_sequence,
input->content, ranges, range_count);
ts_free(ranges);
});
}
if (deletions.insert({edit_position, deletion_size}).second) {
string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size);
it_handles_edit_sequence("repairing a deletion of " + desription, [&]() {
ts_document_edit(document, input->replace(edit_position, deletion_size, ""));
ts_document_parse(document);
assert_correct_tree_size(document, input->content);
ts_document_edit(document, input->undo());
assert_correct_tree_size(document, input->content);
TSRange *ranges;
uint32_t range_count;
ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
verify_changed_ranges(old_scope_sequence, new_scope_sequence,
input->content, ranges, range_count);
ts_free(ranges);
});
}
}
}
});
}
});
END_TEST

View file

@ -0,0 +1,181 @@
#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/load_language.h"
#include "helpers/read_test_entries.h"
#include "helpers/spy_input.h"
#include "helpers/stderr_logger.h"
#include "helpers/point_helpers.h"
#include "helpers/encoding_helpers.h"
#include "helpers/record_alloc.h"
#include "helpers/random_helpers.h"
#include "helpers/scope_sequence.h"
#include <set>
// Recursively verify structural invariants of `node` and its subtree:
// - each node's byte and point range runs forward (start <= end);
// - children appear in order and do not overlap one another;
// - the parent's range contains all of its children;
// - if any child has changes, the parent is also marked as changed.
static void assert_consistent_sizes(TSNode node) {
size_t child_count = ts_node_child_count(node);
size_t start_byte = ts_node_start_byte(node);
size_t end_byte = ts_node_end_byte(node);
TSPoint start_point = ts_node_start_point(node);
TSPoint end_point = ts_node_end_point(node);
bool some_child_has_changes = false;
AssertThat(start_byte, !IsGreaterThan(end_byte));
AssertThat(start_point, !IsGreaterThan(end_point));
size_t last_child_end_byte = start_byte;
TSPoint last_child_end_point = start_point;
for (size_t i = 0; i < child_count; i++) {
TSNode child = ts_node_child(node, i);
size_t child_start_byte = ts_node_start_byte(child);
TSPoint child_start_point = ts_node_start_point(child);
// Each child must start at or after the previous child's end.
AssertThat(child_start_byte, !IsLessThan(last_child_end_byte));
AssertThat(child_start_point, !IsLessThan(last_child_end_point));
assert_consistent_sizes(child);
if (ts_node_has_changes(child))
some_child_has_changes = true;
last_child_end_byte = ts_node_end_byte(child);
last_child_end_point = ts_node_end_point(child);
}
if (child_count > 0) {
// The parent must extend at least as far as its last child.
AssertThat(end_byte, !IsLessThan(last_child_end_byte));
AssertThat(end_point, !IsLessThan(last_child_end_point));
}
if (some_child_has_changes) {
AssertThat(ts_node_has_changes(node), IsTrue());
}
}
// Verify that the root node spans the full document content, then check
// the whole tree with assert_consistent_sizes.
static void assert_correct_tree_size(TSDocument *document, string content) {
TSNode root_node = ts_document_root_node(document);
size_t expected_size = content.size();
// In the JSON grammar, the start rule (`_value`) is hidden, so the node
// returned from `ts_document_root_node` (e.g. an `object` node), does not
// actually point to the root of the tree. In this weird case, trailing
// whitespace is not included in the root node's size.
//
// TODO: Fix this inconsistency. Maybe disallow the start rule being hidden?
if (ts_document_language(document) == load_real_language("json") &&
string(ts_node_type(root_node, document)) != "ERROR")
expected_size = content.find_last_not_of("\n ") + 1;
AssertThat(ts_node_end_byte(root_node), Equals(expected_size));
assert_consistent_sizes(root_node);
}
START_TEST

// Randomized incremental-parsing tests for the bundled real-world grammars.
// Each corpus entry is parsed from scratch, then repeatedly edited (random
// insertions and deletions), re-parsed, un-edited, and re-parsed again,
// asserting the tree always returns to the expected shape and that the
// reported changed ranges agree with the recomputed scope sequences.
vector<string> test_languages({
  "javascript",
  "json",
  "c",
  "cpp",
  "python",
});

for (auto &language_name : test_languages) {
  describe(("the " + language_name + " language").c_str(), [&]() {
    TSDocument *document;

    before_each([&]() {
      // Track allocations so after_each can assert that none leaked.
      record_alloc::start();
      document = ts_document_new();
      ts_document_set_language(document, load_real_language(language_name));
      // ts_document_set_logger(document, stderr_logger_new(true));
      // ts_document_print_debugging_graphs(document, true);
    });

    after_each([&]() {
      ts_document_free(document);
      AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
    });

    for (auto &entry : read_real_language_corpus(language_name)) {
      SpyInput *input;

      // Registers one `it` example: run the given edit sequence, then assert
      // the final tree matches the corpus entry's expected tree string and
      // that the tree's reported size is consistent with the text.
      auto it_handles_edit_sequence = [&](string name, std::function<void()> edit_sequence){
        it(("parses " + entry.description + ": " + name).c_str(), [&]() {
          input = new SpyInput(entry.input, 3);
          ts_document_set_input(document, input->input());
          edit_sequence();

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));
          assert_correct_tree_size(document, input->content);
          delete input;
        });
      };

      it_handles_edit_sequence("initial parse", [&]() {
        ts_document_parse(document);
      });

      // De-duplicate the randomly generated edits so each registered example
      // is unique.
      std::set<std::pair<size_t, size_t>> deletions;
      std::set<std::pair<size_t, string>> insertions;

      for (size_t i = 0; i < 60; i++) {
        size_t edit_position = random() % utf8_char_count(entry.input);
        size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position);
        string inserted_text = random_words(random() % 4 + 1);

        if (insertions.insert({edit_position, inserted_text}).second) {
          string description = "\"" + inserted_text + "\" at " + to_string(edit_position);
          it_handles_edit_sequence("repairing an insertion of " + description, [&]() {
            // Apply the edit, parse, undo the edit, and verify the changed
            // ranges reported for the repair parse.
            ts_document_edit(document, input->replace(edit_position, 0, inserted_text));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }

        if (deletions.insert({edit_position, deletion_size}).second) {
          // NOTE(review): `desription` is misspelled; harmless local variable.
          string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size);
          it_handles_edit_sequence("repairing a deletion of " + desription, [&]() {
            // Same undo/redo round-trip as above, but for a deletion edit.
            ts_document_edit(document, input->replace(edit_position, deletion_size, ""));
            ts_document_parse(document);
            assert_correct_tree_size(document, input->content);

            ts_document_edit(document, input->undo());
            assert_correct_tree_size(document, input->content);

            TSRange *ranges;
            uint32_t range_count;
            ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content);
            ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count);
            ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content);
            verify_changed_ranges(old_scope_sequence, new_scope_sequence,
                                  input->content, ranges, range_count);
            ts_free(ranges);
          });
        }
      }
    }
  });
}

END_TEST

View file

@ -0,0 +1,78 @@
#include "spec_helper.h"
#include "helpers/read_test_entries.h"
#include "helpers/load_language.h"
#include "helpers/stderr_logger.h"
#include "helpers/file_helpers.h"
#include "runtime/alloc.h"
START_TEST

// Compile and exercise each grammar found under spec/fixtures/test_grammars.
// A grammar directory may contain:
//   - grammar.json        (required) the grammar to compile
//   - scanner.c           (optional) an external scanner implementation
//   - expected_error.txt  (optional) if present, compilation must fail with
//                         exactly this message and no parsing is attempted
//   - corpus.txt          parse examples, loaded via read_test_language_corpus
string grammars_dir_path = "spec/fixtures/test_grammars";
vector<string> test_languages = list_directory(grammars_dir_path);

for (auto &language_name : test_languages) {
  if (language_name == "readme.md") continue;

  describe(("test language: " + language_name).c_str(), [&]() {
    string directory_path = grammars_dir_path + "/" + language_name;
    string grammar_path = directory_path + "/grammar.json";
    string external_scanner_path = directory_path + "/scanner.c";
    string expected_error_path = directory_path + "/expected_error.txt";
    // (removed unused local `corpus_path`; the corpus is read by name below)

    if (!file_exists(external_scanner_path)) {
      external_scanner_path = "";
    }

    string grammar_json = read_file(grammar_path);
    TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str());

    if (file_exists(expected_error_path)) {
      // This grammar is expected to be rejected by the compiler.
      it("fails with the correct error message", [&]() {
        string expected_error = read_file(expected_error_path);
        AssertThat((void *)compile_result.error_message, !IsNull());
        AssertThat(compile_result.error_message, Equals(expected_error));
      });
      return;
    } else {
      TSDocument *document = nullptr;
      const TSLanguage *language = nullptr;

      before_each([&]() {
        // Load (and, on first use, compile) the language lazily, once per
        // grammar, instead of once per example.
        if (!language) {
          language = load_test_language(
            language_name,
            compile_result,
            external_scanner_path
          );
        }

        document = ts_document_new();
        ts_document_set_language(document, language);
        // ts_document_set_logger(document, stderr_logger_new(true));
        // ts_document_print_debugging_graphs(document, true);
      });

      after_each([&]() {
        if (document) ts_document_free(document);
      });

      for (auto &entry : read_test_language_corpus(language_name)) {
        it(("parses " + entry.description).c_str(), [&]() {
          ts_document_set_input_string_with_length(document, entry.input.c_str(), entry.input.size());
          ts_document_parse(document);

          TSNode root_node = ts_document_root_node(document);
          const char *node_string = ts_node_string(root_node, document);
          string result(node_string);
          ts_free((void *)node_string);
          AssertThat(result, Equals(entry.tree_string));
        });
      }
    }
  });
}

END_TEST

View file

@ -43,7 +43,7 @@ describe("Document", [&]() {
before_each([&]() {
spy_input = new SpyInput("{\"key\": [null, 2]}", 3);
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, "{\"key\": [1, 2]}");
ts_document_parse(document);
@ -152,7 +152,7 @@ describe("Document", [&]() {
});
it("uses the given language for future parses", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_parse(document);
root = ts_document_root_node(document);
@ -162,10 +162,10 @@ describe("Document", [&]() {
});
it("clears out any previous tree", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_parse(document);
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
AssertThat(ts_document_root_node(document).data, Equals<void *>(nullptr));
ts_document_parse(document);
@ -177,7 +177,7 @@ describe("Document", [&]() {
});
it("does not allow setting a language with a different version number", [&]() {
TSLanguage language = *get_test_language("json");
TSLanguage language = *load_real_language("json");
AssertThat(ts_language_version(&language), Equals<uint32_t>(TREE_SITTER_LANGUAGE_VERSION));
language.version++;
@ -193,7 +193,7 @@ describe("Document", [&]() {
before_each([&]() {
logger = new SpyLogger();
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, "[1, 2]");
});
@ -235,7 +235,7 @@ describe("Document", [&]() {
SpyInput *input;
before_each([&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
input = new SpyInput("{a: null};", 3);
ts_document_set_input(document, input->input());
ts_document_parse(document);

View file

@ -40,7 +40,7 @@ describe("Node", []() {
record_alloc::start();
document = ts_document_new();
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
ts_document_set_input_string(document, input_string.c_str());
ts_document_parse(document);

View file

@ -83,7 +83,7 @@ describe("Parser", [&]() {
describe("handling errors", [&]() {
describe("when there is an invalid substring right before a valid token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, @@@@@, true]");
assert_root_node(
@ -108,7 +108,7 @@ describe("Parser", [&]() {
describe("when there is an unexpected string in the middle of a token", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, faaaaalse, true]");
assert_root_node(
@ -134,7 +134,7 @@ describe("Parser", [&]() {
describe("when there is one unexpected token between two valid tokens", [&]() {
it("computes the error node's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, true false, true]");
assert_root_node(
@ -153,7 +153,7 @@ describe("Parser", [&]() {
describe("when there is an unexpected string at the end of a token", [&]() {
it("computes the error's size and position correctly", [&]() {
ts_document_set_language(document, get_test_language("json"));
ts_document_set_language(document, load_real_language("json"));
set_text(" [123, \"hi\n, true]");
assert_root_node(
@ -163,7 +163,7 @@ describe("Parser", [&]() {
describe("when there is an unterminated error", [&]() {
it("maintains a consistent tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("a; /* b");
assert_root_node(
"(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))");
@ -172,7 +172,7 @@ describe("Parser", [&]() {
describe("when there are extra tokens at the end of the viable prefix", [&]() {
it("does not include them in the error node", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"var x;\n"
"\n"
@ -192,7 +192,7 @@ describe("Parser", [&]() {
describe("handling extra tokens", [&]() {
describe("when the token appears as part of a grammar rule", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("fn()\n");
assert_root_node(
@ -202,7 +202,7 @@ describe("Parser", [&]() {
describe("when the token appears somewhere else", [&]() {
it("incorporates it into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"fn()\n"
" .otherFn();");
@ -218,7 +218,7 @@ describe("Parser", [&]() {
describe("when several extra tokens appear in a row", [&]() {
it("incorporates them into the tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text(
"fn()\n\n"
"// This is a comment"
@ -239,7 +239,7 @@ describe("Parser", [&]() {
describe("editing", [&]() {
describe("creating new tokens near the end of the input", [&]() {
it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("x * (100 + abc);");
assert_root_node(
@ -262,7 +262,7 @@ describe("Parser", [&]() {
it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
chunk_size = 2;
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 + 456 * (10 + x);");
assert_root_node(
@ -285,7 +285,7 @@ describe("Parser", [&]() {
describe("introducing an error", [&]() {
it("gives the error the right size", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("var x = y;");
assert_root_node(
@ -308,7 +308,7 @@ describe("Parser", [&]() {
describe("into the middle of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("abc * 123;");
assert_root_node(
@ -327,7 +327,7 @@ describe("Parser", [&]() {
describe("at the end of an existing token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("abc * 123;");
assert_root_node(
@ -346,7 +346,7 @@ describe("Parser", [&]() {
describe("inserting text into a node containing a extra token", [&]() {
it("updates the parse tree", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 *\n"
"// a-comment\n"
"abc;");
@ -373,7 +373,7 @@ describe("Parser", [&]() {
describe("when a critical token is removed", [&]() {
it("updates the parse tree, creating an error", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("123 * 456; 789 * 123;");
assert_root_node(
@ -392,7 +392,7 @@ describe("Parser", [&]() {
describe("with external tokens", [&]() {
it("maintains the external scanner's state during incremental parsing", [&]() {
ts_document_set_language(document, get_test_language("python"));
ts_document_set_language(document, load_real_language("python"));
string text = dedent(R"PYTHON(
if a:
print b
@ -420,7 +420,7 @@ describe("Parser", [&]() {
});
it("does not try to re-use nodes that are within the edited region", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("{ x: (b.c) };");
assert_root_node(
@ -435,7 +435,7 @@ describe("Parser", [&]() {
});
it("updates the document's parse count", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
AssertThat(ts_document_parse_count(document), Equals<size_t>(0));
set_text("{ x: (b.c) };");
@ -449,7 +449,7 @@ describe("Parser", [&]() {
describe("lexing", [&]() {
describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
it("terminates them at the end of the document", [&]() {
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("x; // this is a comment");
assert_root_node(
@ -464,7 +464,7 @@ describe("Parser", [&]() {
it("recognizes UTF8 characters as single characters", [&]() {
// 'ΩΩΩ — ΔΔ';
ts_document_set_language(document, get_test_language("javascript"));
ts_document_set_language(document, load_real_language("javascript"));
set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';");
assert_root_node(

View file

@ -1,195 +0,0 @@
#include "compiler/build_tables/build_lex_table.h"
#include <climits>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/blank.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::set;
using std::string;
using std::vector;
using std::make_shared;
using std::unordered_map;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;
// Builds the lexical state machine for a grammar. For every state in the
// parse table, a lex state is created that recognizes exactly the terminals
// valid in that parse state (each optionally preceded by separators).
// Identical lex item sets share one lex state, and duplicate lex states are
// merged after construction.
class LexTableBuilder {
  LexTable lex_table;
  ParseTable *parse_table;
  const LexicalGrammar lex_grammar;

  // One `Repeat` rule per separator, plus a trailing `Blank` so a token may
  // also begin with no separator at all.
  vector<rule_ptr> separator_rules;
  LexConflictManager conflict_manager;

  // Cache mapping each lex item set to the id of the state built for it.
  unordered_map<LexItemSet, LexStateId> lex_state_ids;

 public:
  LexTableBuilder(ParseTable *parse_table, const LexicalGrammar &lex_grammar)
      : parse_table(parse_table), lex_grammar(lex_grammar) {
    for (const rule_ptr &rule : lex_grammar.separators)
      separator_rules.push_back(Repeat::build(rule));
    separator_rules.push_back(Blank::build());
  }

  // Build one lex state per parse state, then post-process the table.
  LexTable build() {
    for (ParseState &parse_state : parse_table->states)
      add_lex_state_for_parse_state(&parse_state);
    mark_fragile_tokens();
    remove_duplicate_lex_states();
    return lex_table;
  }

 private:
  // Point the parse state at the lex state for its expected terminals.
  void add_lex_state_for_parse_state(ParseState *parse_state) {
    parse_state->lex_state_id =
      add_lex_state(item_set_for_terminals(parse_state->terminal_entries));
  }

  // Return the id of the lex state for `item_set`, creating it (and,
  // recursively via add_advance_actions, its successor states) on first use.
  LexStateId add_lex_state(const LexItemSet &item_set) {
    const auto &pair = lex_state_ids.find(item_set);
    if (pair == lex_state_ids.end()) {
      LexStateId state_id = lex_table.add_state();
      lex_state_ids[item_set] = state_id;
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      return state_id;
    } else {
      return pair->second;
    }
  }

  // For each character-set transition out of `item_set`, add an advance
  // action — unless the conflict manager decides the state's existing
  // accept action takes priority over advancing.
  void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const auto &pair : item_set.transitions()) {
      const CharacterSet &characters = pair.first;
      const LexItemSet::Transition &transition = pair.second;
      AdvanceAction action(-1, transition.precedence, transition.in_main_token);

      auto current_action = lex_table.state(state_id).accept_action;
      if (conflict_manager.resolve(transition.destination, action,
                                   current_action)) {
        action.state_index = add_lex_state(transition.destination);
        lex_table.state(state_id).advance_actions[characters] = action;
      }
    }
  }

  // For every item that is complete in this state, record an accept-token
  // action; conflicts between competing complete tokens are resolved by the
  // conflict manager (precedence, string-ness, symbol index).
  void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                 completion_status.is_string);
        auto current_action = lex_table.state(state_id).accept_action;
        if (conflict_manager.resolve(action, current_action))
          lex_table.state(state_id).accept_action = action;
      }
    }
  }

  // Using the conflicts recorded by the conflict manager during the build,
  // mark parse-table entries whose tokens cannot be safely reused during
  // incremental parsing.
  void mark_fragile_tokens() {
    for (ParseState &state : parse_table->states) {
      for (auto &entry : state.terminal_entries) {
        Symbol symbol = entry.first;
        if (symbol.is_token()) {
          // Homonyms: tokens the conflict manager saw competing with this
          // one to accept in the same lex state.
          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
          if (homonyms != conflict_manager.possible_homonyms.end())
            for (Symbol::Index homonym : homonyms->second)
              if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
                entry.second.reusable = false;
                break;
              }

          if (!entry.second.reusable)
            continue;

          // Extensions: tokens whose match may continue past this token's
          // accept point, per the conflict manager.
          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
          if (extensions != conflict_manager.possible_extensions.end())
            for (Symbol::Index extension : extensions->second)
              if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
                entry.second.depends_on_lookahead = true;
                break;
              }
        }
      }
    }
  }

  // Clear accept-action metadata that is irrelevant at runtime (string-ness
  // and precedence), merge lex states that thereby became identical, and fix
  // up the parse states' lex_state_id references.
  void remove_duplicate_lex_states() {
    for (LexState &state : lex_table.states) {
      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
    }

    auto replacements =
      remove_duplicate_states<LexTable>(&lex_table);

    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
      if (replacement != replacements.end())
        parse_state.lex_state_id = replacement->second;
    }
  }

  // Build the starting lex item set for a parse state: one item per
  // (token-rule alternative x separator rule), with the separator sequenced
  // before the main token and tagged via Metadata.
  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
    LexItemSet result;
    for (const auto &pair : terminals) {
      Symbol symbol = pair.first;
      if (symbol.is_token()) {
        for (const rule_ptr &rule : rules_for_symbol(symbol)) {
          for (const rule_ptr &separator_rule : separator_rules) {
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Seq::build({
                  separator_rule,
                  Metadata::main_token(rule) }))));
          }
        }
      }
    }
    return result;
  }

  // The top-level rule alternatives for a token symbol. The end-of-input
  // symbol is modeled as a single character set containing character 0.
  vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
    if (symbol == rules::END_OF_INPUT())
      return { CharacterSet().include(0).copy() };

    rule_ptr rule = lex_grammar.variables[symbol.index].rule;
    auto choice = rule->as<Choice>();
    if (choice)
      return choice->elements;
    else
      return { rule };
  }
};
// Entry point: build the lexical analysis table for `grammar`, assigning a
// lex state to every state of the already-constructed parse table.
LexTable build_lex_table(ParseTable *table, const LexicalGrammar &grammar) {
  LexTableBuilder builder(table, grammar);
  return builder.build();
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,18 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_
#define COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_

#include "compiler/lex_table.h"

namespace tree_sitter {

struct LexicalGrammar;
class ParseTable;

namespace build_tables {

// Build the lexical analysis table for the given grammar, assigning a lex
// state id to every state of the (already-built) parse table.
LexTable build_lex_table(ParseTable *, const LexicalGrammar &);

}  // namespace build_tables
}  // namespace tree_sitter

#endif  // COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_

View file

@ -6,14 +6,13 @@
#include <unordered_map>
#include <utility>
#include "compiler/parse_table.h"
#include "compiler/build_tables/remove_duplicate_states.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/build_tables/lex_table_builder.h"
namespace tree_sitter {
namespace build_tables {
@ -41,6 +40,7 @@ class ParseTableBuilder {
set<string> conflicts;
ParseItemSetBuilder item_set_builder;
set<const Production *> fragile_productions;
vector<set<Symbol::Index>> incompatible_token_indices_by_index;
bool allow_any_conflict;
public:
@ -56,9 +56,9 @@ class ParseTableBuilder {
Symbol(0, Symbol::Terminal) :
Symbol(0, Symbol::NonTerminal);
Production start_production({
ProductionStep(start_symbol, 0, rules::AssociativityNone),
});
Production start_production{
ProductionStep{start_symbol, 0, rules::AssociativityNone},
};
// Placeholder for error state
add_parse_state(ParseItemSet());
@ -71,10 +71,11 @@ class ParseTableBuilder {
}));
CompileError error = process_part_state_queue();
if (error.type != TSCompileErrorTypeNone)
if (error.type != TSCompileErrorTypeNone) {
return { parse_table, error };
}
parse_table.mergeable_symbols = recovery_tokens(lexical_grammar);
compute_unmergable_token_pairs();
build_error_parse_state();
@ -110,8 +111,18 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (const Symbol symbol : parse_table.mergeable_symbols) {
add_out_of_context_parse_state(&error_state, symbol);
for (Symbol::Index i = 0; i < lexical_grammar.variables.size(); i++) {
bool has_non_reciprocal_conflict = false;
for (Symbol::Index incompatible_index : incompatible_token_indices_by_index[i]) {
if (!incompatible_token_indices_by_index[incompatible_index].count(i)) {
has_non_reciprocal_conflict = true;
break;
}
}
if (!has_non_reciprocal_conflict) {
add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::Terminal));
}
}
for (const Symbol &symbol : grammar.extra_tokens) {
@ -148,7 +159,8 @@ class ParseTableBuilder {
ParseStateId add_parse_state(const ParseItemSet &item_set) {
auto pair = parse_state_ids.find(item_set);
if (pair == parse_state_ids.end()) {
ParseStateId state_id = parse_table.add_state();
ParseStateId state_id = parse_table.states.size();
parse_table.states.push_back(ParseState());
parse_state_ids[item_set] = state_id;
parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature();
item_sets_to_process.push_back({ std::move(item_set), state_id });
@ -291,6 +303,34 @@ class ParseTableBuilder {
}
}
// Populate incompatible_token_indices_by_index: for each token, the set of
// other tokens that conflict with it lexically. merge_parse_state consults
// this so that merging two parse states never forces a conflicting pair of
// tokens into the same lexical context.
void compute_unmergable_token_pairs() {
  incompatible_token_indices_by_index.resize(lexical_grammar.variables.size());

  // First, assume that all tokens are mutually incompatible.
  for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
    auto &incompatible_indices = incompatible_token_indices_by_index[i];
    for (Symbol::Index j = 0; j < n; j++) {
      if (j != i) incompatible_indices.insert(j);
    }
  }

  // Then check each possibly-incompatible pair of tokens by generating
  // lexical states that contain them both, keeping only the pairs that
  // produce a real conflict.
  auto lex_table_builder = LexTableBuilder::create(lexical_grammar);
  for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
    auto &incompatible_indices = incompatible_token_indices_by_index[i];
    auto iter = incompatible_indices.begin();
    while (iter != incompatible_indices.end()) {
      if (lex_table_builder->detect_conflict(i, *iter)) {
        ++iter;  // conflict confirmed: keep the pair marked incompatible
      } else {
        iter = incompatible_indices.erase(iter);  // no conflict: mergeable
      }
    }
  }
}
void remove_duplicate_parse_states() {
map<size_t, set<ParseStateId>> state_indices_by_signature;
@ -302,7 +342,7 @@ class ParseTableBuilder {
set<ParseStateId> deleted_states;
while (true) {
std::map<ParseStateId, ParseStateId> state_replacements;
map<ParseStateId, ParseStateId> state_replacements;
for (auto &pair : state_indices_by_signature) {
auto &state_group = pair.second;
@ -310,7 +350,7 @@ class ParseTableBuilder {
for (ParseStateId i : state_group) {
for (ParseStateId j : state_group) {
if (j == i) break;
if (!state_replacements.count(j) && parse_table.merge_state(j, i)) {
if (!state_replacements.count(j) && merge_parse_state(j, i)) {
state_replacements.insert({ i, j });
deleted_states.insert(i);
break;
@ -364,6 +404,72 @@ class ParseTableBuilder {
}
}
// Report whether any terminal entry of `state` is equal to `entry`.
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
  for (const auto &existing : state.terminal_entries) {
    if (existing.second == entry) {
      return true;
    }
  }
  return false;
}
// Try to merge parse state `j` into parse state `i`. Returns true and
// updates state `i` in place when the merge is safe; otherwise returns
// false and leaves both states untouched.
//
// The two states must have identical nonterminal (goto) entries. A terminal
// entry present in only one of the two states is tolerated only when:
//   - the lookahead is not an external token,
//   - adding it would not pair lexically-incompatible tokens in one state
//     (see incompatible_token_indices_by_index),
//   - its final action is a reduce, and
//   - an identical entry already exists (under some lookahead) in the other
//     state, so the merge only broadens which lookaheads trigger it.
bool merge_parse_state(size_t i, size_t j) {
  ParseState &state = parse_table.states[i];
  ParseState &other = parse_table.states[j];
  if (state.nonterminal_entries != other.nonterminal_entries)
    return false;

  // Pass 1: every lookahead `state` has that `other` lacks must satisfy the
  // mergeability rules; shared lookaheads must agree exactly.
  for (auto &entry : state.terminal_entries) {
    Symbol lookahead = entry.first;
    const vector<ParseAction> &actions = entry.second.actions;
    auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];

    const auto &other_entry = other.terminal_entries.find(lookahead);
    if (other_entry == other.terminal_entries.end()) {
      if (lookahead.is_external()) return false;
      if (!lookahead.is_built_in()) {
        for (Symbol::Index incompatible_index : incompatible_token_indices) {
          Symbol incompatible_symbol(incompatible_index, Symbol::Terminal);
          if (other.terminal_entries.count(incompatible_symbol)) return false;
        }
      }
      if (actions.back().type != ParseActionTypeReduce)
        return false;
      if (!has_entry(other, entry.second))
        return false;
    } else if (entry.second != other_entry->second) {
      // Both states know this lookahead but disagree on the action.
      return false;
    }
  }

  // Pass 2: symmetric check for lookaheads only `other` has. Collect them so
  // they can be copied into `state` once the merge is known to be safe.
  set<Symbol> symbols_to_merge;
  for (auto &entry : other.terminal_entries) {
    Symbol lookahead = entry.first;
    const vector<ParseAction> &actions = entry.second.actions;
    auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];

    if (!state.terminal_entries.count(lookahead)) {
      if (lookahead.is_external()) return false;
      if (!lookahead.is_built_in()) {
        for (Symbol::Index incompatible_index : incompatible_token_indices) {
          Symbol incompatible_symbol(incompatible_index, Symbol::Terminal);
          if (state.terminal_entries.count(incompatible_symbol)) return false;
        }
      }
      if (actions.back().type != ParseActionTypeReduce)
        return false;
      if (!has_entry(state, entry.second))
        return false;
      symbols_to_merge.insert(lookahead);
    }
  }

  // All checks passed: absorb `other`'s extra lookaheads into `state`.
  for (const Symbol &lookahead : symbols_to_merge)
    state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
  return true;
}
string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id,
Symbol lookahead) {
ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead];
@ -574,7 +680,7 @@ class ParseTableBuilder {
switch (symbol.type) {
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else

View file

@ -1,6 +1,6 @@
#include "compiler/build_tables/build_tables.h"
#include <tuple>
#include "compiler/build_tables/build_lex_table.h"
#include "compiler/build_tables/lex_table_builder.h"
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
@ -15,11 +15,13 @@ using std::vector;
using std::make_tuple;
tuple<ParseTable, LexTable, CompileError> build_tables(
const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) {
auto parse_table_result = build_parse_table(grammar, lex_grammar);
const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar
) {
auto parse_table_result = build_parse_table(grammar, lexical_grammar);
ParseTable parse_table = parse_table_result.first;
const CompileError error = parse_table_result.second;
LexTable lex_table = build_lex_table(&parse_table, lex_grammar);
LexTable lex_table = LexTableBuilder::create(lexical_grammar)->build(&parse_table);
return make_tuple(parse_table, lex_table, error);
}

View file

@ -10,11 +10,10 @@ namespace build_tables {
bool LexConflictManager::resolve(const LexItemSet &item_set,
const AdvanceAction &new_action,
const AcceptTokenAction &old_action) {
if (!old_action.is_present())
return true;
if (new_action.precedence_range.max >= old_action.precedence) {
for (const LexItem &item : item_set.entries)
for (const LexItem &item : item_set.entries) {
possible_extensions[old_action.symbol.index].insert(item.lhs.index);
}
return true;
} else {
return false;
@ -23,30 +22,26 @@ bool LexConflictManager::resolve(const LexItemSet &item_set,
bool LexConflictManager::resolve(const AcceptTokenAction &new_action,
const AcceptTokenAction &old_action) {
if (!old_action.is_present())
return true;
int old_precedence = old_action.precedence;
int new_precedence = new_action.precedence;
bool result;
if (new_precedence > old_precedence)
if (new_action.precedence > old_action.precedence) {
result = true;
else if (new_precedence < old_precedence)
} else if (new_action.precedence < old_action.precedence) {
result = false;
else if (new_action.is_string && !old_action.is_string)
} else if (new_action.is_string && !old_action.is_string) {
result = true;
else if (old_action.is_string && !new_action.is_string)
} else if (old_action.is_string && !new_action.is_string) {
result = false;
else if (new_action.symbol.index < old_action.symbol.index)
} else if (new_action.symbol.index < old_action.symbol.index) {
result = true;
else
} else {
result = false;
}
if (result)
if (result) {
possible_homonyms[old_action.symbol.index].insert(new_action.symbol.index);
else
} else {
possible_homonyms[new_action.symbol.index].insert(old_action.symbol.index);
}
return result;
}

View file

@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
CompletionStatus apply_to(const rules::Choice *rule) {
for (const auto &element : rule->elements) {
CompletionStatus status = apply(element);
if (status.is_done)
return status;
if (status.is_done) return status;
}
return { false, PrecedenceRange(), false };
return { false, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Metadata *rule) {
CompletionStatus result = apply(rule->rule);
if (result.is_done) {
if (result.precedence.empty && rule->params.has_precedence)
result.precedence.add(rule->params.precedence);
if (rule->params.is_string)
result.is_string = true;
if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
result.precedence.add(rule->params.precedence);
}
return result;
}
@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
}
CompletionStatus apply_to(const rules::Blank *rule) {
return { true, PrecedenceRange(), false };
return { true, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Seq *rule) {
CompletionStatus left_status = apply(rule->left);
if (left_status.is_done)
if (left_status.is_done) {
return apply(rule->right);
else
return { false, PrecedenceRange(), false };
} else {
return { false, PrecedenceRange() };
}
}
};
@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
LexItemSet::TransitionMap LexItemSet::transitions() const {
TransitionMap result;
for (const LexItem &item : entries)
for (const LexItem &item : entries) {
lex_item_transitions(&result, item);
}
return result;
}

View file

@ -19,7 +19,6 @@ class LexItem {
struct CompletionStatus {
bool is_done;
PrecedenceRange precedence;
bool is_string;
};
bool operator==(const LexItem &other) const;

View file

@ -0,0 +1,324 @@
#include "compiler/build_tables/lex_table_builder.h"
#include <climits>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/built_in_symbols.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/metadata.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/visitor.h"
namespace tree_sitter {
namespace build_tables {
using std::map;
using std::pair;
using std::set;
using std::string;
using std::vector;
using std::unordered_map;
using std::unique_ptr;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;
// Rule visitor that accumulates every character a rule can begin with.
// For a Seq only the left side is visited; Choice/Repeat/Metadata recurse
// into all of their children; CharacterSets are unioned into `result`.
class StartingCharacterAggregator : public rules::RuleFn<void> {
  void apply_to(const rules::CharacterSet *rule) {
    result.add_set(*rule);
  }

  void apply_to(const rules::Metadata *rule) {
    apply(rule->rule);
  }

  void apply_to(const rules::Repeat *rule) {
    apply(rule->content);
  }

  void apply_to(const rules::Choice *rule) {
    for (const rule_ptr &element : rule->elements) {
      apply(element);
    }
  }

  void apply_to(const rules::Seq *rule) {
    // Only the left-most sub-rule can contribute starting characters.
    apply(rule->left);
  }

 public:
  // Union of all character sets reachable at the start of the visited rules.
  CharacterSet result;
};
// Builds the lex table (the lexer's state machine) from a parse table: one
// lex state is created per distinct LexItemSet, where an item set describes
// all of the tokens that might be matched in some parse state. Conflicts
// between tokens are resolved through LexConflictManager, and tokens that
// lose such a conflict are recorded in `shadowed_token_indices`.
class LexTableBuilderImpl : public LexTableBuilder {
  LexTable lex_table;
  const LexicalGrammar grammar;

  // One `Repeat` rule per grammar separator, plus a trailing `Blank` so that
  // a token may also begin with no separator at all.
  vector<rule_ptr> separator_rules;

  // The set of characters that can start any separator rule.
  CharacterSet first_separator_characters;

  LexConflictManager conflict_manager;

  // Item set -> id of the lex state built for it, so identical item sets
  // share a single state.
  unordered_map<LexItemSet, LexStateId> lex_state_ids;

 public:
  // shadowed_token_indices[i] is set when token i could be hidden by another
  // token's accept action while states were built; read by detect_conflict().
  vector<bool> shadowed_token_indices;

  LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) {
    StartingCharacterAggregator starting_character_aggregator;
    for (const rule_ptr &rule : grammar.separators) {
      separator_rules.push_back(Repeat::build(rule));
      starting_character_aggregator.apply(rule);
    }
    separator_rules.push_back(Blank::build());
    first_separator_characters = starting_character_aggregator.result;
    shadowed_token_indices.resize(grammar.variables.size());
  }

  // Assigns a lex state to every parse state based on the terminals that the
  // parse state expects, then marks fragile tokens in the parse table and
  // merges duplicate lex states.
  LexTable build(ParseTable *parse_table) {
    for (ParseState &parse_state : parse_table->states) {
      parse_state.lex_state_id = add_lex_state(
        item_set_for_terminals(parse_state.terminal_entries)
      );
    }
    mark_fragile_tokens(parse_table);
    remove_duplicate_lex_states(parse_table);
    return lex_table;
  }

  // Builds a throw-away lex table containing only the two given tokens and
  // reports whether `right` ends up shadowed in the process.
  // NOTE(review): presumably used when deciding whether merging parse states
  // would introduce a new lexical conflict — confirm at the callers.
  bool detect_conflict(Symbol::Index left, Symbol::Index right) {
    clear();
    map<Symbol, ParseTableEntry> terminals;
    // operator[] default-constructs the entries; only the keys matter here.
    terminals[Symbol(left, Symbol::Terminal)];
    terminals[Symbol(right, Symbol::Terminal)];
    add_lex_state(item_set_for_terminals(terminals));
    return shadowed_token_indices[right];
  }

  // Returns the id of the lex state for this item set, creating the state
  // (together with its accept and advance actions, which may recursively
  // create further states) if it does not exist yet.
  LexStateId add_lex_state(const LexItemSet &item_set) {
    const auto &pair = lex_state_ids.find(item_set);
    if (pair == lex_state_ids.end()) {
      LexStateId state_id = lex_table.states.size();
      lex_table.states.push_back(LexState());
      lex_state_ids[item_set] = state_id;
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      return state_id;
    } else {
      return pair->second;
    }
  }

  // Resets everything built so far; used by detect_conflict() before each
  // isolated two-token run.
  void clear() {
    lex_table.states.clear();
    lex_state_ids.clear();
    shadowed_token_indices.assign(grammar.variables.size(), false);
  }

 private:
  // For every character-set transition out of `item_set`, adds an advance
  // action to the state. If the state already accepts a token, the conflict
  // manager decides whether advancing beats accepting; tokens hidden by that
  // decision are recorded in shadowed_token_indices.
  void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const auto &pair : item_set.transitions()) {
      const CharacterSet &characters = pair.first;
      const LexItemSet::Transition &transition = pair.second;
      AdvanceAction action(-1, transition.precedence, transition.in_main_token);
      auto current_action = lex_table.states[state_id].accept_action;
      if (current_action.is_present()) {
        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action);
        bool matches_accepted_token = false;
        for (const LexItem &item : transition.destination.entries) {
          if (item.lhs == current_action.symbol) {
            matches_accepted_token = true;
          } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) {
            // A separator-context transition toward some other token that is
            // skipped below (accepting wins): that other token is shadowed.
            shadowed_token_indices[item.lhs.index] = true;
          }
        }
        // If characters that can start a separator advance only toward other
        // tokens, the currently-accepted token is considered shadowed too.
        if (!matches_accepted_token && characters.intersects(first_separator_characters)) {
          shadowed_token_indices[current_action.symbol.index] = true;
        }
        if (!prefer_advancing) {
          continue;
        }
      }
      action.state_index = add_lex_state(transition.destination);
      lex_table.states[state_id].advance_actions[characters] = action;
    }
  }

  // Sets the state's accept action for each item whose rule has matched
  // completely, using the conflict manager to pick a winner when several
  // items are done in the same state.
  void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                 item.lhs.is_built_in() ||
                                   grammar.variables[item.lhs.index].is_string);
        auto current_action = lex_table.states[state_id].accept_action;
        if (current_action.is_present()) {
          if (!conflict_manager.resolve(action, current_action)) {
            continue;
          }
        }
        lex_table.states[state_id].accept_action = action;
      }
    }
  }

  // Clears the `reusable` flag on each parse-table entry whose token has a
  // possible homonym also expected in the same state, and sets
  // `depends_on_lookahead` when a possible extension of the token is also
  // expected there. The homonym/extension sets were recorded by the conflict
  // manager while the lex states were built.
  void mark_fragile_tokens(ParseTable *parse_table) {
    for (ParseState &state : parse_table->states) {
      for (auto &entry : state.terminal_entries) {
        Symbol symbol = entry.first;
        if (symbol.is_token()) {
          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
          if (homonyms != conflict_manager.possible_homonyms.end())
            for (Symbol::Index homonym : homonyms->second)
              if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) {
                entry.second.reusable = false;
                break;
              }
          if (!entry.second.reusable)
            continue;
          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
          if (extensions != conflict_manager.possible_extensions.end())
            for (Symbol::Index extension : extensions->second)
              if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) {
                entry.second.depends_on_lookahead = true;
                break;
              }
        }
      }
    }
  }

  // Merges lex states that compare equal, iterating to a fixed point, then
  // rewrites both the advance-action targets and the parse states'
  // lex_state_id fields to the surviving state ids.
  void remove_duplicate_lex_states(ParseTable *parse_table) {
    // Normalize accept-action fields first so that states differing only in
    // these fields can still compare equal and be merged.
    for (LexState &state : lex_table.states) {
      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
    }
    map<LexStateId, LexStateId> replacements;
    while (true) {
      // Map each duplicate state i to the earliest state j < i equal to it.
      map<LexStateId, LexStateId> duplicates;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        for (LexStateId j = 0; j < i; j++) {
          if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
            duplicates.insert({ i, j });
            break;
          }
        }
      }
      if (duplicates.empty()) break;
      // Compute each state's new index once the duplicates are erased.
      map<size_t, size_t> new_replacements;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        LexStateId new_state_index = i;
        auto duplicate = duplicates.find(i);
        if (duplicate != duplicates.end()) {
          new_state_index = duplicate->second;
        }
        // Shift down by the number of duplicates removed before this index.
        size_t prior_removed = 0;
        for (const auto &duplicate : duplicates) {
          if (duplicate.first >= new_state_index) break;
          prior_removed++;
        }
        new_state_index -= prior_removed;
        new_replacements.insert({ i, new_state_index });
        replacements.insert({ i, new_state_index });
        // Keep earlier replacement targets pointing at surviving indices.
        for (auto &replacement : replacements) {
          if (replacement.second == i) {
            replacement.second = new_state_index;
          }
        }
      }
      // Retarget advance actions to the renumbered states.
      for (auto &state : lex_table.states) {
        for (auto &entry : state.advance_actions) {
          auto new_replacement = new_replacements.find(entry.second.state_index);
          if (new_replacement != new_replacements.end()) {
            entry.second.state_index = new_replacement->second;
          }
        }
      }
      // Erase from the back so lower indices stay valid during erasure.
      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
        lex_table.states.erase(lex_table.states.begin() + i->first);
      }
    }
    // Point each parse state at its (possibly merged) lex state.
    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
      if (replacement != replacements.end()) {
        parse_state.lex_state_id = replacement->second;
      }
    }
  }

  // Builds the item set for a group of expected tokens: every alternative of
  // every token rule is preceded by each separator rule. Metadata::separator
  // wraps the whole sequence and Metadata::main_token wraps the token rule
  // itself; transitions later expose an in_main_token flag (used by
  // add_advance_actions).
  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
    LexItemSet result;
    for (const auto &pair : terminals) {
      Symbol symbol = pair.first;
      if (symbol.is_token()) {
        for (const rule_ptr &rule : rules_for_symbol(symbol)) {
          for (const rule_ptr &separator_rule : separator_rules) {
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Seq::build({
                  separator_rule,
                  Metadata::main_token(rule) }))));
          }
        }
      }
    }
    return result;
  }

  // Returns the alternatives for a token's rule: END_OF_INPUT matches only
  // the NUL character, a Choice is split into its elements, and any other
  // rule stands alone.
  vector<rule_ptr> rules_for_symbol(const rules::Symbol &symbol) {
    if (symbol == rules::END_OF_INPUT())
      return { CharacterSet().include(0).copy() };
    rule_ptr rule = grammar.variables[symbol.index].rule;
    auto choice = rule->as<Choice>();
    if (choice)
      return choice->elements;
    else
      return { rule };
  }
};
// Factory for the concrete builder; ownership passes to the caller.
unique_ptr<LexTableBuilder> LexTableBuilder::create(const LexicalGrammar &grammar) {
  unique_ptr<LexTableBuilder> builder(new LexTableBuilderImpl(grammar));
  return builder;
}
// Non-virtual dispatch to the implementation. The downcast is safe because
// the constructor is protected and instances are only obtained via create(),
// which always constructs a LexTableBuilderImpl.
LexTable LexTableBuilder::build(ParseTable *parse_table) {
  return static_cast<LexTableBuilderImpl *>(this)->build(parse_table);
}
// Forwards to the implementation (same safe downcast as build() above);
// see LexTableBuilderImpl::detect_conflict for the semantics.
bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right) {
  return static_cast<LexTableBuilderImpl *>(this)->detect_conflict(left, right);
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -0,0 +1,26 @@
#ifndef COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_
#include <memory>
#include "compiler/lex_table.h"
namespace tree_sitter {
struct ParseTable;
struct LexicalGrammar;
namespace build_tables {
// Builds a LexTable from a ParseTable. The constructor is protected:
// instances are obtained through the create() factory, and the concrete
// implementation lives in the corresponding .cc file.
class LexTableBuilder {
 public:
  static std::unique_ptr<LexTableBuilder> create(const LexicalGrammar &);

  // Builds the lex table and assigns a lex state id to each parse state.
  LexTable build(ParseTable *);

  // Reports whether the second token would be shadowed when lexing for both
  // tokens at once; see the implementation for details.
  bool detect_conflict(rules::Symbol::Index, rules::Symbol::Index);

 protected:
  LexTableBuilder() = default;
};
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_

View file

@ -12,8 +12,7 @@
namespace tree_sitter {
namespace build_tables {
class ParseItem {
public:
struct ParseItem {
ParseItem();
ParseItem(const rules::Symbol &, const Production &, unsigned int);
@ -36,8 +35,7 @@ class ParseItem {
unsigned int step_index;
};
class ParseItemSet {
public:
struct ParseItemSet {
ParseItemSet();
explicit ParseItemSet(const std::map<ParseItem, LookaheadSet> &);

View file

@ -1,89 +0,0 @@
#include "compiler/build_tables/recovery_tokens.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/metadata.h"
namespace tree_sitter {
namespace build_tables {
using rules::Symbol;
using std::set;
// Rule visitor that collects the characters a rule can contain. The template
// flags select which side(s) of a Seq are visited, yielding the three
// specializations below: first characters (left only), last characters
// (right only), or all characters (both).
// NOTE(review): visiting only one side of each Seq may miss characters when
// that side can match empty — confirm whether grammars here guarantee
// non-empty Seq sides.
template <bool left, bool right>
class CharacterAggregator : public rules::RuleFn<void> {
  void apply_to(const rules::Seq *rule) {
    // Recurse only into the side(s) enabled by the template flags.
    if (left)
      apply(rule->left);
    if (right)
      apply(rule->right);
  }

  void apply_to(const rules::Choice *rule) {
    for (const rule_ptr &element : rule->elements)
      apply(element);
  }

  void apply_to(const rules::Repeat *rule) {
    apply(rule->content);
  }

  void apply_to(const rules::Metadata *rule) {
    apply(rule->rule);
  }

  void apply_to(const rules::CharacterSet *rule) {
    result.add_set(*rule);
  }

 public:
  // Union of every character set encountered during the traversal.
  rules::CharacterSet result;
};

// Characters a rule can begin with.
class FirstCharacters : public CharacterAggregator<true, false> {};
// Characters a rule can end with.
class LastCharacters : public CharacterAggregator<false, true> {};
// Every character a rule mentions.
class AllCharacters : public CharacterAggregator<true, true> {};
// Selects the tokens that are usable for error recovery: a token qualifies
// when it begins AND ends with an explicit (non-wildcard) character set that
// is disjoint from the separator characters, or when no character it
// mentions overlaps the separator characters at all.
//
// Returns the terminal Symbols (indexed by position in grammar.variables)
// that satisfy the criterion.
set<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
  set<Symbol> result;

  // Every character that any separator rule can contain.
  AllCharacters all_separator_characters;
  for (const rule_ptr &separator : grammar.separators)
    all_separator_characters.apply(separator);

  for (size_t i = 0; i < grammar.variables.size(); i++) {
    const Variable &variable = grammar.variables[i];

    FirstCharacters first_characters;
    first_characters.apply(variable.rule);
    LastCharacters last_characters;
    last_characters.apply(variable.rule);
    AllCharacters all_characters;
    all_characters.apply(variable.rule);

    // The token must start with an explicit character set that cannot be
    // confused with a separator...
    bool has_distinct_start =
      !first_characters.result.includes_all &&
      !first_characters.result.intersects(all_separator_characters.result);

    // ...and likewise end with one...
    bool has_distinct_end =
      !last_characters.result.includes_all &&
      !last_characters.result.intersects(all_separator_characters.result);

    // ...unless it never overlaps the separator characters anywhere.
    bool has_no_separators =
      !all_characters.result.intersects(all_separator_characters.result);

    if ((has_distinct_start && has_distinct_end) || has_no_separators)
      result.insert(Symbol(i, Symbol::Terminal));
  }
  return result;
}
} // namespace build_tables
} // namespace tree_sitter

View file

@ -1,19 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
#include <set>
namespace tree_sitter {
struct LexicalGrammar;
namespace build_tables {
std::set<rules::Symbol> recovery_tokens(const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_

View file

@ -1,65 +0,0 @@
#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_
#include <map>
#include <vector>
namespace tree_sitter {
namespace build_tables {
// Repeatedly folds pairs of equivalent states in `table` (as decided by
// table->merge_state) until no duplicates remain, renumbering the surviving
// states and rewriting every outgoing state reference. Returns a map from
// each replaced state index to the index that replaced it.
//
// TableType must provide a `states` vector, `merge_state(j, i)` returning
// whether state i can be folded into the earlier state j, and states with an
// `each_referenced_state(fn)` member for rewriting outgoing state indices.
template <typename TableType>
std::map<size_t, size_t> remove_duplicate_states(TableType *table) {
  std::map<size_t, size_t> replacements;
  while (true) {
    // Map each duplicate state i to the earliest state j < i it merges into.
    std::map<size_t, size_t> duplicates;
    for (size_t i = 0, size = table->states.size(); i < size; i++)
      for (size_t j = 0; j < i; j++)
        if (!duplicates.count(j) && table->merge_state(j, i)) {
          duplicates.insert({ i, j });
          break;
        }
    if (duplicates.empty())
      break;

    // Compute each state's new index once the duplicates are erased.
    std::map<size_t, size_t> new_replacements;
    for (size_t i = 0, size = table->states.size(); i < size; i++) {
      size_t new_state_index = i;
      auto duplicate = duplicates.find(i);
      if (duplicate != duplicates.end())
        new_state_index = duplicate->second;
      // Shift down by the number of duplicates removed before this index.
      size_t prior_removed = 0;
      for (const auto &duplicate : duplicates) {
        if (duplicate.first >= new_state_index)
          break;
        prior_removed++;
      }
      new_state_index -= prior_removed;
      new_replacements.insert({ i, new_state_index });
      replacements.insert({ i, new_state_index });
      // Keep earlier replacement targets pointing at surviving indices.
      for (auto &replacement : replacements)
        if (replacement.second == i)
          replacement.second = new_state_index;
    }

    // Rewrite every outgoing state reference to the new numbering.
    for (auto &state : table->states)
      state.each_referenced_state([&new_replacements](int64_t *state_index) {
        auto new_replacement = new_replacements.find(*state_index);
        if (new_replacement != new_replacements.end())
          *state_index = new_replacement->second;
      });

    // Erase from the back so lower indices stay valid during erasure.
    for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i)
      table->states.erase(table->states.begin() + i->first);
  }
  return replacements;
}
} // namespace build_tables
} // namespace tree_sitter
#endif // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_

View file

@ -26,8 +26,6 @@ using std::vector;
using util::escape_char;
using rules::Symbol;
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
static const map<char, string> REPLACEMENTS({
{ '~', "TILDE" },
{ '`', "BQUOTE" },
@ -561,7 +559,7 @@ class CCodeGenerator {
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External:

View file

@ -7,8 +7,8 @@ namespace tree_sitter {
struct LexicalGrammar;
struct SyntaxGrammar;
class LexTable;
class ParseTable;
struct LexTable;
struct ParseTable;
namespace generate_code {

View file

@ -44,35 +44,10 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
LexState::LexState() : is_token_start(false) {}
set<CharacterSet> LexState::expected_inputs() const {
set<CharacterSet> result;
for (auto &pair : advance_actions)
result.insert(pair.first);
return result;
}
bool LexState::operator==(const LexState &other) const {
return advance_actions == other.advance_actions &&
accept_action == other.accept_action &&
is_token_start == other.is_token_start;
}
void LexState::each_referenced_state(function<void(LexStateId *)> fn) {
for (auto &entry : advance_actions)
fn(&entry.second.state_index);
}
LexStateId LexTable::add_state() {
states.push_back(LexState());
return states.size() - 1;
}
LexState &LexTable::state(LexStateId id) {
return states[id];
}
bool LexTable::merge_state(size_t i, size_t j) {
return states[i] == states[j];
}
} // namespace tree_sitter

View file

@ -13,17 +13,9 @@ namespace tree_sitter {
typedef int64_t LexStateId;
typedef enum {
LexActionTypeError,
LexActionTypeAccept,
LexActionTypeAcceptFragile,
LexActionTypeAdvance
} LexActionType;
struct AdvanceAction {
AdvanceAction();
AdvanceAction(size_t, PrecedenceRange, bool);
bool operator==(const AdvanceAction &other) const;
LexStateId state_index;
@ -34,7 +26,6 @@ struct AdvanceAction {
struct AcceptTokenAction {
AcceptTokenAction();
AcceptTokenAction(rules::Symbol, int, bool);
bool is_present() const;
bool operator==(const AcceptTokenAction &action) const;
@ -43,31 +34,17 @@ struct AcceptTokenAction {
bool is_string;
};
} // namespace tree_sitter
namespace std {} // namespace std
namespace tree_sitter {
class LexState {
public:
struct LexState {
LexState();
std::set<rules::CharacterSet> expected_inputs() const;
bool operator==(const LexState &) const;
void each_referenced_state(std::function<void(LexStateId *)>);
std::map<rules::CharacterSet, AdvanceAction> advance_actions;
AcceptTokenAction accept_action;
bool is_token_start;
};
class LexTable {
public:
LexStateId add_state();
LexState &state(LexStateId state_id);
struct LexTable {
std::vector<LexState> states;
bool merge_state(size_t i, size_t j);
};
} // namespace tree_sitter

View file

@ -9,8 +9,15 @@
namespace tree_sitter {
struct LexicalVariable {
std::string name;
VariableType type;
rule_ptr rule;
bool is_string;
};
struct LexicalGrammar {
std::vector<Variable> variables;
std::vector<LexicalVariable> variables;
std::vector<rule_ptr> separators;
};

View file

@ -148,13 +148,6 @@ bool ParseState::has_shift_action() const {
return (!nonterminal_entries.empty());
}
set<Symbol> ParseState::expected_inputs() const {
set<Symbol> result;
for (auto &entry : terminal_entries)
result.insert(entry.first);
return result;
}
void ParseState::each_referenced_state(function<void(ParseStateId *)> fn) {
for (auto &entry : terminal_entries)
for (ParseAction &action : entry.second.actions)
@ -169,18 +162,6 @@ bool ParseState::operator==(const ParseState &other) const {
nonterminal_entries == other.nonterminal_entries;
}
set<Symbol> ParseTable::all_symbols() const {
set<Symbol> result;
for (auto &pair : symbols)
result.insert(pair.first);
return result;
}
ParseStateId ParseTable::add_state() {
states.push_back(ParseState());
return states.size() - 1;
}
ParseAction &ParseTable::add_terminal_action(ParseStateId state_id,
Symbol lookahead,
ParseAction action) {
@ -201,58 +182,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id,
states[state_id].nonterminal_entries[lookahead] = next_state_id;
}
static bool has_entry(const ParseState &state, const ParseTableEntry &entry) {
for (const auto &pair : state.terminal_entries)
if (pair.second == entry)
return true;
return false;
}
bool ParseTable::merge_state(size_t i, size_t j) {
ParseState &state = states[i];
ParseState &other = states[j];
if (state.nonterminal_entries != other.nonterminal_entries)
return false;
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(other, entry.second))
return false;
} else if (entry.second != other_entry->second) {
return false;
}
}
set<Symbol> symbols_to_merge;
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
if (!state.terminal_entries.count(lookahead)) {
if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
return false;
symbols_to_merge.insert(lookahead);
}
}
for (const Symbol &lookahead : symbols_to_merge)
state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second;
return true;
}
} // namespace tree_sitter

View file

@ -23,13 +23,11 @@ enum ParseActionType {
ParseActionTypeRecover,
};
class ParseAction {
struct ParseAction {
ParseAction();
ParseAction(ParseActionType type, ParseStateId state_index,
rules::Symbol symbol, size_t consumed_symbol_count,
const Production *);
public:
ParseAction();
static ParseAction Accept();
static ParseAction Error();
static ParseAction Shift(ParseStateId state_index);
@ -39,7 +37,6 @@ class ParseAction {
static ParseAction ShiftExtra();
bool operator==(const ParseAction &) const;
bool operator<(const ParseAction &) const;
rules::Associativity associativity() const;
int precedence() const;
@ -47,30 +44,26 @@ class ParseAction {
bool extra;
bool fragile;
ParseStateId state_index;
rules::Symbol symbol;
size_t consumed_symbol_count;
const Production *production;
};
struct ParseTableEntry {
std::vector<ParseAction> actions;
bool reusable;
bool depends_on_lookahead;
ParseTableEntry();
ParseTableEntry(const std::vector<ParseAction> &, bool, bool);
bool operator==(const ParseTableEntry &other) const;
inline bool operator!=(const ParseTableEntry &other) const {
return !operator==(other);
}
std::vector<ParseAction> actions;
bool reusable;
bool depends_on_lookahead;
};
class ParseState {
public:
struct ParseState {
ParseState();
std::set<rules::Symbol> expected_inputs() const;
bool operator==(const ParseState &) const;
bool merge(const ParseState &);
void each_referenced_state(std::function<void(ParseStateId *)>);
@ -87,18 +80,12 @@ struct ParseTableSymbolMetadata {
bool structural;
};
class ParseTable {
public:
std::set<rules::Symbol> all_symbols() const;
ParseStateId add_state();
struct ParseTable {
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction);
void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId);
bool merge_state(size_t i, size_t j);
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
std::set<rules::Symbol> mergeable_symbols;
};
} // namespace tree_sitter

View file

@ -41,10 +41,17 @@ class ExpandRepeats : public rules::IdentityRuleFn {
string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count);
Symbol repeat_symbol(offset + index, Symbol::NonTerminal);
existing_repeats.push_back({ rule->copy(), repeat_symbol });
aux_rules.push_back(
Variable(helper_rule_name, VariableTypeAuxiliary,
Choice::build({ Seq::build({ repeat_symbol.copy(), inner_rule }),
inner_rule })));
aux_rules.push_back(Variable{
helper_rule_name,
VariableTypeAuxiliary,
Choice::build({
Seq::build({
repeat_symbol.copy(),
inner_rule,
}),
inner_rule,
})
});
return repeat_symbol.copy();
}

View file

@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
LexicalGrammar result;
ExpandTokens expander;
for (const Variable &variable : grammar.variables) {
for (const LexicalVariable &variable : grammar.variables) {
auto rule = expander.apply(variable.rule);
if (expander.error.type)
return { result, expander.error };
result.variables.push_back(Variable(variable.name, variable.type, rule));
result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
}
for (auto &sep : grammar.separators) {

View file

@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
class TokenExtractor : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back(Variable(token_description(rule), entry_type, rule));
tokens.push_back({token_description(rule), entry_type, rule, is_string});
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr apply_to(const rules::String *rule) {
return apply_to_token(rule, VariableTypeAnonymous);
return apply_to_token(rule, VariableTypeAnonymous, true);
}
rule_ptr apply_to(const rules::Pattern *rule) {
return apply_to_token(rule, VariableTypeAuxiliary);
return apply_to_token(rule, VariableTypeAuxiliary, false);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->params.is_token)
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
else
if (rule->params.is_token) {
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
} else {
return rules::IdentityRuleFn::apply_to(rule);
}
}
public:
vector<size_t> token_usage_counts;
vector<Variable> tokens;
vector<LexicalVariable> tokens;
};
static CompileError extra_token_error(const string &message) {
@ -106,8 +107,11 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
*/
vector<Variable> processed_variables;
for (const Variable &variable : grammar.variables)
processed_variables.push_back(
Variable(variable.name, variable.type, extractor.apply(variable.rule)));
processed_variables.push_back(Variable{
variable.name,
variable.type,
extractor.apply(variable.rule)
});
lexical_grammar.variables = extractor.tokens;
/*
@ -139,8 +143,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
for (const Symbol &symbol : conflict_set)
for (const Symbol &symbol : conflict_set) {
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
}
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
@ -154,7 +159,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const rule_ptr &rule : grammar.extra_tokens) {
int i = 0;
bool used_elsewhere_in_grammar = false;
for (const Variable &variable : lexical_grammar.variables) {
for (const LexicalVariable &variable : lexical_grammar.variables) {
if (variable.rule->operator==(*rule)) {
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
used_elsewhere_in_grammar = true;
@ -171,9 +176,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
}
auto symbol = rule->as<Symbol>();
if (!symbol)
if (!symbol) {
return make_tuple(syntax_grammar, lexical_grammar,
extra_token_error(rule->to_string()));
}
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (new_symbol.is_non_terminal()) {

View file

@ -25,8 +25,11 @@ class FlattenRule : public rules::RuleFn<void> {
Production production;
void apply_to(const rules::Symbol *sym) {
production.push_back(ProductionStep(*sym, precedence_stack.back(),
associativity_stack.back()));
production.push_back(ProductionStep{
*sym,
precedence_stack.back(),
associativity_stack.back()
});
}
void apply_to(const rules::Metadata *metadata) {
@ -85,7 +88,7 @@ SyntaxVariable flatten_rule(const Variable &variable) {
}
}
return SyntaxVariable(variable.name, variable.type, productions);
return SyntaxVariable{variable.name, variable.type, productions};
}
pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &grammar) {

View file

@ -8,7 +8,7 @@ namespace prepare_grammar {
LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
LexicalGrammar result(input_grammar);
for (Variable &variable : result.variables) {
for (LexicalVariable &variable : result.variables) {
variable.rule = rules::Choice::build(extract_choices(variable.rule));
}

View file

@ -7,20 +7,6 @@
namespace tree_sitter {
using std::string;
using std::to_string;
using std::pair;
using std::vector;
using std::set;
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)
: name(name), productions(productions), type(type) {}
ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
rules::Associativity associativity)
: symbol(symbol), precedence(precedence), associativity(associativity) {}
bool ExternalToken::operator==(const ExternalToken &other) const {
return name == other.name && type == other.type &&
corresponding_internal_token == other.corresponding_internal_token;

View file

@ -11,15 +11,14 @@
namespace tree_sitter {
struct ExternalToken {
bool operator==(const ExternalToken &) const;
std::string name;
VariableType type;
rules::Symbol corresponding_internal_token;
bool operator==(const ExternalToken &) const;
};
struct ProductionStep {
ProductionStep(const rules::Symbol &, int, rules::Associativity);
bool operator==(const ProductionStep &) const;
rules::Symbol symbol;
@ -30,12 +29,9 @@ struct ProductionStep {
typedef std::vector<ProductionStep> Production;
struct SyntaxVariable {
SyntaxVariable(const std::string &, VariableType,
const std::vector<Production> &);
std::string name;
std::vector<Production> productions;
VariableType type;
std::vector<Production> productions;
};
typedef std::set<rules::Symbol> ConflictSet;

View file

@ -1,11 +0,0 @@
#include "compiler/variable.h"
#include <string>
namespace tree_sitter {
using std::string;
Variable::Variable(const string &name, VariableType type, const rule_ptr &rule)
: name(name), rule(rule), type(type) {}
} // namespace tree_sitter

View file

@ -15,11 +15,9 @@ enum VariableType {
};
struct Variable {
Variable(const std::string &, VariableType, const rule_ptr &);
std::string name;
rule_ptr rule;
VariableType type;
rule_ptr rule;
};
} // namespace tree_sitter