Avoid introducing certain lexical conflicts during parse state merging
The current, fairly conservative approach is to avoid merging parse states that would cause a pair of tokens to co-exist for the first time in any parse state, where the two tokens can start with the same character and at least one of the tokens can contain a character that is part of the grammar's separators.
This commit is contained in:
parent
3c8e6f9987
commit
686dc0997c
24 changed files with 305 additions and 158 deletions
|
|
@ -25,6 +25,7 @@
|
|||
'src/compiler/compile.cc',
|
||||
'src/compiler/generate_code/c_code.cc',
|
||||
'src/compiler/lex_table.cc',
|
||||
'src/compiler/lexical_grammar.cc',
|
||||
'src/compiler/parse_grammar.cc',
|
||||
'src/compiler/parse_table.cc',
|
||||
'src/compiler/precedence_range.cc',
|
||||
|
|
|
|||
|
|
@ -14,17 +14,18 @@ START_TEST
|
|||
describe("recovery_tokens(rule)", []() {
|
||||
it("includes rules that can only begin and end with an explicit set of characters", [&]() {
|
||||
LexicalGrammar grammar;
|
||||
|
||||
grammar.separators = {
|
||||
character({ ' ' }),
|
||||
};
|
||||
|
||||
grammar.variables = {
|
||||
Variable("var0", VariableTypeNamed, character({}, false)),
|
||||
Variable("var1", VariableTypeNamed, seq({
|
||||
LexicalVariable("var0", VariableTypeNamed, character({}, false), false),
|
||||
LexicalVariable("var1", VariableTypeNamed, seq({
|
||||
character({ 'a', 'b' }),
|
||||
character({}, false),
|
||||
character({ 'c', 'd' }),
|
||||
})),
|
||||
}), false),
|
||||
};
|
||||
|
||||
AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));
|
||||
|
|
|
|||
|
|
@ -13,11 +13,10 @@ START_TEST
|
|||
|
||||
describe("LexItem", []() {
|
||||
describe("completion_status()", [&]() {
|
||||
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
|
||||
it("indicates whether the item is done and its precedence", [&]() {
|
||||
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
|
||||
AssertThat(item1.completion_status().is_done, IsFalse());
|
||||
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
|
||||
AssertThat(item1.completion_status().is_string, IsFalse());
|
||||
|
||||
MetadataParams params;
|
||||
params.precedence = 3;
|
||||
|
|
@ -30,12 +29,10 @@ describe("LexItem", []() {
|
|||
|
||||
AssertThat(item2.completion_status().is_done, IsTrue());
|
||||
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
|
||||
AssertThat(item2.completion_status().is_string, IsTrue());
|
||||
|
||||
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
|
||||
AssertThat(item3.completion_status().is_done, IsTrue());
|
||||
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
|
||||
AssertThat(item3.completion_status().is_string, IsFalse());
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -12,12 +12,13 @@ using namespace rules;
|
|||
START_TEST
|
||||
|
||||
describe("ParseItemSetBuilder", []() {
|
||||
vector<Variable> lexical_variables;
|
||||
vector<LexicalVariable> lexical_variables;
|
||||
for (size_t i = 0; i < 20; i++) {
|
||||
lexical_variables.push_back(Variable{
|
||||
lexical_variables.push_back({
|
||||
"token_" + to_string(i),
|
||||
VariableTypeNamed,
|
||||
blank(),
|
||||
false
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,89 +15,149 @@ describe("expand_tokens", []() {
|
|||
|
||||
describe("string rules", [&]() {
|
||||
it("replaces strings with sequences of character sets", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11),
|
||||
})),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals(CompileError::none()));
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
metadata(seq({
|
||||
character({ 'x' }),
|
||||
character({ 'y' }),
|
||||
character({ 'z' }),
|
||||
}), string_token_params),
|
||||
i_sym(11),
|
||||
})),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
metadata(seq({
|
||||
character({ 'x' }),
|
||||
character({ 'y' }),
|
||||
character({ 'z' }),
|
||||
}), string_token_params),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("handles strings containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
str("\u03B1 \u03B2"),
|
||||
false
|
||||
},
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, metadata(seq({
|
||||
character({ 945 }),
|
||||
character({ ' ' }),
|
||||
character({ 946 }),
|
||||
}), string_token_params)),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
metadata(seq({
|
||||
character({ 945 }),
|
||||
character({ ' ' }),
|
||||
character({ 946 }),
|
||||
}), string_token_params),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
});
|
||||
|
||||
describe("regexp rules", [&]() {
|
||||
it("replaces regexps with the equivalent rule tree", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11),
|
||||
})),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals(CompileError::none()));
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11),
|
||||
})),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11),
|
||||
}),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
pattern("[^\u03B1-\u03B4]*"),
|
||||
false
|
||||
}
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.first.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
|
||||
})));
|
||||
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
repeat(character({ 945, 946, 947, 948 }, false)),
|
||||
false
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
it("returns an error when the grammar contains an invalid regex", [&]() {
|
||||
LexicalGrammar grammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("["),
|
||||
}))
|
||||
}, {}};
|
||||
LexicalGrammar grammar {
|
||||
{
|
||||
LexicalVariable {
|
||||
"rule_A",
|
||||
VariableTypeNamed,
|
||||
seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("["),
|
||||
}),
|
||||
false
|
||||
},
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
|
|
|
|||
|
|
@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar;
|
|||
|
||||
describe("extract_tokens", []() {
|
||||
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
|
||||
auto result = extract_tokens(InternedGrammar{{
|
||||
Variable("rule_A", VariableTypeNamed, repeat1(seq({
|
||||
str("ab"),
|
||||
pattern("cd*"),
|
||||
choice({
|
||||
i_sym(1),
|
||||
i_sym(2),
|
||||
token(repeat1(choice({ str("ef"), str("gh") }))),
|
||||
}),
|
||||
}))),
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
|
||||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
|
||||
}, {}, {}, {}});
|
||||
auto result = extract_tokens(InternedGrammar {
|
||||
{
|
||||
Variable("rule_A", VariableTypeNamed, repeat1(seq({
|
||||
str("ab"),
|
||||
pattern("cd*"),
|
||||
choice({
|
||||
i_sym(1),
|
||||
i_sym(2),
|
||||
token(repeat1(choice({ str("ef"), str("gh") }))),
|
||||
}),
|
||||
}))),
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
|
||||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
|
||||
},
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
});
|
||||
|
||||
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
|
||||
LexicalGrammar &lexical_grammar = get<1>(result);
|
||||
|
|
@ -64,46 +69,51 @@ describe("extract_tokens", []() {
|
|||
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))),
|
||||
})));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
|
||||
// Strings become anonymous rules.
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
LexicalVariable("ab", VariableTypeAnonymous, str("ab"), true),
|
||||
|
||||
// Patterns become hidden rules.
|
||||
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
|
||||
LexicalVariable("/cd*/", VariableTypeAuxiliary, pattern("cd*"), false),
|
||||
|
||||
// Rules marked as tokens become hidden rules.
|
||||
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
|
||||
LexicalVariable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
|
||||
str("ef"),
|
||||
str("gh")
|
||||
}))),
|
||||
})), false),
|
||||
|
||||
// This named rule was moved wholesale to the lexical grammar.
|
||||
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
|
||||
LexicalVariable("rule_B", VariableTypeNamed, pattern("ij+"), false),
|
||||
|
||||
// Strings become anonymous rules.
|
||||
Variable("kl", VariableTypeAnonymous, str("kl")),
|
||||
LexicalVariable("kl", VariableTypeAnonymous, str("kl"), true),
|
||||
})));
|
||||
});
|
||||
|
||||
it("does not create duplicate tokens in the lexical grammar", [&]() {
|
||||
auto result = extract_tokens(InternedGrammar{{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
str("ab"),
|
||||
i_sym(0),
|
||||
str("ab"),
|
||||
})),
|
||||
}, {}, {}, {}});
|
||||
auto result = extract_tokens(InternedGrammar {
|
||||
{
|
||||
Variable("rule_A", VariableTypeNamed, seq({
|
||||
str("ab"),
|
||||
i_sym(0),
|
||||
str("ab"),
|
||||
})),
|
||||
},
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
});
|
||||
|
||||
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
|
||||
LexicalGrammar &lexical_grammar = get<1>(result);
|
||||
|
||||
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
|
||||
})));
|
||||
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
|
||||
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
|
||||
}));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
})))
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
|
||||
}))
|
||||
});
|
||||
|
||||
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
|
||||
|
|
@ -122,11 +132,11 @@ describe("extract_tokens", []() {
|
|||
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
|
||||
})));
|
||||
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
|
||||
Variable("ab", VariableTypeAnonymous, str("ab")),
|
||||
Variable("cd", VariableTypeAnonymous, str("cd")),
|
||||
Variable("ef", VariableTypeAnonymous, str("ef")),
|
||||
})));
|
||||
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
|
||||
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
|
||||
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
|
||||
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
|
||||
}));
|
||||
});
|
||||
|
||||
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include "rule_helpers.h"
|
||||
#include <memory>
|
||||
#include "compiler/rules/symbol.h"
|
||||
#include "compiler/variable.h"
|
||||
#include "compiler/lexical_grammar.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::make_shared;
|
||||
|
|
@ -52,4 +54,9 @@ namespace tree_sitter {
|
|||
return left.name == right.name && left.rule->operator==(*right.rule) &&
|
||||
left.type == right.type;
|
||||
}
|
||||
|
||||
bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
|
||||
return left.name == right.name && left.rule->operator==(*right.rule) &&
|
||||
left.type == right.type && left.is_string == right.is_string;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,7 +15,11 @@ namespace tree_sitter {
|
|||
rule_ptr i_token(size_t index);
|
||||
rule_ptr active_prec(int precedence, rule_ptr);
|
||||
|
||||
struct Variable;
|
||||
struct LexicalVariable;
|
||||
|
||||
bool operator==(const Variable &left, const Variable &right);
|
||||
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
|
||||
}
|
||||
|
||||
#endif // HELPERS_RULE_HELPERS_H_
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/parse_table.h"
|
||||
#include "compiler/syntax_grammar.h"
|
||||
#include "compiler/lexical_grammar.h"
|
||||
#include "compiler/build_tables/parse_item.h"
|
||||
#include "compiler/build_tables/lex_item.h"
|
||||
|
||||
|
|
@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
|
|||
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
|
||||
}
|
||||
|
||||
ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
|
||||
return stream << "{" << variable.name << ", " << variable.rule << ", " <<
|
||||
to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
|
||||
}
|
||||
|
||||
std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
|
||||
return stream << string("#<advance ") + to_string(action.state_index) + ">";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ using std::string;
|
|||
using std::to_string;
|
||||
struct Variable;
|
||||
struct SyntaxVariable;
|
||||
struct LexicalVariable;
|
||||
struct AdvanceAction;
|
||||
struct AcceptTokenAction;
|
||||
class ParseAction;
|
||||
|
|
@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &);
|
|||
ostream &operator<<(ostream &, const rule_ptr &);
|
||||
ostream &operator<<(ostream &, const Variable &);
|
||||
ostream &operator<<(ostream &, const SyntaxVariable &);
|
||||
ostream &operator<<(ostream &, const LexicalVariable &);
|
||||
ostream &operator<<(ostream &, const AdvanceAction &);
|
||||
ostream &operator<<(ostream &, const AcceptTokenAction &);
|
||||
ostream &operator<<(ostream &, const ParseAction &);
|
||||
|
|
|
|||
|
|
@ -99,7 +99,8 @@ class LexTableBuilder {
|
|||
LexItem::CompletionStatus completion_status = item.completion_status();
|
||||
if (completion_status.is_done) {
|
||||
AcceptTokenAction action(item.lhs, completion_status.precedence.max,
|
||||
completion_status.is_string);
|
||||
item.lhs.is_built_in() ||
|
||||
lex_grammar.variables[item.lhs.index].is_string);
|
||||
|
||||
auto current_action = lex_table.state(state_id).accept_action;
|
||||
if (conflict_manager.resolve(action, current_action))
|
||||
|
|
|
|||
|
|
@ -72,10 +72,11 @@ class ParseTableBuilder {
|
|||
}));
|
||||
|
||||
CompileError error = process_part_state_queue();
|
||||
if (error.type != TSCompileErrorTypeNone)
|
||||
if (error.type != TSCompileErrorTypeNone) {
|
||||
return { parse_table, error };
|
||||
}
|
||||
|
||||
parse_table.mergeable_symbols = compatible_tokens.recovery_tokens;
|
||||
update_unmergable_token_pairs();
|
||||
|
||||
build_error_parse_state();
|
||||
|
||||
|
|
@ -111,7 +112,7 @@ class ParseTableBuilder {
|
|||
void build_error_parse_state() {
|
||||
ParseState error_state;
|
||||
|
||||
for (const Symbol symbol : parse_table.mergeable_symbols) {
|
||||
for (const Symbol symbol : compatible_tokens.recovery_tokens) {
|
||||
add_out_of_context_parse_state(&error_state, symbol);
|
||||
}
|
||||
|
||||
|
|
@ -292,6 +293,25 @@ class ParseTableBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
void update_unmergable_token_pairs() {
|
||||
for (const ParseState &state : parse_table.states) {
|
||||
for (Symbol::Index token_index = 0, token_count = lexical_grammar.variables.size(); token_index < token_count; token_index++) {
|
||||
Symbol token(token_index, Symbol::Terminal);
|
||||
if (state.terminal_entries.count(token)) {
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[token_index];
|
||||
auto iter = incompatible_token_indices.begin();
|
||||
while (iter != incompatible_token_indices.end()) {
|
||||
if (state.terminal_entries.count(Symbol(*iter, Symbol::NonTerminal))) {
|
||||
iter = incompatible_token_indices.erase(iter);
|
||||
} else {
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remove_duplicate_parse_states() {
|
||||
map<size_t, set<ParseStateId>> state_indices_by_signature;
|
||||
|
||||
|
|
@ -382,11 +402,19 @@ class ParseTableBuilder {
|
|||
for (auto &entry : state.terminal_entries) {
|
||||
Symbol lookahead = entry.first;
|
||||
const vector<ParseAction> &actions = entry.second.actions;
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
|
||||
|
||||
const auto &other_entry = other.terminal_entries.find(lookahead);
|
||||
if (other_entry == other.terminal_entries.end()) {
|
||||
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
|
||||
return false;
|
||||
if (!lookahead.is_built_in()) {
|
||||
if (!compatible_tokens.recovery_tokens.count(lookahead))
|
||||
return false;
|
||||
for (Symbol::Index incompatible_index : incompatible_token_indices) {
|
||||
if (other.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (actions.back().type != ParseActionTypeReduce)
|
||||
return false;
|
||||
if (!has_entry(other, entry.second))
|
||||
|
|
@ -401,10 +429,18 @@ class ParseTableBuilder {
|
|||
for (auto &entry : other.terminal_entries) {
|
||||
Symbol lookahead = entry.first;
|
||||
const vector<ParseAction> &actions = entry.second.actions;
|
||||
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
|
||||
|
||||
if (!state.terminal_entries.count(lookahead)) {
|
||||
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
|
||||
return false;
|
||||
if (!lookahead.is_built_in()) {
|
||||
if (!compatible_tokens.recovery_tokens.count(lookahead))
|
||||
return false;
|
||||
for (Symbol::Index incompatible_index : incompatible_token_indices) {
|
||||
if (state.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (actions.back().type != ParseActionTypeReduce)
|
||||
return false;
|
||||
if (!has_entry(state, entry.second))
|
||||
|
|
@ -629,7 +665,7 @@ class ParseTableBuilder {
|
|||
|
||||
switch (symbol.type) {
|
||||
case Symbol::Terminal: {
|
||||
const Variable &variable = lexical_grammar.variables[symbol.index];
|
||||
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
|
||||
if (variable.type == VariableTypeNamed)
|
||||
return variable.name;
|
||||
else
|
||||
|
|
|
|||
|
|
@ -83,6 +83,7 @@ using FirstCharactersIntersector = CharacterIntersector<true, false>;
|
|||
|
||||
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
||||
CompatibleTokensResult result;
|
||||
result.unmergeable_pairs.resize(grammar.variables.size());
|
||||
|
||||
AllCharacters all_separator_characters;
|
||||
for (const rule_ptr &separator : grammar.separators)
|
||||
|
|
@ -90,7 +91,8 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
|||
|
||||
for (size_t i = 0; i < grammar.variables.size(); i++) {
|
||||
Symbol symbol(i, Symbol::Terminal);
|
||||
rule_ptr rule = grammar.variables[i].rule;
|
||||
const LexicalVariable &variable = grammar.variables[i];
|
||||
rule_ptr rule = variable.rule;
|
||||
|
||||
FirstCharacters first_characters;
|
||||
first_characters.apply(rule);
|
||||
|
|
@ -109,18 +111,20 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
|
|||
!last_characters.result.includes_all &&
|
||||
!last_characters.result.intersects(all_separator_characters.result);
|
||||
|
||||
bool has_no_separators =
|
||||
!all_characters.result.intersects(all_separator_characters.result);
|
||||
bool has_separators =
|
||||
all_characters.result.intersects(all_separator_characters.result);
|
||||
|
||||
if ((has_distinct_start && has_distinct_end) || has_no_separators)
|
||||
if ((has_distinct_start && has_distinct_end) || !has_separators)
|
||||
result.recovery_tokens.insert(symbol);
|
||||
|
||||
for (size_t j = 0; j < grammar.variables.size(); j++) {
|
||||
if (j == i) continue;
|
||||
Symbol other_symbol(j, Symbol::Terminal);
|
||||
FirstCharactersIntersector intersector(&first_characters.result);
|
||||
if (intersector.apply(grammar.variables[j].rule)) {
|
||||
result.unmergeable_pairs[symbol].insert(other_symbol);
|
||||
for (size_t j = 0; j < i; j++) {
|
||||
const LexicalVariable &other_variable = grammar.variables[j];
|
||||
if (has_separators) {
|
||||
FirstCharactersIntersector intersector(&first_characters.result);
|
||||
if (intersector.apply(other_variable.rule)) {
|
||||
result.unmergeable_pairs[i].insert(j);
|
||||
result.unmergeable_pairs[j].insert(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,9 @@
|
|||
|
||||
#include "compiler/rule.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
|
|
@ -14,7 +15,7 @@ namespace build_tables {
|
|||
|
||||
struct CompatibleTokensResult {
|
||||
std::set<rules::Symbol> recovery_tokens;
|
||||
std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
|
||||
std::vector<std::unordered_set<rules::Symbol::Index>> unmergeable_pairs;
|
||||
};
|
||||
|
||||
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);
|
||||
|
|
|
|||
|
|
@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
|
|||
CompletionStatus apply_to(const rules::Choice *rule) {
|
||||
for (const auto &element : rule->elements) {
|
||||
CompletionStatus status = apply(element);
|
||||
if (status.is_done)
|
||||
return status;
|
||||
if (status.is_done) return status;
|
||||
}
|
||||
return { false, PrecedenceRange(), false };
|
||||
return { false, PrecedenceRange() };
|
||||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Metadata *rule) {
|
||||
CompletionStatus result = apply(rule->rule);
|
||||
if (result.is_done) {
|
||||
if (result.precedence.empty && rule->params.has_precedence)
|
||||
result.precedence.add(rule->params.precedence);
|
||||
if (rule->params.is_string)
|
||||
result.is_string = true;
|
||||
if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
|
||||
result.precedence.add(rule->params.precedence);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
|
|||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Blank *rule) {
|
||||
return { true, PrecedenceRange(), false };
|
||||
return { true, PrecedenceRange() };
|
||||
}
|
||||
|
||||
CompletionStatus apply_to(const rules::Seq *rule) {
|
||||
CompletionStatus left_status = apply(rule->left);
|
||||
if (left_status.is_done)
|
||||
if (left_status.is_done) {
|
||||
return apply(rule->right);
|
||||
else
|
||||
return { false, PrecedenceRange(), false };
|
||||
} else {
|
||||
return { false, PrecedenceRange() };
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
|
|||
|
||||
LexItemSet::TransitionMap LexItemSet::transitions() const {
|
||||
TransitionMap result;
|
||||
for (const LexItem &item : entries)
|
||||
for (const LexItem &item : entries) {
|
||||
lex_item_transitions(&result, item);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,6 @@ class LexItem {
|
|||
struct CompletionStatus {
|
||||
bool is_done;
|
||||
PrecedenceRange precedence;
|
||||
bool is_string;
|
||||
};
|
||||
|
||||
bool operator==(const LexItem &other) const;
|
||||
|
|
|
|||
|
|
@ -561,7 +561,7 @@ class CCodeGenerator {
|
|||
return { variable.name, variable.type };
|
||||
}
|
||||
case Symbol::Terminal: {
|
||||
const Variable &variable = lexical_grammar.variables[symbol.index];
|
||||
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
|
||||
return { variable.name, variable.type };
|
||||
}
|
||||
case Symbol::External:
|
||||
|
|
|
|||
11
src/compiler/lexical_grammar.cc
Normal file
11
src/compiler/lexical_grammar.cc
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
#include "compiler/lexical_grammar.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
|
||||
using std::string;
|
||||
|
||||
LexicalVariable::LexicalVariable(
|
||||
const string &name, VariableType type, const rule_ptr &rule, bool is_string)
|
||||
: name(name), rule(rule), type(type), is_string(is_string) {}
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
@ -9,8 +9,17 @@
|
|||
|
||||
namespace tree_sitter {
|
||||
|
||||
struct LexicalVariable {
|
||||
LexicalVariable(const std::string &, VariableType, const rule_ptr &, bool);
|
||||
|
||||
std::string name;
|
||||
rule_ptr rule;
|
||||
VariableType type;
|
||||
bool is_string;
|
||||
};
|
||||
|
||||
struct LexicalGrammar {
|
||||
std::vector<Variable> variables;
|
||||
std::vector<LexicalVariable> variables;
|
||||
std::vector<rule_ptr> separators;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -96,8 +96,6 @@ class ParseTable {
|
|||
|
||||
std::vector<ParseState> states;
|
||||
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
|
||||
|
||||
std::set<rules::Symbol> mergeable_symbols;
|
||||
};
|
||||
|
||||
} // namespace tree_sitter
|
||||
|
|
|
|||
|
|
@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
|
|||
LexicalGrammar result;
|
||||
ExpandTokens expander;
|
||||
|
||||
for (const Variable &variable : grammar.variables) {
|
||||
for (const LexicalVariable &variable : grammar.variables) {
|
||||
auto rule = expander.apply(variable.rule);
|
||||
if (expander.error.type)
|
||||
return { result, expander.error };
|
||||
result.variables.push_back(Variable(variable.name, variable.type, rule));
|
||||
result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
|
||||
}
|
||||
|
||||
for (auto &sep : grammar.separators) {
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
|
|||
class TokenExtractor : public rules::IdentityRuleFn {
|
||||
using rules::IdentityRuleFn::apply_to;
|
||||
|
||||
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
|
||||
rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
|
||||
for (size_t i = 0; i < tokens.size(); i++)
|
||||
if (tokens[i].rule->operator==(*input)) {
|
||||
token_usage_counts[i]++;
|
||||
|
|
@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {
|
|||
|
||||
rule_ptr rule = input->copy();
|
||||
size_t index = tokens.size();
|
||||
tokens.push_back(Variable(token_description(rule), entry_type, rule));
|
||||
tokens.push_back({token_description(rule), entry_type, rule, is_string});
|
||||
token_usage_counts.push_back(1);
|
||||
return make_shared<Symbol>(index, Symbol::Terminal);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::String *rule) {
|
||||
return apply_to_token(rule, VariableTypeAnonymous);
|
||||
return apply_to_token(rule, VariableTypeAnonymous, true);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::Pattern *rule) {
|
||||
return apply_to_token(rule, VariableTypeAuxiliary);
|
||||
return apply_to_token(rule, VariableTypeAuxiliary, false);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const rules::Metadata *rule) {
|
||||
if (rule->params.is_token)
|
||||
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
|
||||
else
|
||||
if (rule->params.is_token) {
|
||||
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
|
||||
} else {
|
||||
return rules::IdentityRuleFn::apply_to(rule);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
vector<size_t> token_usage_counts;
|
||||
vector<Variable> tokens;
|
||||
vector<LexicalVariable> tokens;
|
||||
};
|
||||
|
||||
static CompileError extra_token_error(const string &message) {
|
||||
|
|
@ -139,8 +140,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
|
||||
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
|
||||
ConflictSet new_conflict_set;
|
||||
for (const Symbol &symbol : conflict_set)
|
||||
for (const Symbol &symbol : conflict_set) {
|
||||
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
|
||||
}
|
||||
syntax_grammar.expected_conflicts.insert(new_conflict_set);
|
||||
}
|
||||
|
||||
|
|
@ -154,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
for (const rule_ptr &rule : grammar.extra_tokens) {
|
||||
int i = 0;
|
||||
bool used_elsewhere_in_grammar = false;
|
||||
for (const Variable &variable : lexical_grammar.variables) {
|
||||
for (const LexicalVariable &variable : lexical_grammar.variables) {
|
||||
if (variable.rule->operator==(*rule)) {
|
||||
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
|
||||
used_elsewhere_in_grammar = true;
|
||||
|
|
@ -171,9 +173,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
|
|||
}
|
||||
|
||||
auto symbol = rule->as<Symbol>();
|
||||
if (!symbol)
|
||||
if (!symbol) {
|
||||
return make_tuple(syntax_grammar, lexical_grammar,
|
||||
extra_token_error(rule->to_string()));
|
||||
}
|
||||
|
||||
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
|
||||
if (new_symbol.is_non_terminal()) {
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ namespace prepare_grammar {
|
|||
LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
|
||||
LexicalGrammar result(input_grammar);
|
||||
|
||||
for (Variable &variable : result.variables) {
|
||||
for (LexicalVariable &variable : result.variables) {
|
||||
variable.rule = rules::Choice::build(extract_choices(variable.rule));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8,10 +8,8 @@
|
|||
namespace tree_sitter {
|
||||
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
|
||||
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
|
||||
const vector<Production> &productions)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue