Avoid introducing certain lexical conflicts during parse state merging

The current pretty conservative approach is to avoid merging parse states which
would cause a pair of tokens to co-exist for the first time in any parse state,
where the two tokens can start with the same character and at least one of the
tokens can contain a character which is part of the grammar's separators.
This commit is contained in:
Max Brunsfeld 2017-02-27 22:54:38 -08:00
parent 3c8e6f9987
commit 686dc0997c
24 changed files with 305 additions and 158 deletions

View file

@ -25,6 +25,7 @@
'src/compiler/compile.cc',
'src/compiler/generate_code/c_code.cc',
'src/compiler/lex_table.cc',
'src/compiler/lexical_grammar.cc',
'src/compiler/parse_grammar.cc',
'src/compiler/parse_table.cc',
'src/compiler/precedence_range.cc',

View file

@ -14,17 +14,18 @@ START_TEST
describe("recovery_tokens(rule)", []() {
it("includes rules that can only begin and end with an explicit set of characters", [&]() {
LexicalGrammar grammar;
grammar.separators = {
character({ ' ' }),
};
grammar.variables = {
Variable("var0", VariableTypeNamed, character({}, false)),
Variable("var1", VariableTypeNamed, seq({
LexicalVariable("var0", VariableTypeNamed, character({}, false), false),
LexicalVariable("var1", VariableTypeNamed, seq({
character({ 'a', 'b' }),
character({}, false),
character({ 'c', 'd' }),
})),
}), false),
};
AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({ Symbol(1, Symbol::Terminal) }));

View file

@ -13,11 +13,10 @@ START_TEST
describe("LexItem", []() {
describe("completion_status()", [&]() {
it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
it("indicates whether the item is done and its precedence", [&]() {
LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
AssertThat(item1.completion_status().is_done, IsFalse());
AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item1.completion_status().is_string, IsFalse());
MetadataParams params;
params.precedence = 3;
@ -30,12 +29,10 @@ describe("LexItem", []() {
AssertThat(item2.completion_status().is_done, IsTrue());
AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
AssertThat(item2.completion_status().is_string, IsTrue());
LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
AssertThat(item3.completion_status().is_done, IsTrue());
AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
AssertThat(item3.completion_status().is_string, IsFalse());
});
});
});

View file

@ -12,12 +12,13 @@ using namespace rules;
START_TEST
describe("ParseItemSetBuilder", []() {
vector<Variable> lexical_variables;
vector<LexicalVariable> lexical_variables;
for (size_t i = 0; i < 20; i++) {
lexical_variables.push_back(Variable{
lexical_variables.push_back({
"token_" + to_string(i),
VariableTypeNamed,
blank(),
false
});
}

View file

@ -15,89 +15,149 @@ describe("expand_tokens", []() {
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
str("xyz"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar {
{
LexicalVariable {
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
str("xyz"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
LexicalVariable {
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
metadata(seq({
character({ 'x' }),
character({ 'y' }),
character({ 'z' }),
}), string_token_params),
i_sym(11),
}),
false
}
}));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
}, {}};
LexicalGrammar grammar {
{
LexicalVariable {
"rule_A",
VariableTypeNamed,
str("\u03B1 \u03B2"),
false
},
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params)),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
LexicalVariable {
"rule_A",
VariableTypeNamed,
metadata(seq({
character({ 945 }),
character({ ' ' }),
character({ 946 }),
}), string_token_params),
false
}
}));
});
});
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
pattern("x*"),
i_sym(11),
})),
}, {}};
LexicalGrammar grammar {
{
LexicalVariable {
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
pattern("x*"),
i_sym(11),
}),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
})),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
LexicalVariable {
"rule_A",
VariableTypeNamed,
seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11),
}),
false
}
}));
});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
}, {}};
LexicalGrammar grammar {
{
LexicalVariable {
"rule_A",
VariableTypeNamed,
pattern("[^\u03B1-\u03B4]*"),
false
}
},
{}
};
auto result = expand_tokens(grammar);
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
})));
AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
LexicalVariable {
"rule_A",
VariableTypeNamed,
repeat(character({ 945, 946, 947, 948 }, false)),
false
}
}));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar{{
Variable("rule_A", VariableTypeNamed, seq({
pattern("("),
str("xyz"),
pattern("["),
}))
}, {}};
LexicalGrammar grammar {
{
LexicalVariable {
"rule_A",
VariableTypeNamed,
seq({
pattern("("),
str("xyz"),
pattern("["),
}),
false
},
},
{}
};
auto result = expand_tokens(grammar);

View file

@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar;
describe("extract_tokens", []() {
it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))),
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar {
{
Variable("rule_A", VariableTypeNamed, repeat1(seq({
str("ab"),
pattern("cd*"),
choice({
i_sym(1),
i_sym(2),
token(repeat1(choice({ str("ef"), str("gh") }))),
}),
}))),
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })),
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3)))
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
@ -64,46 +69,51 @@ describe("extract_tokens", []() {
Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))),
})));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable>({
// Strings become anonymous rules.
Variable("ab", VariableTypeAnonymous, str("ab")),
LexicalVariable("ab", VariableTypeAnonymous, str("ab"), true),
// Patterns become hidden rules.
Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")),
LexicalVariable("/cd*/", VariableTypeAuxiliary, pattern("cd*"), false),
// Rules marked as tokens become hidden rules.
Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
LexicalVariable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({
str("ef"),
str("gh")
}))),
})), false),
// This named rule was moved wholesale to the lexical grammar.
Variable("rule_B", VariableTypeNamed, pattern("ij+")),
LexicalVariable("rule_B", VariableTypeNamed, pattern("ij+"), false),
// Strings become anonymous rules.
Variable("kl", VariableTypeAnonymous, str("kl")),
LexicalVariable("kl", VariableTypeAnonymous, str("kl"), true),
})));
});
it("does not create duplicate tokens in the lexical grammar", [&]() {
auto result = extract_tokens(InternedGrammar{{
Variable("rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})),
}, {}, {}, {}});
auto result = extract_tokens(InternedGrammar {
{
Variable("rule_A", VariableTypeNamed, seq({
str("ab"),
i_sym(0),
str("ab"),
})),
},
{},
{},
{}
});
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })),
})));
AssertThat(syntax_grammar.variables, Equals(vector<Variable> {
Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })},
}));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
})))
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
}))
});
it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() {
@ -122,11 +132,11 @@ describe("extract_tokens", []() {
Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })),
})));
AssertThat(lexical_grammar.variables, Equals(vector<Variable>({
Variable("ab", VariableTypeAnonymous, str("ab")),
Variable("cd", VariableTypeAnonymous, str("cd")),
Variable("ef", VariableTypeAnonymous, str("ef")),
})));
AssertThat(lexical_grammar.variables, Equals(vector<LexicalVariable> {
LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true},
LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true},
LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true},
}));
});
it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() {

View file

@ -1,6 +1,8 @@
#include "rule_helpers.h"
#include <memory>
#include "compiler/rules/symbol.h"
#include "compiler/variable.h"
#include "compiler/lexical_grammar.h"
namespace tree_sitter {
using std::make_shared;
@ -52,4 +54,9 @@ namespace tree_sitter {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type;
}
bool operator==(const LexicalVariable &left, const LexicalVariable &right) {
return left.name == right.name && left.rule->operator==(*right.rule) &&
left.type == right.type && left.is_string == right.is_string;
}
}

View file

@ -15,7 +15,11 @@ namespace tree_sitter {
rule_ptr i_token(size_t index);
rule_ptr active_prec(int precedence, rule_ptr);
struct Variable;
struct LexicalVariable;
bool operator==(const Variable &left, const Variable &right);
bool operator==(const LexicalVariable &left, const LexicalVariable &right);
}
#endif // HELPERS_RULE_HELPERS_H_

View file

@ -3,6 +3,7 @@
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/lex_item.h"
@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) {
return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}");
}
ostream &operator<<(ostream &stream, const LexicalVariable &variable) {
return stream << "{" << variable.name << ", " << variable.rule << ", " <<
to_string(variable.type) << ", " << to_string(variable.is_string) << "}";
}
std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) {
return stream << string("#<advance ") + to_string(action.state_index) + ">";
}

View file

@ -93,6 +93,7 @@ using std::string;
using std::to_string;
struct Variable;
struct SyntaxVariable;
struct LexicalVariable;
struct AdvanceAction;
struct AcceptTokenAction;
class ParseAction;
@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const Variable &);
ostream &operator<<(ostream &, const SyntaxVariable &);
ostream &operator<<(ostream &, const LexicalVariable &);
ostream &operator<<(ostream &, const AdvanceAction &);
ostream &operator<<(ostream &, const AcceptTokenAction &);
ostream &operator<<(ostream &, const ParseAction &);

View file

@ -99,7 +99,8 @@ class LexTableBuilder {
LexItem::CompletionStatus completion_status = item.completion_status();
if (completion_status.is_done) {
AcceptTokenAction action(item.lhs, completion_status.precedence.max,
completion_status.is_string);
item.lhs.is_built_in() ||
lex_grammar.variables[item.lhs.index].is_string);
auto current_action = lex_table.state(state_id).accept_action;
if (conflict_manager.resolve(action, current_action))

View file

@ -72,10 +72,11 @@ class ParseTableBuilder {
}));
CompileError error = process_part_state_queue();
if (error.type != TSCompileErrorTypeNone)
if (error.type != TSCompileErrorTypeNone) {
return { parse_table, error };
}
parse_table.mergeable_symbols = compatible_tokens.recovery_tokens;
update_unmergable_token_pairs();
build_error_parse_state();
@ -111,7 +112,7 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (const Symbol symbol : parse_table.mergeable_symbols) {
for (const Symbol symbol : compatible_tokens.recovery_tokens) {
add_out_of_context_parse_state(&error_state, symbol);
}
@ -292,6 +293,25 @@ class ParseTableBuilder {
}
}
void update_unmergable_token_pairs() {
for (const ParseState &state : parse_table.states) {
for (Symbol::Index token_index = 0, token_count = lexical_grammar.variables.size(); token_index < token_count; token_index++) {
Symbol token(token_index, Symbol::Terminal);
if (state.terminal_entries.count(token)) {
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[token_index];
auto iter = incompatible_token_indices.begin();
while (iter != incompatible_token_indices.end()) {
if (state.terminal_entries.count(Symbol(*iter, Symbol::NonTerminal))) {
iter = incompatible_token_indices.erase(iter);
} else {
++iter;
}
}
}
}
}
}
void remove_duplicate_parse_states() {
map<size_t, set<ParseStateId>> state_indices_by_signature;
@ -382,11 +402,19 @@ class ParseTableBuilder {
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (!lookahead.is_built_in()) {
if (!compatible_tokens.recovery_tokens.count(lookahead))
return false;
for (Symbol::Index incompatible_index : incompatible_token_indices) {
if (other.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
return false;
}
}
}
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(other, entry.second))
@ -401,10 +429,18 @@ class ParseTableBuilder {
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];
if (!state.terminal_entries.count(lookahead)) {
if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
return false;
if (!lookahead.is_built_in()) {
if (!compatible_tokens.recovery_tokens.count(lookahead))
return false;
for (Symbol::Index incompatible_index : incompatible_token_indices) {
if (state.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
return false;
}
}
}
if (actions.back().type != ParseActionTypeReduce)
return false;
if (!has_entry(state, entry.second))
@ -629,7 +665,7 @@ class ParseTableBuilder {
switch (symbol.type) {
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
if (variable.type == VariableTypeNamed)
return variable.name;
else

View file

@ -83,6 +83,7 @@ using FirstCharactersIntersector = CharacterIntersector<true, false>;
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
CompatibleTokensResult result;
result.unmergeable_pairs.resize(grammar.variables.size());
AllCharacters all_separator_characters;
for (const rule_ptr &separator : grammar.separators)
@ -90,7 +91,8 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
for (size_t i = 0; i < grammar.variables.size(); i++) {
Symbol symbol(i, Symbol::Terminal);
rule_ptr rule = grammar.variables[i].rule;
const LexicalVariable &variable = grammar.variables[i];
rule_ptr rule = variable.rule;
FirstCharacters first_characters;
first_characters.apply(rule);
@ -109,18 +111,20 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
!last_characters.result.includes_all &&
!last_characters.result.intersects(all_separator_characters.result);
bool has_no_separators =
!all_characters.result.intersects(all_separator_characters.result);
bool has_separators =
all_characters.result.intersects(all_separator_characters.result);
if ((has_distinct_start && has_distinct_end) || has_no_separators)
if ((has_distinct_start && has_distinct_end) || !has_separators)
result.recovery_tokens.insert(symbol);
for (size_t j = 0; j < grammar.variables.size(); j++) {
if (j == i) continue;
Symbol other_symbol(j, Symbol::Terminal);
FirstCharactersIntersector intersector(&first_characters.result);
if (intersector.apply(grammar.variables[j].rule)) {
result.unmergeable_pairs[symbol].insert(other_symbol);
for (size_t j = 0; j < i; j++) {
const LexicalVariable &other_variable = grammar.variables[j];
if (has_separators) {
FirstCharactersIntersector intersector(&first_characters.result);
if (intersector.apply(other_variable.rule)) {
result.unmergeable_pairs[i].insert(j);
result.unmergeable_pairs[j].insert(i);
}
}
}
}

View file

@ -3,8 +3,9 @@
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
#include <map>
#include <vector>
#include <set>
#include <unordered_set>
namespace tree_sitter {
@ -14,7 +15,7 @@ namespace build_tables {
struct CompatibleTokensResult {
std::set<rules::Symbol> recovery_tokens;
std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
std::vector<std::unordered_set<rules::Symbol::Index>> unmergeable_pairs;
};
CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);

View file

@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
CompletionStatus apply_to(const rules::Choice *rule) {
for (const auto &element : rule->elements) {
CompletionStatus status = apply(element);
if (status.is_done)
return status;
if (status.is_done) return status;
}
return { false, PrecedenceRange(), false };
return { false, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Metadata *rule) {
CompletionStatus result = apply(rule->rule);
if (result.is_done) {
if (result.precedence.empty && rule->params.has_precedence)
result.precedence.add(rule->params.precedence);
if (rule->params.is_string)
result.is_string = true;
if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
result.precedence.add(rule->params.precedence);
}
return result;
}
@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
}
CompletionStatus apply_to(const rules::Blank *rule) {
return { true, PrecedenceRange(), false };
return { true, PrecedenceRange() };
}
CompletionStatus apply_to(const rules::Seq *rule) {
CompletionStatus left_status = apply(rule->left);
if (left_status.is_done)
if (left_status.is_done) {
return apply(rule->right);
else
return { false, PrecedenceRange(), false };
} else {
return { false, PrecedenceRange() };
}
}
};
@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {
LexItemSet::TransitionMap LexItemSet::transitions() const {
TransitionMap result;
for (const LexItem &item : entries)
for (const LexItem &item : entries) {
lex_item_transitions(&result, item);
}
return result;
}

View file

@ -19,7 +19,6 @@ class LexItem {
struct CompletionStatus {
bool is_done;
PrecedenceRange precedence;
bool is_string;
};
bool operator==(const LexItem &other) const;

View file

@ -561,7 +561,7 @@ class CCodeGenerator {
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External:

View file

@ -0,0 +1,11 @@
#include "compiler/lexical_grammar.h"
namespace tree_sitter {
using std::string;
LexicalVariable::LexicalVariable(
const string &name, VariableType type, const rule_ptr &rule, bool is_string)
: name(name), rule(rule), type(type), is_string(is_string) {}
} // namespace tree_sitter

View file

@ -9,8 +9,17 @@
namespace tree_sitter {
struct LexicalVariable {
LexicalVariable(const std::string &, VariableType, const rule_ptr &, bool);
std::string name;
rule_ptr rule;
VariableType type;
bool is_string;
};
struct LexicalGrammar {
std::vector<Variable> variables;
std::vector<LexicalVariable> variables;
std::vector<rule_ptr> separators;
};

View file

@ -96,8 +96,6 @@ class ParseTable {
std::vector<ParseState> states;
std::map<rules::Symbol, ParseTableSymbolMetadata> symbols;
std::set<rules::Symbol> mergeable_symbols;
};
} // namespace tree_sitter

View file

@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
LexicalGrammar result;
ExpandTokens expander;
for (const Variable &variable : grammar.variables) {
for (const LexicalVariable &variable : grammar.variables) {
auto rule = expander.apply(variable.rule);
if (expander.error.type)
return { result, expander.error };
result.variables.push_back(Variable(variable.name, variable.type, rule));
result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
}
for (auto &sep : grammar.separators) {

View file

@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
class TokenExtractor : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].rule->operator==(*input)) {
token_usage_counts[i]++;
@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {
rule_ptr rule = input->copy();
size_t index = tokens.size();
tokens.push_back(Variable(token_description(rule), entry_type, rule));
tokens.push_back({token_description(rule), entry_type, rule, is_string});
token_usage_counts.push_back(1);
return make_shared<Symbol>(index, Symbol::Terminal);
}
rule_ptr apply_to(const rules::String *rule) {
return apply_to_token(rule, VariableTypeAnonymous);
return apply_to_token(rule, VariableTypeAnonymous, true);
}
rule_ptr apply_to(const rules::Pattern *rule) {
return apply_to_token(rule, VariableTypeAuxiliary);
return apply_to_token(rule, VariableTypeAuxiliary, false);
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->params.is_token)
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
else
if (rule->params.is_token) {
return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
} else {
return rules::IdentityRuleFn::apply_to(rule);
}
}
public:
vector<size_t> token_usage_counts;
vector<Variable> tokens;
vector<LexicalVariable> tokens;
};
static CompileError extra_token_error(const string &message) {
@ -139,8 +140,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
ConflictSet new_conflict_set;
for (const Symbol &symbol : conflict_set)
for (const Symbol &symbol : conflict_set) {
new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
}
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
@ -154,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
for (const rule_ptr &rule : grammar.extra_tokens) {
int i = 0;
bool used_elsewhere_in_grammar = false;
for (const Variable &variable : lexical_grammar.variables) {
for (const LexicalVariable &variable : lexical_grammar.variables) {
if (variable.rule->operator==(*rule)) {
syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
used_elsewhere_in_grammar = true;
@ -171,9 +173,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
}
auto symbol = rule->as<Symbol>();
if (!symbol)
if (!symbol) {
return make_tuple(syntax_grammar, lexical_grammar,
extra_token_error(rule->to_string()));
}
Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
if (new_symbol.is_non_terminal()) {

View file

@ -8,7 +8,7 @@ namespace prepare_grammar {
LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
LexicalGrammar result(input_grammar);
for (Variable &variable : result.variables) {
for (LexicalVariable &variable : result.variables) {
variable.rule = rules::Choice::build(extract_choices(variable.rule));
}

View file

@ -8,10 +8,8 @@
namespace tree_sitter {
using std::string;
using std::to_string;
using std::pair;
using std::vector;
using std::set;
SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
const vector<Production> &productions)