Allow anonymous tokens to be used in grammars' external token lists

This commit is contained in:
Max Brunsfeld 2017-03-17 16:31:29 -07:00
parent e2baf0930b
commit ed8fbff175
24 changed files with 282 additions and 183 deletions

View file

@ -38,7 +38,7 @@ class ParseTableBuilder {
set<string> conflicts;
ParseItemSetBuilder item_set_builder;
set<const Production *> fragile_productions;
vector<set<Symbol::Index>> incompatible_token_indices_by_index;
vector<set<Symbol>> incompatible_tokens_by_index;
bool allow_any_conflict;
public:
@ -109,10 +109,13 @@ class ParseTableBuilder {
void build_error_parse_state() {
ParseState error_state;
for (Symbol::Index i = 0; i < lexical_grammar.variables.size(); i++) {
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
bool has_non_reciprocal_conflict = false;
for (Symbol::Index incompatible_index : incompatible_token_indices_by_index[i]) {
if (!incompatible_token_indices_by_index[incompatible_index].count(i)) {
for (Symbol incompatible_token : incompatible_tokens_by_index[i]) {
if (incompatible_token.is_terminal() &&
!incompatible_tokens_by_index[incompatible_token.index].count(token)) {
has_non_reciprocal_conflict = true;
break;
}
@ -302,28 +305,25 @@ class ParseTableBuilder {
}
void compute_unmergable_token_pairs() {
incompatible_token_indices_by_index.resize(lexical_grammar.variables.size());
incompatible_tokens_by_index.resize(lexical_grammar.variables.size());
// First, assume that all tokens are mutually incompatible.
for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
auto &incompatible_indices = incompatible_token_indices_by_index[i];
for (Symbol::Index j = 0; j < n; j++) {
if (j != i) incompatible_indices.insert(j);
}
}
// For the remaining possibly-incompatible pairs of tokens, check if they
// are actually incompatible by actually generating lexical states that
// contain them both.
auto lex_table_builder = LexTableBuilder::create(lexical_grammar);
for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
auto &incompatible_indices = incompatible_token_indices_by_index[i];
auto iter = incompatible_indices.begin();
while (iter != incompatible_indices.end()) {
if (lex_table_builder->detect_conflict(i, *iter)) {
++iter;
} else {
iter = incompatible_indices.erase(iter);
for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
Symbol token = Symbol::terminal(i);
auto &incompatible_indices = incompatible_tokens_by_index[i];
for (unsigned j = 0; j < n; j++) {
if (i == j) continue;
if (lex_table_builder->detect_conflict(i, j)) {
incompatible_indices.insert(Symbol::terminal(j));
}
}
for (const ExternalToken &external_token : grammar.external_tokens) {
if (external_token.corresponding_internal_token == token) {
for (unsigned j = 0; j < grammar.external_tokens.size(); j++) {
incompatible_indices.insert(Symbol::external(j));
}
}
}
}
@ -419,15 +419,14 @@ class ParseTableBuilder {
for (auto &entry : state.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];
auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index];
const auto &other_entry = other.terminal_entries.find(lookahead);
if (other_entry == other.terminal_entries.end()) {
if (lookahead.is_external()) return false;
if (!lookahead.is_built_in()) {
for (Symbol::Index incompatible_index : incompatible_token_indices) {
Symbol incompatible_symbol = Symbol::terminal(incompatible_index);
if (other.terminal_entries.count(incompatible_symbol)) return false;
for (const Symbol &incompatible_token : incompatible_tokens) {
if (other.terminal_entries.count(incompatible_token)) return false;
}
}
if (actions.back().type != ParseActionTypeReduce)
@ -444,14 +443,13 @@ class ParseTableBuilder {
for (auto &entry : other.terminal_entries) {
Symbol lookahead = entry.first;
const vector<ParseAction> &actions = entry.second.actions;
auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index];
auto &incompatible_tokens = incompatible_tokens_by_index[lookahead.index];
if (!state.terminal_entries.count(lookahead)) {
if (lookahead.is_external()) return false;
if (!lookahead.is_built_in()) {
for (Symbol::Index incompatible_index : incompatible_token_indices) {
Symbol incompatible_symbol = Symbol::terminal(incompatible_index);
if (state.terminal_entries.count(incompatible_symbol)) return false;
for (const Symbol &incompatible_token : incompatible_tokens) {
if (state.terminal_entries.count(incompatible_token)) return false;
}
}
if (actions.back().type != ParseActionTypeReduce)

View file

@ -526,7 +526,7 @@ class CCodeGenerator {
// Helper functions
string external_token_id(Symbol::Index index) {
return "ts_external_token_" + syntax_grammar.external_tokens[index].name;
return "ts_external_token_" + sanitize_name(syntax_grammar.external_tokens[index].name);
}
string symbol_id(const Symbol &symbol) {

View file

@ -16,29 +16,21 @@ enum VariableType {
VariableTypeNamed,
};
struct ExternalToken {
struct Variable {
std::string name;
VariableType type;
rules::Symbol corresponding_internal_token;
rules::Rule rule;
inline bool operator==(const ExternalToken &other) const {
return name == other.name &&
type == other.type &&
corresponding_internal_token == other.corresponding_internal_token;
inline bool operator==(const Variable &other) const {
return name == other.name && rule == other.rule && type == other.type;
}
};
struct InputGrammar {
struct Variable {
std::string name;
VariableType type;
rules::Rule rule;
};
std::vector<Variable> variables;
std::vector<rules::Rule> extra_tokens;
std::vector<std::unordered_set<rules::NamedSymbol>> expected_conflicts;
std::vector<ExternalToken> external_tokens;
std::vector<Variable> external_tokens;
};
} // namespace tree_sitter

View file

@ -228,7 +228,7 @@ ParseGrammarResult parse_grammar(const string &input) {
error_message = result.error_message;
goto error;
}
grammar.variables.push_back(InputGrammar::Variable{
grammar.variables.push_back(Variable{
string(entry_json.name),
VariableTypeNamed,
result.rule
@ -293,18 +293,21 @@ ParseGrammarResult parse_grammar(const string &input) {
}
for (size_t i = 0, length = external_tokens_json.u.array.length; i < length; i++) {
json_value *token_name_json = external_tokens_json.u.array.values[i];
if (token_name_json->type != json_string) {
error_message = "External token values must be strings";
json_value *external_token_json = external_tokens_json.u.array.values[i];
auto result = parse_rule(external_token_json);
if (!result.error_message.empty()) {
error_message = "Invalid external token: " + result.error_message;
goto error;
}
string token_name = token_name_json->u.string.ptr;
grammar.external_tokens.push_back({
token_name,
VariableTypeNamed,
rules::NONE()
});
grammar.external_tokens.push_back(result.rule.match(
[](rules::NamedSymbol named_symbol) {
return Variable{named_symbol.value, VariableTypeNamed, named_symbol};
},
[](auto rule) {
return Variable{"", VariableTypeAnonymous, rule};
}
));
}
}

View file

@ -85,7 +85,7 @@ class ExpandRepeats {
return apply(rule);
}
vector<InitialSyntaxGrammar::Variable> aux_rules;
vector<Variable> aux_rules;
};
InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {

View file

@ -156,7 +156,7 @@ class TokenExtractor {
}
vector<size_t> token_usage_counts;
vector<InternedGrammar::Variable> tokens;
vector<Variable> tokens;
};
tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
@ -167,8 +167,8 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
SymbolReplacer symbol_replacer;
TokenExtractor extractor;
// First, extract all of the grammar's tokens into the lexical grammar.
vector<InitialSyntaxGrammar::Variable> processed_variables;
// Extract all of the grammar's tokens into the lexical grammar.
vector<Variable> processed_variables;
for (const auto &variable : grammar.variables) {
processed_variables.push_back({
variable.name,
@ -177,6 +177,15 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
});
}
vector<Variable> processed_external_tokens;
for (const auto &external_token : grammar.external_tokens) {
processed_external_tokens.push_back({
external_token.name,
external_token.type,
extractor.apply(external_token.rule)
});
}
for (const auto &extracted_token : extractor.tokens) {
auto expansion = expand_token(extracted_token.rule);
if (expansion.error) return make_tuple(
@ -269,12 +278,22 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
if (error) return make_tuple(syntax_grammar, lexical_grammar, error);
}
for (const ExternalToken &external_token : grammar.external_tokens) {
Symbol internal_token = symbol_replacer.replace_symbol(
external_token.corresponding_internal_token
);
for (const auto &external_token : processed_external_tokens) {
Rule new_rule = symbol_replacer.apply(external_token.rule);
if (internal_token.is_non_terminal()) {
if (!new_rule.is<Symbol>()) {
return make_tuple(
syntax_grammar,
lexical_grammar,
CompileError(
TSCompileErrorTypeInvalidExternalToken,
"Non-symbol rule expressions can't be used as external tokens"
)
);
}
Symbol symbol = new_rule.get_unchecked<Symbol>();
if (symbol.is_non_terminal()) {
return make_tuple(
syntax_grammar,
lexical_grammar,
@ -285,11 +304,19 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
);
}
syntax_grammar.external_tokens.push_back(ExternalToken{
external_token.name,
external_token.type,
internal_token
});
if (symbol.is_external()) {
syntax_grammar.external_tokens.push_back(ExternalToken{
external_token.name,
external_token.type,
rules::NONE()
});
} else {
syntax_grammar.external_tokens.push_back(ExternalToken{
lexical_grammar.variables[symbol.index].name,
external_token.type,
symbol
});
}
}
return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());

View file

@ -89,7 +89,7 @@ class FlattenRule {
}
};
SyntaxVariable flatten_rule(const InitialSyntaxGrammar::Variable &variable) {
SyntaxVariable flatten_rule(const Variable &variable) {
vector<Production> productions;
for (const Rule &rule_component : extract_choices(variable.rule)) {

View file

@ -11,7 +11,7 @@
namespace tree_sitter {
namespace prepare_grammar {
SyntaxVariable flatten_rule(const InitialSyntaxGrammar::Variable &variable);
SyntaxVariable flatten_rule(const Variable &variable);
std::pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &);
} // namespace prepare_grammar

View file

@ -5,22 +5,13 @@
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace prepare_grammar {
struct InitialSyntaxGrammar {
struct Variable {
std::string name;
VariableType type;
rules::Rule rule;
inline bool operator==(const Variable &other) const {
return name == other.name && type == other.type && rule == other.rule;
}
};
std::vector<Variable> variables;
std::set<rules::Symbol> extra_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;

View file

@ -21,14 +21,21 @@ class SymbolInterner {
public:
Rule apply(const Rule &rule) {
return rule.match(
[&](const rules::Blank &blank) -> Rule { return blank; },
[&](const rules::Blank &blank) -> Rule {
return blank;
},
[&](const rules::NamedSymbol &symbol) {
return intern_symbol(symbol);
},
[&](const rules::String &string) { return string; },
[&](const rules::Pattern &pattern) { return pattern; },
[&](const rules::String &string) {
return string;
},
[&](const rules::Pattern &pattern) {
return pattern;
},
[&](const rules::Choice &choice) {
vector<rules::Rule> elements;
@ -58,12 +65,18 @@ class SymbolInterner {
}
Symbol intern_symbol(rules::NamedSymbol named_symbol) {
for (size_t i = 0; i < grammar.variables.size(); i++)
if (grammar.variables[i].name == named_symbol.value)
for (size_t i = 0; i < grammar.variables.size(); i++) {
if (grammar.variables[i].name == named_symbol.value) {
return Symbol::non_terminal(i);
for (size_t i = 0; i < grammar.external_tokens.size(); i++)
if (grammar.external_tokens[i].name == named_symbol.value)
}
}
for (size_t i = 0; i < grammar.external_tokens.size(); i++) {
if (grammar.external_tokens[i].name == named_symbol.value) {
return Symbol::external(i);
}
}
missing_rule_name = named_symbol.value;
return rules::NONE();
}
@ -81,23 +94,21 @@ CompileError missing_rule_error(string rule_name) {
pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar) {
InternedGrammar result;
SymbolInterner interner(grammar);
for (auto &external_token : grammar.external_tokens) {
Symbol corresponding_internal_token = rules::NONE();
for (size_t i = 0, n = grammar.variables.size(); i < n; i++) {
if (grammar.variables[i].name == external_token.name) {
corresponding_internal_token = Symbol::non_terminal(i);
break;
}
auto new_rule = interner.apply(external_token.rule);
if (!interner.missing_rule_name.empty()) {
return { result, missing_rule_error(interner.missing_rule_name) };
}
result.external_tokens.push_back(ExternalToken{
result.external_tokens.push_back(Variable{
external_token.name,
external_token.name[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
corresponding_internal_token
external_token.name[0] == '_' ? VariableTypeHidden : external_token.type,
new_rule
});
}
SymbolInterner interner(grammar);
for (auto &variable : grammar.variables) {
auto new_rule = interner.apply(variable.rule);
@ -105,7 +116,7 @@ pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar)
return { result, missing_rule_error(interner.missing_rule_name) };
}
result.variables.push_back(InternedGrammar::Variable{
result.variables.push_back(Variable{
variable.name,
variable.name[0] == '_' ? VariableTypeHidden : VariableTypeNamed,
new_rule
@ -131,7 +142,7 @@ pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar)
result.expected_conflicts.insert(entry);
}
return { result, CompileError::none() };
return {result, CompileError::none()};
}
} // namespace prepare_grammar

View file

@ -11,20 +11,10 @@ namespace tree_sitter {
namespace prepare_grammar {
struct InternedGrammar {
struct Variable {
std::string name;
VariableType type;
rules::Rule rule;
bool operator==(const Variable &other) const {
return name == other.name && type == other.type && rule == other.rule;
}
};
std::vector<Variable> variables;
std::vector<rules::Rule> extra_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;
std::vector<ExternalToken> external_tokens;
std::vector<Variable> external_tokens;
};
} // namespace prepare_grammar

View file

@ -30,6 +30,18 @@ struct SyntaxVariable {
using ConflictSet = std::set<rules::Symbol>;
// Describes one entry of a syntax grammar's external token list.
//
// Fields:
//   name  - the token's name.
//   type  - the token's variable type.
//   corresponding_internal_token - the symbol of the internal token associated
//       with this external token.
//       NOTE(review): token extraction appears to store rules::NONE() here when
//       no internal token corresponds - confirm against extract_tokens.
struct ExternalToken {
  std::string name;
  VariableType type;
  rules::Symbol corresponding_internal_token;

  // Two external tokens are equal only when all three fields match. Fields are
  // compared in declaration order, stopping at the first mismatch.
  inline bool operator==(const ExternalToken &other) const {
    if (!(name == other.name)) return false;
    if (!(type == other.type)) return false;
    return corresponding_internal_token == other.corresponding_internal_token;
  }
};
struct SyntaxGrammar {
std::vector<SyntaxVariable> variables;
std::set<rules::Symbol> extra_tokens;