Merge pull request #197 from tree-sitter/extract-simple-aliases

Simplify treatment of rules that are always aliased one way
This commit is contained in:
Max Brunsfeld 2018-08-30 10:48:46 -07:00 committed by GitHub
commit 6caa19d912
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 273 additions and 79 deletions

View file

@ -29,6 +29,7 @@
'src/compiler/prepare_grammar/expand_repeats.cc',
'src/compiler/prepare_grammar/expand_tokens.cc',
'src/compiler/prepare_grammar/extract_choices.cc',
'src/compiler/prepare_grammar/extract_simple_aliases.cc',
'src/compiler/prepare_grammar/extract_tokens.cc',
'src/compiler/prepare_grammar/flatten_grammar.cc',
'src/compiler/prepare_grammar/intern_symbols.cc',

View file

@ -45,6 +45,7 @@ struct ParseStateQueueEntry {
class ParseTableBuilderImpl : public ParseTableBuilder {
const SyntaxGrammar grammar;
const LexicalGrammar lexical_grammar;
const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases;
unordered_map<ParseItemSet, ParseStateId> state_ids_by_item_set;
vector<const ParseItemSet *> item_sets_by_state_id;
deque<ParseStateQueueEntry> parse_state_queue;
@ -56,9 +57,13 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
set<std::pair<Symbol, Symbol>> logged_conflict_tokens;
public:
ParseTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar)
: grammar(syntax_grammar),
ParseTableBuilderImpl(
const SyntaxGrammar &syntax_grammar,
const LexicalGrammar &lexical_grammar,
const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases
) : grammar(syntax_grammar),
lexical_grammar(lexical_grammar),
simple_aliases(simple_aliases),
item_set_builder(syntax_grammar, lexical_grammar) {}
BuildResult build() {
@ -403,12 +408,12 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
}
void eliminate_unit_reductions() {
set<Symbol::Index> aliased_symbols;
set<Symbol> aliased_symbols;
for (auto &variable : grammar.variables) {
for (auto &production : variable.productions) {
for (auto &step : production) {
if (!step.alias.value.empty()) {
aliased_symbols.insert(step.symbol.index);
aliased_symbols.insert(step.symbol);
}
}
}
@ -430,7 +435,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
if (action.type == ParseActionTypeReduce &&
action.consumed_symbol_count == 1 &&
action.alias_sequence_id == 0 &&
!aliased_symbols.count(action.symbol.index) &&
!simple_aliases.count(action.symbol) &&
!aliased_symbols.count(action.symbol) &&
grammar.variables[action.symbol.index].type != VariableTypeNamed &&
(unit_reduction_symbol == -1 || unit_reduction_symbol == action.symbol.index)
) {
@ -887,9 +893,14 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
unique_ptr<ParseTableBuilder> ParseTableBuilder::create(
const SyntaxGrammar &syntax_grammar,
const LexicalGrammar &lexical_grammar
const LexicalGrammar &lexical_grammar,
const std::unordered_map<rules::Symbol, rules::Alias> &simple_aliases
) {
return unique_ptr<ParseTableBuilder>(new ParseTableBuilderImpl(syntax_grammar, lexical_grammar));
return unique_ptr<ParseTableBuilder>(new ParseTableBuilderImpl(
syntax_grammar,
lexical_grammar,
simple_aliases
));
}
ParseTableBuilder::BuildResult ParseTableBuilder::build() {

View file

@ -2,6 +2,7 @@
#define COMPILER_BUILD_TABLES_PARSE_TABLE_BUILDER_H_
#include <memory>
#include <unordered_map>
#include "compiler/parse_table.h"
#include "compiler/compile_error.h"
@ -16,7 +17,11 @@ namespace build_tables {
class ParseTableBuilder {
public:
static std::unique_ptr<ParseTableBuilder> create(const SyntaxGrammar &, const LexicalGrammar &);
static std::unique_ptr<ParseTableBuilder> create(
const SyntaxGrammar &,
const LexicalGrammar &,
const std::unordered_map<rules::Symbol, rules::Alias> &
);
struct BuildResult {
ParseTable parse_table;

View file

@ -22,19 +22,23 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file)
ParseGrammarResult parse_result = parse_grammar(string(input));
if (!parse_result.error_message.empty()) {
return { nullptr, strdup(parse_result.error_message.c_str()),
TSCompileErrorTypeInvalidGrammar };
return {nullptr, strdup(parse_result.error_message.c_str()), TSCompileErrorTypeInvalidGrammar};
}
auto prepare_grammar_result = prepare_grammar::prepare_grammar(parse_result.grammar);
SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result);
LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result);
CompileError error = get<2>(prepare_grammar_result);
SyntaxGrammar &syntax_grammar = prepare_grammar_result.syntax_grammar;
LexicalGrammar &lexical_grammar = prepare_grammar_result.lexical_grammar;
auto &simple_aliases = prepare_grammar_result.simple_aliases;
CompileError error = prepare_grammar_result.error;
if (error.type) {
return {nullptr, strdup(error.message.c_str()), error.type};
}
auto builder = build_tables::ParseTableBuilder::create(syntax_grammar, lexical_grammar);
auto builder = build_tables::ParseTableBuilder::create(
syntax_grammar,
lexical_grammar,
simple_aliases
);
auto build_tables_result = builder->build();
error = build_tables_result.error;
if (error.type != 0) {
@ -48,11 +52,12 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file)
move(build_tables_result.keyword_lex_table),
build_tables_result.keyword_capture_token,
move(syntax_grammar),
move(lexical_grammar)
move(lexical_grammar),
move(simple_aliases)
);
set_log_file(nullptr);
return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone };
return {strdup(code.c_str()), nullptr, TSCompileErrorTypeNone};
}
} // namespace tree_sitter

View file

@ -23,6 +23,7 @@ using std::pair;
using std::set;
using std::string;
using std::to_string;
using std::unordered_map;
using std::unordered_set;
using std::vector;
using util::escape_char;
@ -77,6 +78,7 @@ class CCodeGenerator {
Symbol keyword_capture_token;
const SyntaxGrammar syntax_grammar;
const LexicalGrammar lexical_grammar;
unordered_map<Symbol, Alias> simple_aliases;
map<Symbol, string> symbol_ids;
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
vector<set<Symbol::Index>> external_scanner_states;
@ -84,18 +86,21 @@ class CCodeGenerator {
set<Alias> unique_aliases;
public:
CCodeGenerator(string name, ParseTable &&parse_table, LexTable &&main_lex_table,
LexTable &&keyword_lex_table, Symbol keyword_capture_token,
SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar)
: indent_level(0),
name(name),
parse_table(move(parse_table)),
main_lex_table(move(main_lex_table)),
keyword_lex_table(move(keyword_lex_table)),
keyword_capture_token(keyword_capture_token),
syntax_grammar(move(syntax_grammar)),
lexical_grammar(move(lexical_grammar)),
next_parse_action_list_index(0) {}
CCodeGenerator(
string name, ParseTable &&parse_table, LexTable &&main_lex_table,
LexTable &&keyword_lex_table, Symbol keyword_capture_token,
SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar,
unordered_map<Symbol, Alias> &&simple_aliases
) : indent_level(0),
name(name),
parse_table(move(parse_table)),
main_lex_table(move(main_lex_table)),
keyword_lex_table(move(keyword_lex_table)),
keyword_capture_token(keyword_capture_token),
syntax_grammar(move(syntax_grammar)),
lexical_grammar(move(lexical_grammar)),
simple_aliases(move(simple_aliases)),
next_parse_action_list_index(0) {}
string code() {
buffer = "";
@ -757,14 +762,28 @@ class CCodeGenerator {
}
string symbol_name(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
if (symbol == rules::END_OF_INPUT()) {
return "END";
}
auto simple_alias_entry = simple_aliases.find(symbol);
if (simple_alias_entry != simple_aliases.end()) {
return simple_alias_entry->second.value;
}
return entry_for_symbol(symbol).first;
}
VariableType symbol_type(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
if (symbol == rules::END_OF_INPUT()) {
return VariableTypeHidden;
}
auto simple_alias_entry = simple_aliases.find(symbol);
if (simple_alias_entry != simple_aliases.end()) {
return simple_alias_entry->second.is_named ? VariableTypeNamed : VariableTypeHidden;
}
return entry_for_symbol(symbol).second;
}
@ -874,9 +893,12 @@ class CCodeGenerator {
}
};
string c_code(string name, ParseTable &&parse_table, LexTable &&lex_table,
LexTable &&keyword_lex_table, Symbol keyword_capture_token,
SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar) {
string c_code(
string name, ParseTable &&parse_table, LexTable &&lex_table,
LexTable &&keyword_lex_table, Symbol keyword_capture_token,
SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar,
unordered_map<Symbol, Alias> &&simple_aliases
) {
return CCodeGenerator(
name,
move(parse_table),
@ -884,7 +906,8 @@ string c_code(string name, ParseTable &&parse_table, LexTable &&lex_table,
move(keyword_lex_table),
keyword_capture_token,
move(syntax_grammar),
move(lexical_grammar)
move(lexical_grammar),
move(simple_aliases)
).code();
}

View file

@ -2,6 +2,7 @@
#define COMPILER_GENERATE_CODE_C_CODE_H_
#include <string>
#include <unordered_map>
#include "compiler/rule.h"
namespace tree_sitter {
@ -20,7 +21,8 @@ std::string c_code(
LexTable &&,
rules::Symbol,
SyntaxGrammar &&,
LexicalGrammar &&
LexicalGrammar &&,
std::unordered_map<rules::Symbol, rules::Alias> &&
);
} // namespace generate_code

View file

@ -0,0 +1,111 @@
#include "compiler/prepare_grammar/extract_simple_aliases.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
#include <unordered_map>
#include <vector>
namespace tree_sitter {
namespace prepare_grammar {
using std::pair;
using std::vector;
using std::unordered_map;
using rules::Alias;
using rules::Symbol;
// Overwrites the name and type of `variable` with those described by `alias`.
// An alias whose value is empty means "no alias", so the variable is left as-is.
template <typename T>
static void apply_alias(T *variable, Alias alias) {
  if (alias.value.empty()) {
    return;
  }

  variable->name = alias.value;
  if (alias.is_named) {
    variable->type = VariableTypeNamed;
  } else {
    variable->type = VariableTypeAnonymous;
  }
}
// Finds every symbol that is aliased in exactly the same way at each of its
// uses in the syntax grammar's productions. The aliases of those symbols are
// cleared from the production steps, and a map from each such symbol to its
// alias is returned.
std::unordered_map<rules::Symbol, rules::Alias>
extract_simple_aliases(SyntaxGrammar *syntax_grammar, LexicalGrammar *lexical_grammar) {
  // Bookkeeping for one symbol: the one alias observed so far (empty if none),
  // and whether the symbol can still qualify as a simple alias.
  struct SymbolStatus {
    Alias alias;
    bool eligible = true;
  };

  vector<SymbolStatus> terminal_statuses(lexical_grammar->variables.size());
  vector<SymbolStatus> non_terminal_statuses(syntax_grammar->variables.size());
  vector<SymbolStatus> external_statuses(syntax_grammar->external_tokens.size());

  // Maps a symbol to its status slot. Built-in symbols are not tracked and
  // yield a null pointer.
  auto status_for = [&](const Symbol &symbol) -> SymbolStatus * {
    if (symbol.is_built_in()) return nullptr;
    if (symbol.is_external()) return &external_statuses[symbol.index];
    if (symbol.is_terminal()) return &terminal_statuses[symbol.index];
    return &non_terminal_statuses[symbol.index];
  };

  // Pass 1: classify each symbol by inspecting every production step.
  for (const SyntaxVariable &variable : syntax_grammar->variables) {
    for (const Production &production : variable.productions) {
      for (const ProductionStep &step : production.steps) {
        SymbolStatus *status = status_for(step.symbol);
        if (status == nullptr) continue;

        // A single un-aliased use permanently disqualifies the symbol.
        if (step.alias.value.empty()) {
          status->alias = Alias();
          status->eligible = false;
          continue;
        }

        if (!status->eligible) continue;

        if (status->alias.value.empty()) {
          // First aliased use: remember the alias.
          status->alias = step.alias;
        } else if (status->alias != step.alias) {
          // A second, different alias also disqualifies the symbol.
          status->alias = Alias();
          status->eligible = false;
        }
      }
    }
  }

  // Pass 2: strip the per-step alias wherever the symbol has a simple alias.
  for (SyntaxVariable &variable : syntax_grammar->variables) {
    for (Production &production : variable.productions) {
      for (ProductionStep &step : production.steps) {
        SymbolStatus *status = status_for(step.symbol);
        if (status == nullptr) continue;
        if (!status->alias.value.empty()) {
          step.alias = Alias();
        }
      }
    }
  }

  // Pass 3: gather the surviving aliases, keyed by symbol.
  unordered_map<Symbol, Alias> result;

  for (unsigned i = 0, n = terminal_statuses.size(); i < n; i++) {
    const Alias &alias = terminal_statuses[i].alias;
    if (alias.value.empty()) continue;
    result[Symbol::terminal(i)] = alias;
  }

  for (unsigned i = 0, n = non_terminal_statuses.size(); i < n; i++) {
    const Alias &alias = non_terminal_statuses[i].alias;
    if (alias.value.empty()) continue;
    result[Symbol::non_terminal(i)] = alias;
  }

  for (unsigned i = 0, n = external_statuses.size(); i < n; i++) {
    const Alias &alias = external_statuses[i].alias;
    if (alias.value.empty()) continue;
    result[Symbol::external(i)] = alias;
  }

  return result;
}
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -0,0 +1,21 @@
#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_
#include "compiler/rules/symbol.h"
#include "compiler/rules/metadata.h"
#include <unordered_map>
namespace tree_sitter {
struct SyntaxGrammar;
struct LexicalGrammar;
namespace prepare_grammar {
// Finds the symbols that always have the same alias applied to them wherever
// they appear in the syntax grammar's productions. Those aliases are removed
// from the production steps of the given syntax grammar, and the returned map
// associates each such symbol with its alias.
std::unordered_map<rules::Symbol, rules::Alias>
extract_simple_aliases(SyntaxGrammar *, LexicalGrammar *);
} // namespace prepare_grammar
} // namespace tree_sitter
#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_SIMPLE_ALIASES_H_

View file

@ -2,69 +2,64 @@
#include "compiler/prepare_grammar/expand_repeats.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/extract_simple_aliases.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "compiler/prepare_grammar/flatten_grammar.h"
#include "compiler/prepare_grammar/normalize_rules.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {
using std::tuple;
using std::get;
using std::make_tuple;
using std::move;
tuple<SyntaxGrammar, LexicalGrammar, CompileError> prepare_grammar(
const InputGrammar &input_grammar) {
/*
* Convert all string-based `NamedSymbols` into numerical `Symbols`
*/
PrepareGrammarResult prepare_grammar(const InputGrammar &input_grammar) {
PrepareGrammarResult result;
// Convert all string-based `NamedSymbols` into numerical `Symbols`
auto intern_result = intern_symbols(input_grammar);
CompileError error = intern_result.second;
if (error.type)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
if (error.type) {
result.error = error;
return result;
}
/*
* Separate grammar into lexical and syntactic components
*/
// Separate grammar into lexical and syntactic components
auto extract_result = extract_tokens(intern_result.first);
error = get<2>(extract_result);
if (error.type) {
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
result.error = error;
return result;
}
/*
* Replace `Repeat` rules with pairs of recursive rules
*/
// Replace `Repeat` rules with pairs of recursive rules
InitialSyntaxGrammar syntax_grammar1 = expand_repeats(get<0>(extract_result));
/*
* Expand `String` and `Pattern` rules into full rule trees
*/
LexicalGrammar lex_grammar = get<1>(extract_result);
// auto expand_tokens_result = expand_tokens(get<1>(extract_result));
// LexicalGrammar lex_grammar = expand_tokens_result.first;
// error = expand_tokens_result.second;
// if (error.type)
// return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
/*
* Flatten syntax rules into lists of productions.
*/
// Flatten syntax rules into lists of productions.
auto flatten_result = flatten_grammar(syntax_grammar1);
SyntaxGrammar syntax_grammar = flatten_result.first;
error = flatten_result.second;
if (error.type)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
if (error.type) {
result.error = error;
return result;
}
/*
* Ensure all lexical rules are in a consistent format.
*/
lex_grammar = normalize_rules(lex_grammar);
// Ensure all lexical rules are in a consistent format.
LexicalGrammar lexical_grammar = normalize_rules(get<1>(extract_result));
return make_tuple(syntax_grammar, lex_grammar, CompileError::none());
// Find any symbols that always have the same alias applied to them.
// Remove those aliases since they can be applied in a simpler way.
auto simple_aliases = extract_simple_aliases(&syntax_grammar, &lexical_grammar);
return {
move(syntax_grammar),
move(lexical_grammar),
move(simple_aliases),
CompileError::none(),
};
}
} // namespace prepare_grammar

View file

@ -1,7 +1,7 @@
#ifndef COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_
#define COMPILER_PREPARE_GRAMMAR_PREPARE_GRAMMAR_H_
#include <tuple>
#include <unordered_map>
#include "compiler/grammar.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
@ -10,7 +10,14 @@
namespace tree_sitter {
namespace prepare_grammar {
std::tuple<SyntaxGrammar, LexicalGrammar, CompileError> prepare_grammar(const InputGrammar &);
// The value returned by `prepare_grammar`. When a preparation step fails, only
// `error` is set and the remaining fields are left default-constructed.
struct PrepareGrammarResult {
// The syntactic part of the prepared grammar.
SyntaxGrammar syntax_grammar;
// The lexical part of the prepared grammar.
LexicalGrammar lexical_grammar;
// Symbols that always have the same alias applied to them, mapped to that alias.
std::unordered_map<rules::Symbol, rules::Alias> simple_aliases;
// The failure reported by a preparation step, or `CompileError::none()` on success.
CompileError error;
};
PrepareGrammarResult prepare_grammar(const InputGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -91,7 +91,7 @@ struct Rule {
}
template <typename ...FunctionTypes>
inline auto match(FunctionTypes && ...functions) const -> decltype(accept(util::make_visitor(std::forward<FunctionTypes>(functions)...))){
inline auto match(FunctionTypes && ...functions) const -> decltype(accept(util::make_visitor(std::forward<FunctionTypes>(functions)...))) {
return accept(util::make_visitor(std::forward<FunctionTypes>(functions)...));
}

View file

@ -36,6 +36,7 @@
#define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol)
static const unsigned MAX_VERSION_COUNT = 6;
static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4;
static const unsigned MAX_SUMMARY_DEPTH = 16;
static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE;
@ -633,7 +634,7 @@ static StackVersion ts_parser__reduce(TSParser *self, StackVersion version, TSSy
// Error recovery can sometimes cause lots of stack versions to merge,
// such that a single pop operation can produce a lot of slices.
// Avoid creating too many stack versions in that situation.
if (i > 0 && slice_version > MAX_VERSION_COUNT) {
if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) {
ts_stack_remove_version(self->stack, slice_version);
ts_subtree_array_delete(&self->tree_pool, &slice.subtrees);
removed_version_count++;

View file

@ -2,11 +2,12 @@
Aliases on rules that are unit reductions
==========================================
one two three;
one two three four;
---
(statement
(identifier)
(b_prime (identifier))
(c_prime (identifier)))
(c_prime (identifier))
(identifier))

View file

@ -10,6 +10,9 @@
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_a"},
// The `_b` rule is always aliased to `b_prime`, so it is internally treated
// as a simple alias.
{
"type": "ALIAS",
"named": true,
@ -19,6 +22,9 @@
"name": "_b"
}
},
// The `_c` rule is used without an alias in addition to being aliased to `c_prime`,
// so it is not a simple alias.
{
"type": "ALIAS",
"named": true,
@ -28,6 +34,11 @@
"name": "_c"
}
},
{
"type": "SYMBOL",
"name": "_c"
},
{
"type": "STRING",
"value": ";"
@ -57,7 +68,7 @@
"_c": {
"type": "SYMBOL",
"name": "_B"
"name": "_C"
},
"_C": {