Add a way to automatically inline rules

This commit is contained in:
Max Brunsfeld 2017-07-11 21:17:27 -07:00
parent 26a25278cd
commit 65bf1389e1
15 changed files with 219 additions and 14 deletions

View file

@ -156,14 +156,14 @@ struct hash<ParseItem> {
if (item.is_done()) {
if (!item.production->empty()) {
hash_combine(&result, item.production->back().precedence);
hash_combine(&result, item.production->back().associativity);
hash_combine<unsigned>(&result, item.production->back().associativity);
}
} else {
for (size_t i = 0, n = item.production->size(); i < n; i++) {
auto &step = item.production->at(i);
hash_combine(&result, step.symbol);
hash_combine(&result, step.precedence);
hash_combine(&result, step.associativity);
hash_combine<unsigned>(&result, step.associativity);
}
}
return result;

View file

@ -1,4 +1,5 @@
#include "compiler/build_tables/parse_item_set_builder.h"
#include <algorithm>
#include <cassert>
#include <set>
#include <unordered_map>
@ -11,8 +12,10 @@
namespace tree_sitter {
namespace build_tables {
using std::move;
using std::vector;
using std::set;
using std::find;
using std::get;
using std::pair;
using std::tuple;
@ -21,8 +24,36 @@ using std::make_tuple;
using rules::Symbol;
using rules::NONE;
// Builds the productions obtained by replacing the symbol at `item`'s current
// step with each production of that symbol's variable in `grammar`.
//
// Every resulting production is made of:
//   1. the steps of `item`'s production that precede the current step,
//   2. the steps of one production of the variable being inlined,
//   3. the steps of `item`'s production that follow the current step,
// and keeps the dynamic precedence of `item`'s production.
//
// Productions that compare equal to one already collected are dropped, so the
// result contains no duplicates and keeps first-seen order.
static vector<Production> inline_production(const ParseItem &item, const SyntaxGrammar &grammar) {
  vector<Production> result;

  // These iterators depend only on `item`, so compute them once rather than
  // once per inlined production.
  auto begin = item.production->steps.begin();
  auto end = item.production->steps.end();
  auto step = begin + item.step_index;

  for (const Production &production_to_insert : grammar.variables[item.next_symbol().index].productions) {
    // Start from the steps that precede the symbol being inlined.
    Production production{{begin, step}, item.production->dynamic_precedence};

    // Splice in the body of the inlined variable's production.
    production.steps.insert(
      production.steps.end(),
      production_to_insert.steps.begin(),
      production_to_insert.steps.end()
    );

    // The last inserted step takes over the precedence and associativity that
    // `item` reports. Skip this when nothing was inserted: the last step would
    // then be the one *before* the inlined symbol (or not exist at all when
    // the current step is the first one), and must not be modified.
    if (!production_to_insert.steps.empty()) {
      production.back().precedence = item.precedence();
      production.back().associativity = item.associativity();
    }

    // Append the steps that follow the symbol being inlined.
    production.steps.insert(
      production.steps.end(),
      step + 1,
      end
    );

    if (find(result.begin(), result.end(), production) == result.end()) {
      result.push_back(move(production));
    }
  }

  return result;
}
ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
const LexicalGrammar &lexical_grammar) {
const LexicalGrammar &lexical_grammar) : grammar{grammar} {
vector<Symbol> symbols_to_process;
set<Symbol::Index> processed_non_terminals;
@ -145,24 +176,56 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar,
for (auto &pair : cached_lookaheads_by_non_terminal) {
for (const Production &production : grammar.variables[pair.first].productions) {
component_cache[i].push_back({
ParseItem(Symbol::non_terminal(pair.first), production, 0),
pair.second.first,
pair.second.second
});
Symbol lhs = Symbol::non_terminal(pair.first);
ParseItem item(lhs, production, 0);
if (grammar.variables_to_inline.count(item.next_symbol())) {
vector<Production> &inlined_productions = inlined_productions_by_original_production[item];
if (inlined_productions.empty()) {
inlined_productions = inline_production(item, grammar);
}
for (const Production &inlined_production : inlined_productions) {
ParseItemSetComponent component{
ParseItem(lhs, inlined_production, 0),
pair.second.first,
pair.second.second
};
if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {
component_cache[i].push_back(component);
}
}
} else if (!grammar.variables_to_inline.count(lhs)) {
ParseItemSetComponent component{
ParseItem(lhs, production, 0),
pair.second.first,
pair.second.second
};
if (find(component_cache[i].begin(), component_cache[i].end(), component) == component_cache[i].end()) {
component_cache[i].push_back(component);
}
}
}
}
}
}
void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
for (const auto &pair : item_set->entries) {
const ParseItem &item = pair.first;
const LookaheadSet &lookaheads = pair.second;
if (item.lhs() != rules::START() && item.step_index == 0) continue;
for (auto iter = item_set->entries.begin(), end = item_set->entries.end(); iter != end;) {
const ParseItem &item = iter->first;
const LookaheadSet &lookaheads = iter->second;
if (item.lhs() != rules::START() && item.step_index == 0) {
++iter;
continue;
}
const Symbol &next_symbol = item.next_symbol();
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) continue;
if (!next_symbol.is_non_terminal() || next_symbol.is_built_in()) {
++iter;
continue;
}
LookaheadSet next_lookaheads;
size_t next_step = item.step_index + 1;
@ -178,6 +241,24 @@ void ParseItemSetBuilder::apply_transitive_closure(ParseItemSet *item_set) {
current_lookaheads.insert_all(component.lookaheads);
if (component.propagates_lookaheads) current_lookaheads.insert_all(next_lookaheads);
}
if (grammar.variables_to_inline.count(next_symbol)) {
vector<Production> &inlined_productions = inlined_productions_by_original_production[item];
if (inlined_productions.empty()) {
inlined_productions = inline_production(item, grammar);
}
for (const Production &inlined_production : inlined_productions) {
item_set->entries.insert({
ParseItem(item.lhs(), inlined_production, item.step_index),
lookaheads
});
}
iter = item_set->entries.erase(iter);
} else {
++iter;
}
}
}

View file

@ -4,6 +4,7 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/rule.h"
#include <map>
#include <vector>
namespace tree_sitter {
@ -17,11 +18,19 @@ class ParseItemSetBuilder {
ParseItem item;
LookaheadSet lookaheads;
bool propagates_lookaheads;
inline bool operator==(const ParseItemSetComponent &other) {
return item == other.item &&
lookaheads == other.lookaheads &&
propagates_lookaheads == other.propagates_lookaheads;
}
};
const SyntaxGrammar &grammar;
std::map<rules::Symbol, LookaheadSet> first_sets;
std::map<rules::Symbol, LookaheadSet> last_sets;
std::map<rules::Symbol::Index, std::vector<ParseItemSetComponent>> component_cache;
std::map<ParseItem, std::vector<Production>> inlined_productions_by_original_production;
public:
ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &);

View file

@ -31,6 +31,7 @@ struct InputGrammar {
std::vector<rules::Rule> extra_tokens;
std::vector<std::unordered_set<rules::NamedSymbol>> expected_conflicts;
std::vector<Variable> external_tokens;
std::unordered_set<rules::NamedSymbol> variables_to_inline;
};
} // namespace tree_sitter

View file

@ -205,7 +205,7 @@ ParseGrammarResult parse_grammar(const string &input) {
string error_message;
string name;
InputGrammar grammar;
json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json;
json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json, inline_rules_json;
json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
char parse_error[json_error_max];
@ -299,6 +299,24 @@ ParseGrammarResult parse_grammar(const string &input) {
}
}
inline_rules_json = grammar_json->operator[]("inline");
if (inline_rules_json.type != json_none) {
if (inline_rules_json.type != json_array) {
error_message = "Inline rules must be an array";
goto error;
}
for (size_t i = 0, length = inline_rules_json.u.array.length; i < length; i++) {
json_value *inline_rule_json = inline_rules_json.u.array.values[i];
if (inline_rule_json->type != json_string) {
error_message = "Inline rules must be an array of rule names";
goto error;
}
grammar.variables_to_inline.insert(rules::NamedSymbol{string(inline_rule_json->u.string.ptr)});
}
}
external_tokens_json = grammar_json->operator[]("externals");
if (external_tokens_json.type != json_none) {
if (external_tokens_json.type != json_array) {

View file

@ -2,6 +2,7 @@
#define COMPILER_GRAMMAR_JSON_H_
#include <string>
#include <unordered_set>
#include "tree_sitter/compiler.h"
#include "compiler/grammar.h"

View file

@ -94,6 +94,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
result.extra_tokens = grammar.extra_tokens;
result.expected_conflicts = grammar.expected_conflicts;
result.external_tokens = grammar.external_tokens;
result.variables_to_inline = grammar.variables_to_inline;
ExpandRepeats expander(result.variables.size());
for (auto &variable : result.variables) {

View file

@ -235,6 +235,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
syntax_grammar.expected_conflicts.insert(new_conflict_set);
}
for (const Symbol &symbol : grammar.variables_to_inline) {
syntax_grammar.variables_to_inline.insert(symbol_replacer.replace_symbol(symbol));
}
// The grammar's extra tokens can be either token rules or symbols
// pointing to token rules. If they are symbols, then they'll be handled by
// the parser; add them to the syntax grammar's extra tokens. If they

View file

@ -111,6 +111,7 @@ SyntaxVariable flatten_rule(const Variable &variable) {
pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &grammar) {
SyntaxGrammar result;
result.external_tokens = grammar.external_tokens;
result.variables_to_inline = grammar.variables_to_inline;
for (const auto &expected_conflict : grammar.expected_conflicts) {
result.expected_conflicts.insert({

View file

@ -16,6 +16,7 @@ struct InitialSyntaxGrammar {
std::set<rules::Symbol> extra_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;
std::vector<ExternalToken> external_tokens;
std::set<rules::Symbol> variables_to_inline;
};
} // namespace prepare_grammar

View file

@ -142,6 +142,13 @@ pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar)
result.expected_conflicts.insert(entry);
}
for (auto &named_symbol : grammar.variables_to_inline) {
auto symbol = interner.intern_symbol(named_symbol);
if (symbol != rules::NONE()) {
result.variables_to_inline.insert(symbol);
}
}
return {result, CompileError::none()};
}

View file

@ -15,6 +15,7 @@ struct InternedGrammar {
std::vector<rules::Rule> extra_tokens;
std::set<std::set<rules::Symbol>> expected_conflicts;
std::vector<Variable> external_tokens;
std::set<rules::Symbol> variables_to_inline;
};
} // namespace prepare_grammar

View file

@ -74,6 +74,7 @@ struct SyntaxGrammar {
std::set<rules::Symbol> extra_tokens;
std::set<ConflictSet> expected_conflicts;
std::vector<ExternalToken> external_tokens;
std::set<rules::Symbol> variables_to_inline;
};
} // namespace tree_sitter

View file

@ -0,0 +1,11 @@
==================================
Expressions
==================================
1 + 2 * 3;
---
(statement (sum
(number)
(product (number) (number))))

View file

@ -0,0 +1,68 @@
{
"name": "inline_rules",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"inline": [
"expression"
],
"rules": {
"statement": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ";"}
]
},
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "parenthesized_expression"}
]
},
"parenthesized_expression": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
},
"sum": {
"type": "PREC_LEFT",
"value": 0,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
"number": {"type": "PATTERN", "value": "\\d+"}
}
}