Introduce 'ubiquitous_tokens' concept, for parsing comments and such

This commit is contained in:
Max Brunsfeld 2014-05-06 12:54:04 -07:00
parent b010e1667e
commit 4700e33746
24 changed files with 18321 additions and 19057 deletions

View file

@ -30,7 +30,7 @@ namespace tree_sitter {
LexConflictManager conflict_manager;
unordered_map<const LexItemSet, LexStateId> lex_state_ids;
LexTable lex_table;
LexItemSet build_lex_item_set(const set<Symbol> &symbols) {
LexItemSet result;
for (const auto &symbol : symbols) {
@ -43,7 +43,7 @@ namespace tree_sitter {
}
return result;
}
LexStateId add_lex_state(const LexItemSet &item_set) {
auto pair = lex_state_ids.find(item_set);
if (pair == lex_state_ids.end()) {
@ -57,7 +57,7 @@ namespace tree_sitter {
return pair->second;
}
}
void add_error_lex_state() {
LexItemSet item_set = build_lex_item_set(parse_table->symbols);
add_advance_actions(item_set, LexTable::ERROR_STATE_ID);
@ -84,7 +84,7 @@ namespace tree_sitter {
}
}
}
void add_token_start(const LexItemSet &item_set, LexStateId state_id) {
for (const auto &item : item_set)
if (item.is_token_start())

View file

@ -28,7 +28,7 @@ namespace tree_sitter {
unordered_map<const ParseItemSet, ParseStateId> parse_state_ids;
SymTransitions sym_transitions;
ParseTable parse_table;
ParseStateId add_parse_state(const ParseItemSet &item_set) {
auto pair = parse_state_ids.find(item_set);
if (pair == parse_state_ids.end()) {
@ -46,18 +46,24 @@ namespace tree_sitter {
for (const auto &transition : sym_transitions(item_set, grammar)) {
const Symbol &symbol = transition.first;
const ParseItemSet &item_set = transition.second;
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(symbol);
auto &actions = parse_table.states[state_id].actions;
auto current_action = actions.find(symbol);
set<int> precedence_values = precedence_values_for_item_set(item_set);
if (current_action == current_actions.end() ||
if (current_action == actions.end() ||
conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) {
ParseStateId new_state_id = add_parse_state(item_set);
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values));
}
}
for (const Symbol &symbol : grammar.options.ubiquitous_tokens) {
auto &actions = parse_table.states[state_id].actions;
if (actions.find(symbol) == actions.end())
parse_table.add_action(state_id, symbol, ParseAction::Shift(state_id, { 0 }));
}
}
void add_reduce_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const ParseItem &item : item_set) {
if (item.is_done()) {
@ -66,7 +72,7 @@ namespace tree_sitter {
ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence());
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(item.lookahead_sym);
if (current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(item.lookahead_sym, current_action->second, action)) {
parse_table.add_action(state_id, item.lookahead_sym, action);
@ -74,7 +80,7 @@ namespace tree_sitter {
}
}
}
set<int> precedence_values_for_item_set(const ParseItemSet &item_set) {
set<int> result;
for (const auto &item : item_set)

View file

@ -75,6 +75,7 @@ namespace tree_sitter {
state_and_symbol_counts(),
symbol_enum(),
symbol_names_list(),
ubiquitous_symbols_list(),
hidden_symbols_list(),
lex_function(),
lex_states_list(),
@ -221,6 +222,13 @@ namespace tree_sitter {
return result + "};";
}
// Renders the UBIQUITOUS_SYMBOLS table: one indented "[<symbol id>] = 1,"
// entry per ubiquitous token of the syntax grammar, wrapped in braces.
string ubiquitous_symbols_list() {
    string table = "UBIQUITOUS_SYMBOLS = {\n";
    for (auto &token : syntax_grammar.options.ubiquitous_tokens) {
        const string entry = "[" + symbol_id(token) + "] = 1,";
        table += indent(entry);
        table += "\n";
    }
    table += "};";
    return table;
}
string hidden_symbols_list() {
string result = "HIDDEN_SYMBOLS = {\n";
for (auto &symbol : parse_table.symbols)

View file

@ -7,7 +7,12 @@ namespace tree_sitter {
using rules::rule_ptr;
Grammar::Grammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules) :
rules(rules) {}
rules(rules),
options({}) {}
// Constructs a grammar from a list of (name, rule) pairs together with an
// explicit options value; both arguments are copied into the members of the
// same name.
Grammar::Grammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules, GrammarOptions options) :
rules(rules),
options(options) {}
bool Grammar::operator==(const Grammar &other) const {
if (other.rules.size() != rules.size()) return false;

View file

@ -59,7 +59,7 @@ namespace tree_sitter {
aux_rules.insert(aux_rules.end(), expander.aux_rules.begin(), expander.aux_rules.end());
}
return PreparedGrammar(rules, aux_rules);
return PreparedGrammar(rules, aux_rules, grammar.options);
}
}
}

View file

@ -46,16 +46,20 @@ namespace tree_sitter {
}
rule_ptr apply_to(const Symbol *rule) {
auto replacement_pair = replacements.find(*rule);
if (replacement_pair != replacements.end())
return replacement_pair->second.copy();
else if (rule->is_built_in())
return rule->copy();
else
return make_shared<Symbol>(new_index_for_symbol(*rule), rule->options);
return replace_symbol(*rule).copy();
}
public:
// Maps a symbol to the symbol that should stand in for it:
//   1. an explicit entry in `replacements` wins,
//   2. built-in symbols are returned unchanged,
//   3. anything else is rebuilt with the index from new_index_for_symbol()
//      and the original symbol's options.
Symbol replace_symbol(const Symbol &rule) {
    auto found = replacements.find(rule);
    if (found != replacements.end())
        return found->second;

    if (rule.is_built_in())
        return rule;

    return Symbol(new_index_for_symbol(rule), rule.options);
}
// Stores the symbol replacement map used by replace_symbol().
// NOTE(review): rule_count and aux_rule_count are accepted but are not
// referenced by the initializer list or the (empty) body shown here —
// confirm whether they are still needed.
SymbolInliner(const map<Symbol, Symbol> &replacements, size_t rule_count, size_t aux_rule_count) :
replacements(replacements)
{}
@ -131,13 +135,21 @@ namespace tree_sitter {
aux_tokens.insert(aux_tokens.end(), extractor.tokens.begin(), extractor.tokens.end());
SymbolInliner inliner(symbol_replacements, input_grammar.rules.size(), input_grammar.aux_rules.size());
vector<Symbol> ubiquitous_tokens;
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &pair : aux_rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.options.ubiquitous_tokens) {
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
}
PreparedGrammarOptions parse_options(input_grammar.options);
parse_options.ubiquitous_tokens = ubiquitous_tokens;
return {
PreparedGrammar(rules, aux_rules),
PreparedGrammar(rules, aux_rules, parse_options),
PreparedGrammar(tokens, aux_tokens)
};
}

View file

@ -19,19 +19,32 @@ namespace tree_sitter {
using rules::IdentityRuleFn::apply_to;
rule_ptr apply_to(const rules::NamedSymbol *rule) {
for (size_t i = 0; i < grammar.rules.size(); i++)
if (grammar.rules[i].first == rule->name)
return make_shared<rules::Symbol>(i);
missing_rule_name = rule->name;
return rule_ptr();
auto result = symbol_for_rule_name(rule->name);
if (!result.get()) missing_rule_name = rule->name;
return result;
}
public:
// Looks up `rule_name` among the grammar's rules and returns a Symbol holding
// the index of the first rule with that name, or a null pointer when no rule
// matches.
std::shared_ptr<rules::Symbol> symbol_for_rule_name(string rule_name) {
    const size_t rule_count = grammar.rules.size();
    for (size_t index = 0; index != rule_count; ++index) {
        if (grammar.rules[index].first == rule_name)
            return make_shared<rules::Symbol>(index);
    }
    return nullptr;
}
// Builds an interner for `grammar`; the `grammar` member is initialized from
// the argument.
explicit InternSymbols(const Grammar &grammar) : grammar(grammar) {}
const Grammar grammar;
string missing_rule_name;
};
// Builds the result returned when a rule name cannot be resolved: an empty
// PreparedGrammar paired with a heap-allocated GrammarError of type
// GrammarErrorTypeUndefinedSymbol whose message names the missing rule.
// The error is created with `new`; nothing in this function releases it.
pair<PreparedGrammar, const GrammarError *> missing_rule_error(string rule_name) {
    string message("Undefined rule '");
    message += rule_name;
    message += "'";
    return {
        PreparedGrammar({}, {}),
        new GrammarError(GrammarErrorTypeUndefinedSymbol, message)
    };
}
pair<PreparedGrammar, const GrammarError *> intern_symbols(const Grammar &grammar) {
InternSymbols interner(grammar);
vector<pair<string, rule_ptr>> rules;
@ -39,15 +52,22 @@ namespace tree_sitter {
for (auto &pair : grammar.rules) {
auto new_rule = interner.apply(pair.second);
if (!interner.missing_rule_name.empty())
return {
PreparedGrammar({}, {}),
new GrammarError(GrammarErrorTypeUndefinedSymbol,
"Undefined rule '" + interner.missing_rule_name + "'")
};
return missing_rule_error(interner.missing_rule_name);
rules.push_back({ pair.first, new_rule });
}
return { PreparedGrammar(rules, {}), nullptr };
vector<rules::Symbol> ubiquitous_tokens;
for (auto &name : grammar.options.ubiquitous_tokens) {
auto token = interner.symbol_for_rule_name(name);
if (!token.get())
return missing_rule_error(name);
ubiquitous_tokens.push_back(*token);
}
return {
PreparedGrammar(rules, {}, PreparedGrammarOptions({ ubiquitous_tokens })),
nullptr
};
}
}
}

View file

@ -14,7 +14,15 @@ namespace tree_sitter {
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules) :
Grammar(rules),
aux_rules(aux_rules) {}
aux_rules(aux_rules),
options({}) {}
// Constructs a prepared grammar from its rules, its auxiliary rules and an
// explicit set of prepared options. The base class is initialized through
// Grammar(rules); aux_rules and options are copied into this class's members.
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
PreparedGrammarOptions options) :
Grammar(rules),
aux_rules(aux_rules),
options(options) {}
const rule_ptr & PreparedGrammar::rule(const Symbol &symbol) const {
return symbol.is_auxiliary() ?

View file

@ -8,16 +8,24 @@
#include "compiler/rules/symbol.h"
namespace tree_sitter {
// Options attached to a PreparedGrammar.
struct PreparedGrammarOptions {
// Ubiquitous tokens, stored as symbols.
std::vector<rules::Symbol> ubiquitous_tokens;
};
// A Grammar extended with a list of auxiliary rules and a set of prepared
// options.
class PreparedGrammar : public Grammar {
public:
// Builds a prepared grammar from rules and auxiliary rules.
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
// Same as above, with an explicit options value.
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
PreparedGrammarOptions options);
bool operator==(const PreparedGrammar &other) const;
// Accessors keyed by symbol: the rule's name and the rule itself.
const std::string & rule_name(const rules::Symbol &symbol) const;
const rules::rule_ptr & rule(const rules::Symbol &symbol) const;
const std::vector<std::pair<std::string, rules::rule_ptr>> aux_rules;
// NOTE(review): Grammar's constructors also initialize a member named
// `options`; if that member is visible here, this declaration hides it —
// confirm that is intended.
const PreparedGrammarOptions options;
};
std::ostream& operator<<(std::ostream &stream, const PreparedGrammar &grammar);

View file

@ -57,51 +57,70 @@ size_t ts_stack_right_position(const ts_stack *stack) {
return result;
}
// Reduces the entries on top of `stack` into a single node for `symbol`,
// built with ts_tree_make_node(), then shrinks the stack and returns the node.
// This span contains two signatures and two sets of body lines (with and
// without the ubiquitous_symbol_flags parameter); the notes below refer to
// the variant that takes ubiquitous_symbol_flags.
//
// hidden_symbol_flags and ubiquitous_symbol_flags are indexed by the symbol
// of each stack entry's node.
//
// NOTE(review): collapse_flags is a function-local static array of 100 ints
// and is written at index i for every i below immediate_child_count; no
// check in this function keeps i below 100.
// NOTE(review): immediate_child_count is incremented inside the first loop
// for every ubiquitous symbol, and entries are read at stack->size - 1 - i;
// no check in this function keeps i below stack->size.
// NOTE(review): the result of malloc() is used without a null check.
ts_tree * ts_stack_reduce(ts_stack *stack, ts_symbol symbol, int immediate_child_count, const int *hidden_symbol_flags) {
size_t new_stack_size = stack->size - immediate_child_count;
int flags[immediate_child_count];
ts_tree * ts_stack_reduce(ts_stack *stack,
ts_symbol symbol,
int immediate_child_count,
const int *hidden_symbol_flags,
const int *ubiquitous_symbol_flags) {
// First, walk down the stack to determine which symbols will be reduced.
// The child node count is known ahead of time, but some of the
// nodes at the top of the stack might be hidden nodes, in which
// case we 'collapse' them. Some may also be ubiquitous tokens,
// which don't count towards the child node count.
static int collapse_flags[100];
int child_count = 0;
for (int i = 0; i < immediate_child_count; i++) {
ts_tree *child = stack->entries[new_stack_size + i].node;
ts_tree *child = stack->entries[stack->size - 1 - i].node;
size_t grandchild_count;
ts_tree **grandchildren = ts_tree_children(child, &grandchild_count);
flags[i] = (
hidden_symbol_flags[ts_tree_symbol(child)] ||
// This local `symbol` shadows the function parameter of the same name
// for the rest of the loop body.
ts_symbol symbol = ts_tree_symbol(child);
// A ubiquitous token extends the number of stack entries to consume by one.
if (ubiquitous_symbol_flags[symbol])
immediate_child_count++;
// A child is collapsed when its symbol is hidden, or when it has exactly
// one child whose size equals its own.
collapse_flags[i] = (
hidden_symbol_flags[symbol] ||
(grandchild_count == 1 && ts_tree_size(child) == ts_tree_size(grandchildren[0]))
);
child_count += (flags[i]) ? grandchild_count : 1;
child_count += (collapse_flags[i]) ? grandchild_count : 1;
}
size_t child_index = 0;
// Walk down the stack again, building up the array of children.
// Though we collapse the hidden child nodes, we also need to
// keep track of the actual immediate children so that we can
// later collapse the stack again when the document is edited.
// We store the children and immediate children in the same array,
// to reduce allocations.
size_t size = 0, offset = 0;
size_t child_index = child_count;
ts_tree **children = malloc((child_count + immediate_child_count) * sizeof(ts_tree *));
ts_tree **immediate_children = children + child_count;
for (int i = 0; i < immediate_child_count; i++) {
ts_tree *child = stack->entries[new_stack_size + i].node;
immediate_children[i] = child;
ts_tree *child = stack->entries[stack->size - 1 - i].node;
// Entries are visited from the top of the stack downwards, so both arrays
// are filled from the back towards the front.
immediate_children[immediate_child_count - 1 - i] = child;
if (i == 0) {
offset = ts_tree_offset(child);
size = ts_tree_size(child);
if (collapse_flags[i]) {
size_t grandchild_count;
ts_tree **grandchildren = ts_tree_children(child, &grandchild_count);
child_index -= grandchild_count;
memcpy(children + child_index, grandchildren, (grandchild_count * sizeof(ts_tree *)));
} else {
size += ts_tree_offset(child) + ts_tree_size(child);
child_index--;
children[child_index] = child;
}
if (flags[i]) {
size_t grandchild_count;
ts_tree ** grandchildren = ts_tree_children(child, &grandchild_count);
memcpy(children + child_index, grandchildren, (grandchild_count * sizeof(ts_tree *)));
child_index += grandchild_count;
// child_index reaching 0 means the front of the children array has been
// filled: that entry's offset goes into `offset` instead of `size`.
if (child_index == 0) {
offset += ts_tree_offset(child);
size += ts_tree_size(child);
} else {
children[child_index] = child;
child_index++;
size += ts_tree_offset(child) + ts_tree_size(child);
}
}
ts_tree *lookahead = ts_tree_make_node(symbol, child_count, immediate_child_count, children, size, offset);
ts_stack_shrink(stack, new_stack_size);
ts_stack_shrink(stack, stack->size - immediate_child_count);
return lookahead;
}