Fix handling of ubiquitous tokens used in grammar rules

This commit is contained in:
Max Brunsfeld 2014-07-01 20:47:35 -07:00
parent 59cc65c2e3
commit 83a1b9439e
30 changed files with 39086 additions and 32890 deletions

View file

@ -11,12 +11,16 @@
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/item_set_closure.h"
#include "compiler/build_tables/item_set_transitions.h"
#include "compiler/build_tables/first_set.h"
#include <iostream>
namespace tree_sitter {
using std::pair;
using std::string;
using std::vector;
using std::set;
using std::map;
using std::unordered_map;
using std::make_shared;
using rules::Symbol;
@ -33,8 +37,8 @@ namespace tree_sitter {
if (pair == parse_state_ids.end()) {
ParseStateId state_id = parse_table.add_state();
parse_state_ids[item_set] = state_id;
add_shift_actions(item_set, state_id);
add_reduce_actions(item_set, state_id);
add_shift_actions(item_set, state_id);
add_ubiquitous_token_actions(item_set, state_id);
return state_id;
} else {
@ -43,26 +47,34 @@ namespace tree_sitter {
}
void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) {
map<Symbol, size_t> shifts;
for (const auto &transition : sym_transitions(item_set, grammar)) {
const Symbol &symbol = transition.first;
const ParseItemSet &next_item_set = transition.second;
auto &actions = parse_table.states[state_id].actions;
auto current_action = actions.find(symbol);
set<int> precedence_values = precedence_values_for_item_set(next_item_set);
if (current_action == actions.end() ||
conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) {
ParseAction new_action = ParseAction::Shift(0, precedence_values_for_item_set(next_item_set));
if (should_add_action(state_id, symbol, new_action)) {
ParseStateId new_state_id = add_parse_state(next_item_set);
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values));
new_action.state_index = new_state_id;
parse_table.add_action(state_id, symbol, new_action);
shifts.insert({ symbol, new_state_id });
}
}
}
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const Symbol &symbol : grammar.ubiquitous_tokens) {
auto &actions = parse_table.states[state_id].actions;
if (actions.find(symbol) == actions.end())
parse_table.add_action(state_id, symbol, ParseAction::ShiftExtra());
for (auto &pair : shifts) {
const Symbol &shift_symbol = pair.first;
size_t new_state_id = pair.second;
if (grammar.ubiquitous_tokens.find(shift_symbol) != grammar.ubiquitous_tokens.end()) {
for (const auto &pair : parse_table.states[state_id].actions) {
const Symbol &lookahead_sym = pair.first;
ParseAction action = ParseAction::ReduceExtra(shift_symbol);
if (should_add_action(new_state_id, lookahead_sym, action))
parse_table.add_action(new_state_id, lookahead_sym, action);
}
}
}
}
@ -75,19 +87,39 @@ namespace tree_sitter {
ParseAction action = (item.lhs == rules::START()) ?
ParseAction::Accept() :
ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence());
for (auto &lookahead_sym : lookahead_symbols) {
auto current_actions = parse_table.states[state_id].actions;
auto current_action = current_actions.find(lookahead_sym);
if (current_action == current_actions.end() ||
conflict_manager.resolve_parse_action(lookahead_sym, current_action->second, action)) {
for (auto &lookahead_sym : lookahead_symbols)
if (should_add_action(state_id, lookahead_sym, action))
parse_table.add_action(state_id, lookahead_sym, action);
}
}
}
}
}
// For every ubiquitous token of the grammar, registers a ShiftExtra action in
// state `state_id` unless that state already has an action for the token.
// `item_set` is not consulted by this function.
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
    for (const Symbol &token : grammar.ubiquitous_tokens) {
        const auto &existing_actions = parse_table.states[state_id].actions;
        const bool already_handled = existing_actions.count(token) > 0;
        if (!already_handled) {
            parse_table.add_action(state_id, token, ParseAction::ShiftExtra());
        }
    }
}
// Returns the union of first_set(rule, grammar) over the rule of every item
// in `item_set`.
set<Symbol> first_set_for_item_set(const ParseItemSet &item_set) {
    set<Symbol> symbols;
    for (const auto &entry : item_set) {
        const auto item_first_set = first_set(entry.first.rule, grammar);
        for (const auto &symbol : item_first_set)
            symbols.insert(symbol);
    }
    return symbols;
}
// Tells whether `action` should be written into state `state_id` for `symbol`.
//
// Returns true when the state has no action for `symbol` yet; otherwise the
// decision is delegated to conflict_manager.resolve_parse_action(), which is
// given the existing action and the candidate one.
bool should_add_action(size_t state_id, const Symbol &symbol, const ParseAction &action) {
    // Bind by reference: this is queried for every candidate action, and
    // copying the state's whole action map each time is pure overhead.
    auto &current_actions = parse_table.states[state_id].actions;
    auto current_action = current_actions.find(symbol);
    if (current_action == current_actions.end())
        return true;
    return conflict_manager.resolve_parse_action(symbol, current_action->second, action);
}
set<int> precedence_values_for_item_set(const ParseItemSet &item_set) {
set<int> result;
for (const auto &pair : item_set) {

View file

@ -290,6 +290,9 @@ namespace tree_sitter {
symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
break;
case ParseActionTypeReduceExtra:
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
break;
default:;
}
}

View file

@ -2,9 +2,10 @@
#include "compiler/rules/rule.h"
namespace tree_sitter {
using std::string;
using std::ostream;
using std::pair;
using std::set;
using std::string;
using std::vector;
using rules::rule_ptr;
@ -59,20 +60,20 @@ namespace tree_sitter {
return stream << string("#<null>");
}
const vector<string> & Grammar::ubiquitous_tokens() const {
const set<string> & Grammar::ubiquitous_tokens() const {
return ubiquitous_tokens_;
}
Grammar & Grammar::ubiquitous_tokens(const vector<string> &ubiquitous_tokens) {
Grammar & Grammar::ubiquitous_tokens(const set<string> &ubiquitous_tokens) {
ubiquitous_tokens_ = ubiquitous_tokens;
return *this;
}
const vector<char> & Grammar::separators() const {
const set<char> & Grammar::separators() const {
return separators_;
}
Grammar & Grammar::separators(const vector<char> &separators) {
Grammar & Grammar::separators(const set<char> &separators) {
separators_ = separators;
return *this;
}

View file

@ -39,7 +39,11 @@ namespace tree_sitter {
}
ParseAction ParseAction::ShiftExtra() {
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, set<int>({}));
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, { 0 });
}
// Factory for a ReduceExtra action on `symbol`.
// NOTE(review): the remaining constructor arguments (-1, 0, { 0 }) are fixed
// placeholders; presumably state index, consumed symbol count and precedence
// values. Confirm against the ParseAction constructor.
ParseAction ParseAction::ReduceExtra(Symbol symbol) {
return ParseAction(ParseActionTypeReduceExtra, -1, symbol, 0, { 0 });
}
ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, int precedence) {

View file

@ -14,6 +14,7 @@ namespace tree_sitter {
ParseActionTypeShift,
ParseActionTypeShiftExtra,
ParseActionTypeReduce,
ParseActionTypeReduceExtra,
ParseActionTypeAccept,
} ParseActionType;
@ -28,8 +29,9 @@ namespace tree_sitter {
static ParseAction Accept();
static ParseAction Error();
static ParseAction Shift(size_t state_index, std::set<int> precedence_values);
static ParseAction ShiftExtra();
static ParseAction Reduce(rules::Symbol symbol, size_t consumed_symbol_count, int precedence);
static ParseAction ShiftExtra();
static ParseAction ReduceExtra(rules::Symbol symbol);
bool operator==(const ParseAction &action) const;
ParseActionType type;

View file

@ -51,14 +51,14 @@ namespace tree_sitter {
for (auto &pair : grammar.rules) {
auto rule = expander.apply(pair.second);
if (expander.error)
return { LexicalGrammar({}, {}, {}), expander.error };
return { LexicalGrammar(), expander.error };
rules.push_back({ pair.first, rule });
}
for (auto &pair : grammar.aux_rules) {
auto rule = expander.apply(pair.second);
if (expander.error)
return { LexicalGrammar({}, {}, {}), expander.error };
return { LexicalGrammar(), expander.error };
aux_rules.push_back({ pair.first, rule });
}

View file

@ -1,6 +1,7 @@
#include "compiler/prepare_grammar/extract_tokens.h"
#include <map>
#include <vector>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
@ -18,6 +19,7 @@ namespace tree_sitter {
using std::map;
using std::to_string;
using std::vector;
using std::set;
using std::make_shared;
using rules::rule_ptr;
using rules::Symbol;
@ -96,7 +98,7 @@ namespace tree_sitter {
pair<SyntaxGrammar, LexicalGrammar> extract_tokens(const InternedGrammar &input_grammar) {
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
vector<Symbol> ubiquitous_tokens;
set<Symbol> ubiquitous_tokens;
TokenExtractor extractor;
map<Symbol, Symbol> symbol_replacements;
@ -120,7 +122,7 @@ namespace tree_sitter {
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.ubiquitous_tokens)
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
ubiquitous_tokens.insert(inliner.replace_symbol(symbol));
return {
SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),

View file

@ -1,6 +1,7 @@
#include "compiler/prepare_grammar/intern_symbols.h"
#include <memory>
#include <vector>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/prepare_grammar/interned_grammar.h"
#include "compiler/prepared_grammar.h"
@ -12,6 +13,7 @@ namespace tree_sitter {
using std::string;
using rules::rule_ptr;
using std::vector;
using std::set;
using std::pair;
using std::make_shared;
@ -58,12 +60,12 @@ namespace tree_sitter {
rules.push_back({ pair.first, new_rule });
}
vector<rules::Symbol> ubiquitous_tokens;
set<rules::Symbol> ubiquitous_tokens;
for (auto &name : grammar.ubiquitous_tokens()) {
auto token = interner.symbol_for_rule_name(name);
if (!token.get())
return missing_rule_error(name);
ubiquitous_tokens.push_back(*token);
ubiquitous_tokens.insert(*token);
}
InternedGrammar result;

View file

@ -3,6 +3,7 @@
#include <utility>
#include <vector>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
@ -12,8 +13,8 @@ namespace tree_sitter {
class InternedGrammar {
public:
std::vector<std::pair<std::string, rules::rule_ptr>> rules;
std::vector<rules::Symbol> ubiquitous_tokens;
std::vector<char> separators;
std::set<rules::Symbol> ubiquitous_tokens;
std::set<char> separators;
};
}
}

View file

@ -8,6 +8,7 @@ namespace tree_sitter {
using std::string;
using std::pair;
using std::vector;
using std::set;
const rules::rule_ptr & PreparedGrammar::rule(const rules::Symbol &symbol) const {
return symbol.is_auxiliary() ?
@ -25,6 +26,16 @@ namespace tree_sitter {
SyntaxGrammar::SyntaxGrammar() {}
LexicalGrammar::LexicalGrammar() {}
// Builds a SyntaxGrammar from rules and auxiliary rules only. Both are
// forwarded to the PreparedGrammar base; ubiquitous_tokens is not listed in
// the initializer and is therefore default-constructed.
SyntaxGrammar::SyntaxGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
PreparedGrammar(rules, aux_rules) {}
// Builds a LexicalGrammar from rules and auxiliary rules only. Both are
// forwarded to the PreparedGrammar base; separators is not listed in the
// initializer and is therefore default-constructed.
LexicalGrammar::LexicalGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
PreparedGrammar(rules, aux_rules) {}
PreparedGrammar::PreparedGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
@ -34,14 +45,14 @@ namespace tree_sitter {
SyntaxGrammar::SyntaxGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules,
const vector<rules::Symbol> &ubiquitous_tokens) :
const set<rules::Symbol> &ubiquitous_tokens) :
PreparedGrammar(rules, aux_rules),
ubiquitous_tokens(ubiquitous_tokens) {}
LexicalGrammar::LexicalGrammar(
const vector<pair<string, rules::rule_ptr>> &rules,
const vector<pair<string, rules::rule_ptr>> &aux_rules,
const vector<char> &separators) :
const set<char> &separators) :
PreparedGrammar(rules, aux_rules),
separators(separators) {}
}

View file

@ -3,6 +3,7 @@
#include <vector>
#include <string>
#include <set>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
@ -25,23 +26,29 @@ namespace tree_sitter {
class SyntaxGrammar : public PreparedGrammar {
public:
SyntaxGrammar();
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
SyntaxGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
const std::vector<rules::Symbol> &ubiquitous_tokens);
const std::set<rules::Symbol> &ubiquitous_tokens);
std::vector<rules::Symbol> ubiquitous_tokens;
std::set<rules::Symbol> ubiquitous_tokens;
};
class LexicalGrammar : public PreparedGrammar {
public:
LexicalGrammar();
LexicalGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
LexicalGrammar(
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
const std::vector<char> &separators);
const std::set<char> &separators);
std::vector<char> separators;
std::set<char> separators;
};
}

View file

@ -9,20 +9,37 @@ static const TSParseAction * actions_for_state(TSStateMachine *machine, TSStateI
return machine->config.parse_table + (state * machine->config.symbol_count);
}
void shift(TSStateMachine *machine, TSStateId parse_state, int is_extra) {
machine->lookahead->is_extra = is_extra;
void shift(TSStateMachine *machine, TSStateId parse_state) {
if (machine->lookahead->is_extra)
parse_state = ts_stack_top_state(&machine->stack);
ts_stack_push(&machine->stack, parse_state, machine->lookahead);
machine->lookahead = machine->next_lookahead;
machine->next_lookahead = NULL;
}
/*
 * Flags the current lookahead as an extra token and shifts it.
 * The flag must be set before calling shift(): shift() checks it and, when
 * set, replaces the requested state with the stack's current top state, so
 * the 0 passed here is only a placeholder.
 */
void shift_extra(TSStateMachine *machine) {
machine->lookahead->is_extra = 1;
shift(machine, 0);
}
void reduce(TSStateMachine *machine, TSSymbol symbol, size_t child_count) {
machine->next_lookahead = machine->lookahead;
machine->lookahead = ts_stack_reduce(&machine->stack,
symbol,
child_count,
machine->config.hidden_symbol_flags,
1);
machine->lookahead = ts_stack_reduce(
&machine->stack,
symbol,
child_count,
machine->config.hidden_symbol_flags, 1);
}
/*
 * Tries to turn the node on top of the stack into an extra token.
 *
 * If the top node has the given symbol and is not already flagged as extra,
 * performs a reduce() of `symbol` with a child count of 1, flags the
 * resulting lookahead as extra and returns 1. Otherwise leaves the machine
 * untouched and returns 0.
 */
int reduce_extra(TSStateMachine *machine, TSSymbol symbol) {
    TSTree *top = ts_stack_top_node(&machine->stack);

    /* Not applicable: different symbol, or the node is already an extra. */
    if (top->symbol != symbol || top->is_extra)
        return 0;

    reduce(machine, symbol, 1);
    machine->lookahead->is_extra = 1;
    return 1;
}
static size_t breakdown_stack(TSStateMachine *machine, TSInputEdit *edit) {
@ -199,6 +216,8 @@ void ts_state_machine_initialize(TSStateMachine *machine, TSInput input, TSInput
ts_lexer_advance(&machine->lexer);
}
// #define TS_DEBUG_PARSE
#ifdef TS_DEBUG_PARSE
#include <stdio.h>
#define DEBUG_PARSE(...) fprintf(stderr, "\n" __VA_ARGS__)
@ -212,26 +231,33 @@ TSTree * ts_state_machine_parse(TSStateMachine *machine, const char **symbol_nam
switch (action.type) {
case TSParseActionTypeShift:
DEBUG_PARSE("SHIFT %d", action.data.to_state);
shift(machine, action.data.to_state, 0);
shift(machine, action.data.to_state);
return NULL;
case TSParseActionTypeShiftExtra:
DEBUG_PARSE("SHIFT EXTRA");
shift(machine, ts_stack_top_state(&machine->stack), 1);
shift_extra(machine);
return NULL;
case TSParseActionTypeReduce:
DEBUG_PARSE("REDUCE %s %d", symbol_names[action.data.symbol], action.data.child_count);
reduce(machine, action.data.symbol, action.data.child_count);
return NULL;
case TSParseActionTypeReduceExtra:
if (!reduce_extra(machine, action.data.symbol))
goto error;
DEBUG_PARSE("REDUCE EXTRA");
return NULL;
case TSParseActionTypeAccept:
DEBUG_PARSE("ACCEPT");
return get_tree_root(machine);
case TSParseActionTypeError:
DEBUG_PARSE("ERROR");
if (handle_error(machine))
return NULL;
else
return get_tree_root(machine);
goto error;
default:
return NULL;
}
error:
DEBUG_PARSE("ERROR");
if (handle_error(machine))
return NULL;
else
return get_tree_root(machine);
}