Fix handling of ubiquitous tokens used in grammar rules
This commit is contained in:
parent
59cc65c2e3
commit
83a1b9439e
30 changed files with 39086 additions and 32890 deletions
|
|
@ -11,12 +11,16 @@
|
|||
#include "compiler/build_tables/parse_item.h"
|
||||
#include "compiler/build_tables/item_set_closure.h"
|
||||
#include "compiler/build_tables/item_set_transitions.h"
|
||||
#include "compiler/build_tables/first_set.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::pair;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using std::map;
|
||||
using std::unordered_map;
|
||||
using std::make_shared;
|
||||
using rules::Symbol;
|
||||
|
|
@ -33,8 +37,8 @@ namespace tree_sitter {
|
|||
if (pair == parse_state_ids.end()) {
|
||||
ParseStateId state_id = parse_table.add_state();
|
||||
parse_state_ids[item_set] = state_id;
|
||||
add_shift_actions(item_set, state_id);
|
||||
add_reduce_actions(item_set, state_id);
|
||||
add_shift_actions(item_set, state_id);
|
||||
add_ubiquitous_token_actions(item_set, state_id);
|
||||
return state_id;
|
||||
} else {
|
||||
|
|
@ -43,26 +47,34 @@ namespace tree_sitter {
|
|||
}
|
||||
|
||||
void add_shift_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
map<Symbol, size_t> shifts;
|
||||
|
||||
for (const auto &transition : sym_transitions(item_set, grammar)) {
|
||||
const Symbol &symbol = transition.first;
|
||||
const ParseItemSet &next_item_set = transition.second;
|
||||
auto &actions = parse_table.states[state_id].actions;
|
||||
auto current_action = actions.find(symbol);
|
||||
|
||||
set<int> precedence_values = precedence_values_for_item_set(next_item_set);
|
||||
if (current_action == actions.end() ||
|
||||
conflict_manager.resolve_parse_action(symbol, current_action->second, ParseAction::Shift(0, precedence_values))) {
|
||||
ParseAction new_action = ParseAction::Shift(0, precedence_values_for_item_set(next_item_set));
|
||||
if (should_add_action(state_id, symbol, new_action)) {
|
||||
ParseStateId new_state_id = add_parse_state(next_item_set);
|
||||
parse_table.add_action(state_id, symbol, ParseAction::Shift(new_state_id, precedence_values));
|
||||
new_action.state_index = new_state_id;
|
||||
parse_table.add_action(state_id, symbol, new_action);
|
||||
|
||||
shifts.insert({ symbol, new_state_id });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
for (const Symbol &symbol : grammar.ubiquitous_tokens) {
|
||||
auto &actions = parse_table.states[state_id].actions;
|
||||
if (actions.find(symbol) == actions.end())
|
||||
parse_table.add_action(state_id, symbol, ParseAction::ShiftExtra());
|
||||
for (auto &pair : shifts) {
|
||||
const Symbol &shift_symbol = pair.first;
|
||||
size_t new_state_id = pair.second;
|
||||
|
||||
if (grammar.ubiquitous_tokens.find(shift_symbol) != grammar.ubiquitous_tokens.end()) {
|
||||
for (const auto &pair : parse_table.states[state_id].actions) {
|
||||
const Symbol &lookahead_sym = pair.first;
|
||||
ParseAction action = ParseAction::ReduceExtra(shift_symbol);
|
||||
if (should_add_action(new_state_id, lookahead_sym, action))
|
||||
parse_table.add_action(new_state_id, lookahead_sym, action);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -75,19 +87,39 @@ namespace tree_sitter {
|
|||
ParseAction action = (item.lhs == rules::START()) ?
|
||||
ParseAction::Accept() :
|
||||
ParseAction::Reduce(item.lhs, item.consumed_symbol_count, item.precedence());
|
||||
|
||||
for (auto &lookahead_sym : lookahead_symbols) {
|
||||
auto current_actions = parse_table.states[state_id].actions;
|
||||
auto current_action = current_actions.find(lookahead_sym);
|
||||
if (current_action == current_actions.end() ||
|
||||
conflict_manager.resolve_parse_action(lookahead_sym, current_action->second, action)) {
|
||||
for (auto &lookahead_sym : lookahead_symbols)
|
||||
if (should_add_action(state_id, lookahead_sym, action))
|
||||
parse_table.add_action(state_id, lookahead_sym, action);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
for (const Symbol &symbol : grammar.ubiquitous_tokens) {
|
||||
auto &actions = parse_table.states[state_id].actions;
|
||||
if (actions.find(symbol) == actions.end())
|
||||
parse_table.add_action(state_id, symbol, ParseAction::ShiftExtra());
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the union of first_set(rule, grammar) taken over the rule of every
// item contained in `item_set`.
set<Symbol> first_set_for_item_set(const ParseItemSet &item_set) {
    set<Symbol> combined;
    for (const auto &entry : item_set) {
        const auto item_first_set = first_set(entry.first.rule, grammar);
        combined.insert(item_first_set.begin(), item_first_set.end());
    }
    return combined;
}
|
||||
|
||||
bool should_add_action(size_t state_id, const Symbol &symbol, const ParseAction &action) {
|
||||
auto current_actions = parse_table.states[state_id].actions;
|
||||
auto current_action = current_actions.find(symbol);
|
||||
return (
|
||||
current_action == current_actions.end() ||
|
||||
conflict_manager.resolve_parse_action(symbol, current_action->second, action)
|
||||
);
|
||||
}
|
||||
|
||||
set<int> precedence_values_for_item_set(const ParseItemSet &item_set) {
|
||||
set<int> result;
|
||||
for (const auto &pair : item_set) {
|
||||
|
|
|
|||
|
|
@ -290,6 +290,9 @@ namespace tree_sitter {
|
|||
symbol_id(action.symbol) + ", " +
|
||||
to_string(action.consumed_symbol_count) + ")");
|
||||
break;
|
||||
case ParseActionTypeReduceExtra:
|
||||
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
|
||||
break;
|
||||
default:;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,10 @@
|
|||
#include "compiler/rules/rule.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::string;
|
||||
using std::ostream;
|
||||
using std::pair;
|
||||
using std::set;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using rules::rule_ptr;
|
||||
|
||||
|
|
@ -59,20 +60,20 @@ namespace tree_sitter {
|
|||
return stream << string("#<null>");
|
||||
}
|
||||
|
||||
const vector<string> & Grammar::ubiquitous_tokens() const {
|
||||
const set<string> & Grammar::ubiquitous_tokens() const {
|
||||
return ubiquitous_tokens_;
|
||||
}
|
||||
|
||||
Grammar & Grammar::ubiquitous_tokens(const vector<string> &ubiquitous_tokens) {
|
||||
Grammar & Grammar::ubiquitous_tokens(const set<string> &ubiquitous_tokens) {
|
||||
ubiquitous_tokens_ = ubiquitous_tokens;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const vector<char> & Grammar::separators() const {
|
||||
const set<char> & Grammar::separators() const {
|
||||
return separators_;
|
||||
}
|
||||
|
||||
Grammar & Grammar::separators(const vector<char> &separators) {
|
||||
Grammar & Grammar::separators(const set<char> &separators) {
|
||||
separators_ = separators;
|
||||
return *this;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,11 @@ namespace tree_sitter {
|
|||
}
|
||||
|
||||
ParseAction ParseAction::ShiftExtra() {
|
||||
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, set<int>({}));
|
||||
return ParseAction(ParseActionTypeShiftExtra, -1, Symbol(-1), 0, { 0 });
|
||||
}
|
||||
|
||||
// Creates a ReduceExtra action for `symbol`. The action carries no target
// state (-1), consumes zero symbols, and has the single precedence value 0.
ParseAction ParseAction::ReduceExtra(Symbol symbol) {
    const std::set<int> precedence_values({ 0 });
    return ParseAction(ParseActionTypeReduceExtra, -1, symbol, 0, precedence_values);
}
|
||||
|
||||
ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, int precedence) {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ namespace tree_sitter {
|
|||
ParseActionTypeShift,
|
||||
ParseActionTypeShiftExtra,
|
||||
ParseActionTypeReduce,
|
||||
ParseActionTypeReduceExtra,
|
||||
ParseActionTypeAccept,
|
||||
} ParseActionType;
|
||||
|
||||
|
|
@ -28,8 +29,9 @@ namespace tree_sitter {
|
|||
static ParseAction Accept();
|
||||
static ParseAction Error();
|
||||
static ParseAction Shift(size_t state_index, std::set<int> precedence_values);
|
||||
static ParseAction ShiftExtra();
|
||||
static ParseAction Reduce(rules::Symbol symbol, size_t consumed_symbol_count, int precedence);
|
||||
static ParseAction ShiftExtra();
|
||||
static ParseAction ReduceExtra(rules::Symbol symbol);
|
||||
bool operator==(const ParseAction &action) const;
|
||||
|
||||
ParseActionType type;
|
||||
|
|
|
|||
|
|
@ -51,14 +51,14 @@ namespace tree_sitter {
|
|||
for (auto &pair : grammar.rules) {
|
||||
auto rule = expander.apply(pair.second);
|
||||
if (expander.error)
|
||||
return { LexicalGrammar({}, {}, {}), expander.error };
|
||||
return { LexicalGrammar(), expander.error };
|
||||
rules.push_back({ pair.first, rule });
|
||||
}
|
||||
|
||||
for (auto &pair : grammar.aux_rules) {
|
||||
auto rule = expander.apply(pair.second);
|
||||
if (expander.error)
|
||||
return { LexicalGrammar({}, {}, {}), expander.error };
|
||||
return { LexicalGrammar(), expander.error };
|
||||
aux_rules.push_back({ pair.first, rule });
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "compiler/prepare_grammar/extract_tokens.h"
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/prepared_grammar.h"
|
||||
|
|
@ -18,6 +19,7 @@ namespace tree_sitter {
|
|||
using std::map;
|
||||
using std::to_string;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using std::make_shared;
|
||||
using rules::rule_ptr;
|
||||
using rules::Symbol;
|
||||
|
|
@ -96,7 +98,7 @@ namespace tree_sitter {
|
|||
|
||||
pair<SyntaxGrammar, LexicalGrammar> extract_tokens(const InternedGrammar &input_grammar) {
|
||||
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
|
||||
vector<Symbol> ubiquitous_tokens;
|
||||
set<Symbol> ubiquitous_tokens;
|
||||
|
||||
TokenExtractor extractor;
|
||||
map<Symbol, Symbol> symbol_replacements;
|
||||
|
|
@ -120,7 +122,7 @@ namespace tree_sitter {
|
|||
for (auto &pair : rules)
|
||||
pair.second = inliner.apply(pair.second);
|
||||
for (auto &symbol : input_grammar.ubiquitous_tokens)
|
||||
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
|
||||
ubiquitous_tokens.insert(inliner.replace_symbol(symbol));
|
||||
|
||||
return {
|
||||
SyntaxGrammar(rules, aux_rules, ubiquitous_tokens),
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "compiler/prepare_grammar/intern_symbols.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/prepare_grammar/interned_grammar.h"
|
||||
#include "compiler/prepared_grammar.h"
|
||||
|
|
@ -12,6 +13,7 @@ namespace tree_sitter {
|
|||
using std::string;
|
||||
using rules::rule_ptr;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
using std::pair;
|
||||
using std::make_shared;
|
||||
|
||||
|
|
@ -58,12 +60,12 @@ namespace tree_sitter {
|
|||
rules.push_back({ pair.first, new_rule });
|
||||
}
|
||||
|
||||
vector<rules::Symbol> ubiquitous_tokens;
|
||||
set<rules::Symbol> ubiquitous_tokens;
|
||||
for (auto &name : grammar.ubiquitous_tokens()) {
|
||||
auto token = interner.symbol_for_rule_name(name);
|
||||
if (!token.get())
|
||||
return missing_rule_error(name);
|
||||
ubiquitous_tokens.push_back(*token);
|
||||
ubiquitous_tokens.insert(*token);
|
||||
}
|
||||
|
||||
InternedGrammar result;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
|
|
@ -12,8 +13,8 @@ namespace tree_sitter {
|
|||
class InternedGrammar {
|
||||
public:
|
||||
std::vector<std::pair<std::string, rules::rule_ptr>> rules;
|
||||
std::vector<rules::Symbol> ubiquitous_tokens;
|
||||
std::vector<char> separators;
|
||||
std::set<rules::Symbol> ubiquitous_tokens;
|
||||
std::set<char> separators;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ namespace tree_sitter {
|
|||
using std::string;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::set;
|
||||
|
||||
const rules::rule_ptr & PreparedGrammar::rule(const rules::Symbol &symbol) const {
|
||||
return symbol.is_auxiliary() ?
|
||||
|
|
@ -25,6 +26,16 @@ namespace tree_sitter {
|
|||
SyntaxGrammar::SyntaxGrammar() {}
|
||||
LexicalGrammar::LexicalGrammar() {}
|
||||
|
||||
SyntaxGrammar::SyntaxGrammar(
|
||||
const vector<pair<string, rules::rule_ptr>> &rules,
|
||||
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
|
||||
PreparedGrammar(rules, aux_rules) {}
|
||||
|
||||
LexicalGrammar::LexicalGrammar(
|
||||
const vector<pair<string, rules::rule_ptr>> &rules,
|
||||
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
|
||||
PreparedGrammar(rules, aux_rules) {}
|
||||
|
||||
PreparedGrammar::PreparedGrammar(
|
||||
const vector<pair<string, rules::rule_ptr>> &rules,
|
||||
const vector<pair<string, rules::rule_ptr>> &aux_rules) :
|
||||
|
|
@ -34,14 +45,14 @@ namespace tree_sitter {
|
|||
SyntaxGrammar::SyntaxGrammar(
|
||||
const vector<pair<string, rules::rule_ptr>> &rules,
|
||||
const vector<pair<string, rules::rule_ptr>> &aux_rules,
|
||||
const vector<rules::Symbol> &ubiquitous_tokens) :
|
||||
const set<rules::Symbol> &ubiquitous_tokens) :
|
||||
PreparedGrammar(rules, aux_rules),
|
||||
ubiquitous_tokens(ubiquitous_tokens) {}
|
||||
|
||||
LexicalGrammar::LexicalGrammar(
|
||||
const vector<pair<string, rules::rule_ptr>> &rules,
|
||||
const vector<pair<string, rules::rule_ptr>> &aux_rules,
|
||||
const vector<char> &separators) :
|
||||
const set<char> &separators) :
|
||||
PreparedGrammar(rules, aux_rules),
|
||||
separators(separators) {}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "compiler/rules/symbol.h"
|
||||
|
|
@ -25,23 +26,29 @@ namespace tree_sitter {
|
|||
class SyntaxGrammar : public PreparedGrammar {
|
||||
public:
|
||||
SyntaxGrammar();
|
||||
SyntaxGrammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
|
||||
SyntaxGrammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
|
||||
const std::vector<rules::Symbol> &ubiquitous_tokens);
|
||||
const std::set<rules::Symbol> &ubiquitous_tokens);
|
||||
|
||||
std::vector<rules::Symbol> ubiquitous_tokens;
|
||||
std::set<rules::Symbol> ubiquitous_tokens;
|
||||
};
|
||||
|
||||
class LexicalGrammar : public PreparedGrammar {
|
||||
public:
|
||||
LexicalGrammar();
|
||||
LexicalGrammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
|
||||
LexicalGrammar(
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules,
|
||||
const std::vector<char> &separators);
|
||||
const std::set<char> &separators);
|
||||
|
||||
std::vector<char> separators;
|
||||
std::set<char> separators;
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,20 +9,37 @@ static const TSParseAction * actions_for_state(TSStateMachine *machine, TSStateI
|
|||
return machine->config.parse_table + (state * machine->config.symbol_count);
|
||||
}
|
||||
|
||||
void shift(TSStateMachine *machine, TSStateId parse_state, int is_extra) {
|
||||
machine->lookahead->is_extra = is_extra;
|
||||
void shift(TSStateMachine *machine, TSStateId parse_state) {
|
||||
if (machine->lookahead->is_extra)
|
||||
parse_state = ts_stack_top_state(&machine->stack);
|
||||
ts_stack_push(&machine->stack, parse_state, machine->lookahead);
|
||||
machine->lookahead = machine->next_lookahead;
|
||||
machine->next_lookahead = NULL;
|
||||
}
|
||||
|
||||
// Pushes the current lookahead onto the stack as an "extra" token.
// The 0 passed as the target state is only a placeholder: shift() replaces
// it with the stack's current top state whenever the lookahead is flagged
// as extra, so the parse state is left unchanged by this token.
void shift_extra(TSStateMachine *machine) {
    machine->lookahead->is_extra = 1;
    shift(machine, 0);
}
|
||||
|
||||
void reduce(TSStateMachine *machine, TSSymbol symbol, size_t child_count) {
|
||||
machine->next_lookahead = machine->lookahead;
|
||||
machine->lookahead = ts_stack_reduce(&machine->stack,
|
||||
symbol,
|
||||
child_count,
|
||||
machine->config.hidden_symbol_flags,
|
||||
1);
|
||||
machine->lookahead = ts_stack_reduce(
|
||||
&machine->stack,
|
||||
symbol,
|
||||
child_count,
|
||||
machine->config.hidden_symbol_flags, 1);
|
||||
}
|
||||
|
||||
// Attempts to wrap the node on top of the stack into an "extra" node of
// type `symbol`.
//
// The reduction only happens when the top node has exactly that symbol and
// has not already been marked as extra. On success the single top node is
// reduced, the resulting lookahead is flagged as extra, and 1 is returned;
// otherwise nothing is modified and 0 is returned.
int reduce_extra(TSStateMachine *machine, TSSymbol symbol) {
    TSTree *top = ts_stack_top_node(&machine->stack);

    // Guard: wrong symbol, or the node was already treated as an extra.
    if (top->symbol != symbol || top->is_extra)
        return 0;

    reduce(machine, symbol, 1);
    machine->lookahead->is_extra = 1;
    return 1;
}
|
||||
|
||||
static size_t breakdown_stack(TSStateMachine *machine, TSInputEdit *edit) {
|
||||
|
|
@ -199,6 +216,8 @@ void ts_state_machine_initialize(TSStateMachine *machine, TSInput input, TSInput
|
|||
ts_lexer_advance(&machine->lexer);
|
||||
}
|
||||
|
||||
// #define TS_DEBUG_PARSE
|
||||
|
||||
#ifdef TS_DEBUG_PARSE
|
||||
#include <stdio.h>
|
||||
#define DEBUG_PARSE(...) fprintf(stderr, "\n" __VA_ARGS__)
|
||||
|
|
@ -212,26 +231,33 @@ TSTree * ts_state_machine_parse(TSStateMachine *machine, const char **symbol_nam
|
|||
switch (action.type) {
|
||||
case TSParseActionTypeShift:
|
||||
DEBUG_PARSE("SHIFT %d", action.data.to_state);
|
||||
shift(machine, action.data.to_state, 0);
|
||||
shift(machine, action.data.to_state);
|
||||
return NULL;
|
||||
case TSParseActionTypeShiftExtra:
|
||||
DEBUG_PARSE("SHIFT EXTRA");
|
||||
shift(machine, ts_stack_top_state(&machine->stack), 1);
|
||||
shift_extra(machine);
|
||||
return NULL;
|
||||
case TSParseActionTypeReduce:
|
||||
DEBUG_PARSE("REDUCE %s %d", symbol_names[action.data.symbol], action.data.child_count);
|
||||
reduce(machine, action.data.symbol, action.data.child_count);
|
||||
return NULL;
|
||||
case TSParseActionTypeReduceExtra:
|
||||
if (!reduce_extra(machine, action.data.symbol))
|
||||
goto error;
|
||||
DEBUG_PARSE("REDUCE EXTRA");
|
||||
return NULL;
|
||||
case TSParseActionTypeAccept:
|
||||
DEBUG_PARSE("ACCEPT");
|
||||
return get_tree_root(machine);
|
||||
case TSParseActionTypeError:
|
||||
DEBUG_PARSE("ERROR");
|
||||
if (handle_error(machine))
|
||||
return NULL;
|
||||
else
|
||||
return get_tree_root(machine);
|
||||
goto error;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
error:
|
||||
DEBUG_PARSE("ERROR");
|
||||
if (handle_error(machine))
|
||||
return NULL;
|
||||
else
|
||||
return get_tree_root(machine);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue