Record in parse table which actions can hide splits

Suppose a parse state S has multiple actions for a terminal lookahead symbol A.
Then during incremental parsing, while in state S, the parser should not
reuse a non-terminal lookahead B where FIRST(B) contains A, because reusing B
might prematurely discard one of the possible actions that a batch parser
would have attempted in state S, upon seeing A as a lookahead.
This commit is contained in:
Max Brunsfeld 2015-12-17 12:48:55 -08:00
parent 7fbb628c78
commit c495076adb
19 changed files with 58613 additions and 60661 deletions

View file

@ -79,7 +79,7 @@ class ParseTableBuilder {
add_reduce_extra_actions(state);
}
mark_fragile_reductions();
mark_fragile_actions();
remove_duplicate_states();
parse_table.symbols.insert({ rules::ERROR(), {} });
@ -134,9 +134,9 @@ class ParseTableBuilder {
}
void add_shift_extra_actions(ParseStateId state_id) {
ParseAction action = ParseAction::ShiftExtra();
for (const Symbol &ubiquitous_symbol : grammar.ubiquitous_tokens)
add_action(state_id, ubiquitous_symbol, ParseAction::ShiftExtra(),
null_item_set);
add_action(state_id, ubiquitous_symbol, action, null_item_set);
}
void add_reduce_extra_actions(ParseStateId state_id) {
@ -148,7 +148,7 @@ class ParseTableBuilder {
continue;
for (const ParseAction &action : actions_for_symbol->second)
if (action.type == ParseActionTypeShift) {
if (action.type == ParseActionTypeShift && !action.extra) {
size_t dest_state_id = action.state_index;
ParseAction reduce_extra = ParseAction::ReduceExtra(ubiquitous_symbol);
for (const auto &pair : state.actions)
@ -157,14 +157,36 @@ class ParseTableBuilder {
}
}
void mark_fragile_reductions() {
void mark_fragile_actions() {
for (ParseState &state : parse_table.states) {
set<Symbol> symbols_with_multiple_actions;
for (auto &entry : state.actions) {
if (entry.second.size() > 1)
symbols_with_multiple_actions.insert(entry.first);
for (ParseAction &action : entry.second) {
if (action.type == ParseActionTypeReduce) {
if (action.type == ParseActionTypeReduce && !action.extra) {
if (has_fragile_production(action.production))
action.type = ParseActionTypeReduceFragile;
action.fragile = true;
action.production = NULL;
action.precedence_range = PrecedenceRange();
action.associativity = rules::AssociativityNone;
}
}
}
if (!symbols_with_multiple_actions.empty()) {
for (auto &entry : state.actions) {
if (!entry.first.is_token) {
set<Symbol> first_set = get_first_set(entry.first);
for (const Symbol &symbol : symbols_with_multiple_actions) {
if (first_set.count(symbol)) {
entry.second[0].can_hide_split = true;
break;
}
}
}
}
}
@ -175,6 +197,7 @@ class ParseTableBuilder {
bool done = false;
while (!done) {
done = true;
map<ParseStateId, ParseStateId> replacements;
for (size_t i = 0, size = parse_table.states.size(); i < size; i++) {
for (size_t j = 0; j < i; j++) {
@ -210,9 +233,8 @@ class ParseTableBuilder {
}
}
for (auto replacement = replacements.rbegin(); replacement != replacements.rend(); ++replacement) {
parse_table.states.erase(parse_table.states.begin() + replacement->first);
}
for (auto i = replacements.rbegin(); i != replacements.rend(); ++i)
parse_table.states.erase(parse_table.states.begin() + i->first);
}
}

View file

@ -22,12 +22,12 @@ pair<bool, ConflictType> ParseConflictManager::resolve(
switch (old_action.type) {
case ParseActionTypeError:
case ParseActionTypeShiftExtra:
case ParseActionTypeReduceExtra:
return { true, ConflictTypeNone };
case ParseActionTypeShift:
if (new_action.type == ParseActionTypeReduce) {
if (new_action.extra)
return { false, ConflictTypeNone };
int min_precedence = old_action.precedence_range.min;
int max_precedence = old_action.precedence_range.max;
int new_precedence = new_action.precedence_range.max;
@ -54,6 +54,12 @@ pair<bool, ConflictType> ParseConflictManager::resolve(
}
case ParseActionTypeReduce:
if (new_action.extra)
return { false, ConflictTypeNone };
if (old_action.extra)
return { true, ConflictTypeNone };
if (new_action.extra)
return { false, ConflictTypeNone };
if (new_action.type == ParseActionTypeReduce) {
int old_precedence = old_action.precedence_range.min;
int new_precedence = new_action.precedence_range.min;

View file

@ -21,8 +21,7 @@ ParseItem::ParseItem(const Symbol &lhs, const Production &production,
bool ParseItem::operator==(const ParseItem &other) const {
return ((variable_index == other.variable_index) &&
(step_index == other.step_index) &&
(production == other.production));
(step_index == other.step_index) && (production == other.production));
}
bool ParseItem::operator<(const ParseItem &other) const {
@ -75,7 +74,7 @@ rules::Associativity ParseItem::associativity() const {
size_t ParseItem::Hash::operator()(const ParseItem &item) const {
size_t result = hash<int>()(item.variable_index);
result ^= hash<unsigned int>()(item.step_index);
result ^= hash<void *>()((void *)item.production);
result ^= hash<const void *>()(static_cast<const void *>(item.production));
return result;
}

View file

@ -329,21 +329,23 @@ class CCodeGenerator {
add("ACCEPT_INPUT()");
break;
case ParseActionTypeShift:
add("SHIFT(" + to_string(action.state_index) + ")");
break;
case ParseActionTypeShiftExtra:
add("SHIFT_EXTRA()");
break;
case ParseActionTypeReduceFragile:
add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
if (action.extra) {
add("SHIFT_EXTRA()");
} else {
add("SHIFT(" + to_string(action.state_index) + ", ");
add_action_flags(action);
add(")");
}
break;
case ParseActionTypeReduce:
add("REDUCE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
break;
case ParseActionTypeReduceExtra:
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
if (action.extra) {
add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")");
} else {
add("REDUCE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ", ");
add_action_flags(action);
add(")");
}
break;
default: {}
}
@ -351,6 +353,17 @@ class CCodeGenerator {
}
}
// Emits the flags argument for a generated parse action: "FRAGILE",
// "CAN_HIDE_SPLIT", both joined with '|', or "0" when neither flag is set.
void add_action_flags(const ParseAction &action) {
  if (action.fragile) {
    // Fragile actions may additionally be marked as able to hide a split.
    add(action.can_hide_split ? "FRAGILE|CAN_HIDE_SPLIT" : "FRAGILE");
    return;
  }

  add(action.can_hide_split ? "CAN_HIDE_SPLIT" : "0");
}
// Helper functions
string lex_state_index(size_t i) {

View file

@ -17,6 +17,9 @@ ParseAction::ParseAction(ParseActionType type, ParseStateId state_index,
rules::Associativity associativity,
const Production *production)
: type(type),
extra(false),
fragile(false),
can_hide_split(false),
symbol(symbol),
state_index(state_index),
consumed_symbol_count(consumed_symbol_count),
@ -26,6 +29,9 @@ ParseAction::ParseAction(ParseActionType type, ParseStateId state_index,
ParseAction::ParseAction()
: type(ParseActionTypeError),
extra(false),
fragile(false),
can_hide_split(false),
symbol(Symbol(-1)),
state_index(-1),
consumed_symbol_count(0),
@ -49,14 +55,17 @@ ParseAction ParseAction::Shift(ParseStateId state_index,
ParseAction ParseAction::ShiftExtra() {
ParseAction action;
action.type = ParseActionTypeShiftExtra;
action.type = ParseActionTypeShift;
action.extra = true;
return action;
}
ParseAction ParseAction::ReduceExtra(Symbol symbol) {
ParseAction action;
action.type = ParseActionTypeReduceExtra;
action.type = ParseActionTypeReduce;
action.extra = true;
action.symbol = symbol;
action.consumed_symbol_count = 1;
return action;
}
@ -69,13 +78,11 @@ ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count,
}
bool ParseAction::operator==(const ParseAction &other) const {
return (
type == other.type &&
symbol == other.symbol &&
state_index == other.state_index &&
production == other.production &&
consumed_symbol_count == other.consumed_symbol_count
);
return (type == other.type && extra == other.extra &&
fragile == other.fragile && can_hide_split == other.can_hide_split &&
symbol == other.symbol && state_index == other.state_index &&
production == other.production &&
consumed_symbol_count == other.consumed_symbol_count);
}
bool ParseAction::operator<(const ParseAction &other) const {
@ -83,6 +90,18 @@ bool ParseAction::operator<(const ParseAction &other) const {
return true;
if (other.type < type)
return false;
if (extra && !other.extra)
return true;
if (other.extra && !extra)
return false;
if (fragile && !other.fragile)
return true;
if (other.fragile && !fragile)
return false;
if (can_hide_split && !other.can_hide_split)
return true;
if (other.can_hide_split && !can_hide_split)
return false;
if (symbol < other.symbol)
return true;
if (other.symbol < symbol)
@ -121,16 +140,20 @@ ParseStateId ParseTable::add_state() {
ParseAction &ParseTable::set_action(ParseStateId id, Symbol symbol,
ParseAction action) {
bool structural = action.type != ParseActionTypeShiftExtra;
symbols[symbol].structural += structural;
if (action.extra)
symbols[symbol];
else
symbols[symbol].structural = true;
states[id].actions[symbol] = vector<ParseAction>({ action });
return *states[id].actions[symbol].begin();
}
ParseAction &ParseTable::add_action(ParseStateId id, Symbol symbol,
ParseAction action) {
bool structural = action.type != ParseActionTypeShiftExtra;
symbols[symbol].structural += structural;
if (action.extra)
symbols[symbol];
else
symbols[symbol].structural = true;
states[id].actions[symbol].push_back(action);
return *states[id].actions[symbol].rbegin();
}

View file

@ -17,10 +17,6 @@ typedef uint64_t ParseStateId;
typedef enum {
ParseActionTypeError,
ParseActionTypeReduceExtra,
ParseActionTypeReduceFragile,
ParseActionTypeShiftExtra,
ParseActionTypeShift,
ParseActionTypeReduce,
ParseActionTypeAccept,
@ -45,6 +41,9 @@ class ParseAction {
bool operator<(const ParseAction &) const;
ParseActionType type;
bool extra;
bool fragile;
bool can_hide_split;
rules::Symbol symbol;
ParseStateId state_index;
size_t consumed_symbol_count;
@ -64,6 +63,8 @@ struct hash<tree_sitter::ParseAction> {
hash<tree_sitter::rules::Symbol>()(action.symbol) ^
hash<size_t>()(action.state_index) ^
hash<size_t>()(action.consumed_symbol_count) ^
hash<bool>()(action.extra) ^ hash<bool>()(action.fragile) ^
hash<bool>()(action.can_hide_split) ^
hash<int>()(action.associativity) ^
hash<int>()(action.precedence_range.min) ^
hash<int>()(action.precedence_range.max) ^

View file

@ -81,7 +81,8 @@ SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) {
vector<Production> productions;
for (const rule_ptr &rule_component : extract_choices(variable.rule)) {
Production production = FlattenRule().flatten(rule_component);
if (std::find(productions.begin(), productions.end(), production) == productions.end())
if (std::find(productions.begin(), productions.end(), production) ==
productions.end())
productions.push_back(production);
}
result.variables.push_back(

View file

@ -21,9 +21,7 @@ SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence,
rules::Associativity associativity)
: symbol(symbol),
precedence(precedence),
associativity(associativity) {}
: symbol(symbol), precedence(precedence), associativity(associativity) {}
bool ProductionStep::operator==(const ProductionStep &other) const {
return symbol == other.symbol && precedence == other.precedence &&

View file

@ -23,6 +23,8 @@
#define SYM_NAME(sym) self->language->symbol_names[sym]
#define BOOL_STRING(value) (value ? "true" : "false")
typedef struct {
TSTree *reusable_subtree;
size_t reusable_subtree_pos;
@ -46,7 +48,8 @@ static void ts_parser__breakdown_top_of_stack(TSParser *self, int head) {
StackPopResult *first_result = vector_get(&pop_results, 0);
TSTree **removed_trees = first_result->trees;
TSTree *parent = removed_trees[0];
LOG("breakdown_pop sym:%s, size:%lu", SYM_NAME(parent->symbol), ts_tree_total_size(parent).chars);
LOG("breakdown_pop sym:%s, size:%lu", SYM_NAME(parent->symbol),
ts_tree_total_size(parent).chars);
for (size_t i = 0; i < pop_results.size; i++) {
StackPopResult *pop_result = vector_get(&pop_results, i);
@ -58,23 +61,23 @@ static void ts_parser__breakdown_top_of_stack(TSParser *self, int head) {
for (size_t j = 0; j < parent->child_count; j++) {
last_child = parent->children[j];
if (!last_child->options.extra) {
TSParseAction action = ts_language_last_action(self->language, state, last_child->symbol);
TSParseAction action =
ts_language_last_action(self->language, state, last_child->symbol);
assert(action.type == TSParseActionTypeShift);
state = action.data.to_state;
}
LOG("breakdown_push sym:%s, size:%lu", SYM_NAME(last_child->symbol), ts_tree_total_size(last_child).chars);
merged = ts_stack_push(self->stack, pop_result->head_index, state, last_child);
LOG("breakdown_push sym:%s, size:%lu", SYM_NAME(last_child->symbol),
ts_tree_total_size(last_child).chars);
merged =
ts_stack_push(self->stack, pop_result->head_index, state, last_child);
}
for (size_t j = 1, count = pop_result->tree_count; j < count; j++) {
merged = ts_stack_push(self->stack, pop_result->head_index, state, pop_result->trees[j]);
}
for (size_t j = 1, count = pop_result->tree_count; j < count; j++)
merged = ts_stack_push(self->stack, pop_result->head_index, state,
pop_result->trees[j]);
if (i == 0)
assert(!merged);
else
assert(merged);
assert((i == 0) ^ merged);
}
free(removed_trees);
@ -115,12 +118,13 @@ static void ts_parser__pop_reusable_subtree(LookaheadState *state) {
}
static bool ts_parser__can_reuse(TSParser *self, int head, TSTree *subtree) {
if (!subtree || subtree->symbol == ts_builtin_sym_error || ts_tree_is_fragile(subtree))
if (!subtree || subtree->symbol == ts_builtin_sym_error ||
ts_tree_is_fragile(subtree))
return false;
TSStateId state = ts_stack_top_state(self->stack, head);
const TSParseAction *action =
ts_language_actions(self->language, state, subtree->symbol);
return action->type != TSParseActionTypeError;
return action->type != TSParseActionTypeError && !action->can_hide_split;
}
/*
@ -155,7 +159,8 @@ static TSTree *ts_parser__get_next_lookahead(TSParser *self, int head) {
LOG("breakdown_extra sym:%s", SYM_NAME(state->reusable_subtree->symbol));
can_reuse = false;
} else if (!ts_parser__can_reuse(self, head, state->reusable_subtree)) {
LOG("breakdown_non_reusable sym:%s", SYM_NAME(state->reusable_subtree->symbol));
LOG("breakdown_non_reusable sym:%s",
SYM_NAME(state->reusable_subtree->symbol));
can_reuse = false;
}
@ -196,10 +201,12 @@ static TSTree *ts_parser__select_tree(void *data, TSTree *left, TSTree *right) {
TSParser *self = data;
int comparison = ts_tree_compare(left, right);
if (comparison <= 0) {
LOG("select tree:%s, over_tree:%s", SYM_NAME(left->symbol), SYM_NAME(right->symbol));
LOG("select tree:%s, over_tree:%s", SYM_NAME(left->symbol),
SYM_NAME(right->symbol));
return left;
} else {
LOG("select tree:%s, over_tree:%s", SYM_NAME(right->symbol), SYM_NAME(left->symbol));
LOG("select tree:%s, over_tree:%s", SYM_NAME(right->symbol),
SYM_NAME(left->symbol));
return right;
}
}
@ -208,8 +215,8 @@ static TSTree *ts_parser__select_tree(void *data, TSTree *left, TSTree *right) {
* Parse Actions
*/
static bool ts_parser__shift(TSParser *self, int head,
TSStateId parse_state, TSTree *lookahead) {
static bool ts_parser__shift(TSParser *self, int head, TSStateId parse_state,
TSTree *lookahead) {
if (self->language->symbol_metadata[lookahead->symbol].extra)
ts_tree_set_fragile(lookahead);
if (ts_stack_push(self->stack, head, parse_state, lookahead)) {
@ -231,7 +238,8 @@ static bool ts_parser__shift_extra(TSParser *self, int head, TSStateId state,
}
static bool ts_parser__reduce(TSParser *self, int head, TSSymbol symbol,
int child_count, bool extra, bool count_extra) {
int child_count, bool extra, bool fragile,
bool count_extra) {
vector_clear(&self->reduce_parents);
const TSSymbolMetadata *all_metadata = self->language->symbol_metadata;
TSSymbolMetadata metadata = all_metadata[symbol];
@ -359,25 +367,21 @@ static bool ts_parser__reduce(TSParser *self, int head, TSSymbol symbol,
}
}
return removed_heads < revealed_heads;
}
static bool ts_parser__reduce_fragile(TSParser *self, int head, TSSymbol symbol,
size_t child_count) {
bool result = ts_parser__reduce(self, head, symbol, child_count, false, false);
if (result)
if (fragile) {
for (size_t i = 0; i < self->reduce_parents.size; i++) {
TSTree **parent = vector_get(&self->reduce_parents, i);
ts_tree_set_fragile_left(*parent);
ts_tree_set_fragile_right(*parent);
}
return result;
}
return removed_heads < revealed_heads;
}
static void ts_parser__reduce_error(TSParser *self, int head,
size_t child_count, TSTree *lookahead) {
bool result = ts_parser__reduce(self, head, ts_builtin_sym_error, child_count,
false, true);
false, false, true);
if (result) {
TSTree **parent = vector_back(&self->reduce_parents);
StackEntry *stack_entry = ts_stack_head(self->stack, head);
@ -528,7 +532,8 @@ static bool ts_parser__consume_lookahead(TSParser *self, int head,
LOG("split_action from_head:%d, new_head:%d", head, current_head);
}
LookaheadState *lookahead_state = vector_get(&self->lookahead_states, current_head);
LookaheadState *lookahead_state =
vector_get(&self->lookahead_states, current_head);
// TODO: Remove this by making a separate symbol for errors returned from
// the lexer.
@ -555,40 +560,33 @@ static bool ts_parser__consume_lookahead(TSParser *self, int head,
}
case TSParseActionTypeShift:
LOG("shift state:%u", action.data.to_state);
lookahead_state->is_verifying = (lookahead->child_count > 0);
return ts_parser__shift(self, current_head, action.data.to_state,
lookahead);
case TSParseActionTypeShiftExtra:
LOG("shift_extra");
return ts_parser__shift_extra(self, current_head, state, lookahead);
if (action.extra) {
LOG("shift_extra");
return ts_parser__shift_extra(self, current_head, state, lookahead);
} else {
LOG("shift state:%u", action.data.to_state);
lookahead_state->is_verifying = (lookahead->child_count > 0);
return ts_parser__shift(self, current_head, action.data.to_state,
lookahead);
}
case TSParseActionTypeReduce:
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.data.symbol),
action.data.child_count);
lookahead_state->is_verifying = false;
if (!ts_parser__reduce(self, current_head, action.data.symbol,
action.data.child_count, false, false))
if (!next_action)
return false;
break;
case TSParseActionTypeReduceExtra:
LOG("reduce_extra sym:%s", SYM_NAME(action.data.symbol));
lookahead_state->is_verifying = false;
ts_parser__reduce(self, current_head, action.data.symbol, 1,
true, false);
break;
case TSParseActionTypeReduceFragile:
LOG("reduce_fragile sym:%s, count:%u", SYM_NAME(action.data.symbol),
action.data.child_count);
lookahead_state->is_verifying = false;
if (!ts_parser__reduce_fragile(self, current_head, action.data.symbol,
action.data.child_count))
if (!next_action)
return false;
if (action.extra) {
LOG("reduce_extra sym:%s", SYM_NAME(action.data.symbol));
ts_parser__reduce(self, current_head, action.data.symbol, 1, true,
false, false);
} else {
LOG("reduce sym:%s, child_count:%u, fragile:%s",
SYM_NAME(action.data.symbol), action.data.child_count,
BOOL_STRING(action.fragile));
if (!ts_parser__reduce(self, current_head, action.data.symbol,
action.data.child_count, false,
action.fragile, false))
if (!next_action)
return false;
}
break;
case TSParseActionTypeAccept:
@ -643,7 +641,8 @@ TSTree *ts_parser_parse(TSParser *self, TSInput input, TSTree *previous_tree) {
ts_stack_head_count(self->stack),
ts_stack_top_state(self->stack, head), position.chars);
if (!ts_parser__can_reuse(self, head, lookahead) || position.chars != last_position.chars) {
if (!ts_parser__can_reuse(self, head, lookahead) ||
position.chars != last_position.chars) {
TSTree *reused_lookahead = ts_parser__get_next_lookahead(self, head);
if (ts_parser__can_reuse(self, head, reused_lookahead)) {
lookahead = reused_lookahead;