Optimize iteration over state successors during query analysis

This commit is contained in:
Max Brunsfeld 2020-08-19 13:15:45 -07:00
parent bd42729a41
commit aac75e35b1
2 changed files with 267 additions and 147 deletions

View file

@ -20,6 +20,22 @@ typedef struct {
bool is_reusable;
} TableEntry;
// Cursor state for walking the symbols that have parse-table entries in a
// single parse state. An iterator is created by ts_language_lookaheads() and
// advanced by ts_lookahead_iterator_next(); after each successful advance the
// `symbol`, `table_value`, `actions`/`action_count` and `next_state` fields
// describe the current entry.
typedef struct {
// Language whose parse tables are being read.
const TSLanguage *language;
// Read cursor into the parse table (large states) or the small parse
// table (small states). It is pre-incremented on every advance.
const uint16_t *data;
// Small states only: the position at which the current symbol group ends.
// Left as NULL for large states.
const uint16_t *group_end;
// NOTE(review): not assigned by ts_language_lookaheads() or read by
// ts_lookahead_iterator_next(); it is zero-initialized there. Confirm
// whether any other code relies on it.
TSStateId state;
// Raw parse-table value for the current symbol.
uint16_t table_value;
// NOTE(review): not referenced by the iterator functions visible here.
uint16_t section_index;
// Small states only: number of symbol groups that have not been entered yet.
uint16_t group_count;
// True when the state is stored in the small parse table format.
bool is_small_state;
// Actions for the current symbol. Only assigned when the symbol is a
// terminal (symbol < language->token_count).
const TSParseAction *actions;
// The symbol the iterator is currently positioned on.
TSSymbol symbol;
// Successor state for a non-terminal symbol; set to 0 for terminals.
TSStateId next_state;
// Number of entries in `actions`; set to 0 for non-terminals.
uint16_t action_count;
} LookaheadIterator;
void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *);
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
@ -62,6 +78,13 @@ static inline bool ts_language_has_reduce_action(
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
}
// Look up the table value for a given symbol and state.
//
// For non-terminal symbols, the table value represents a successor state.
// For terminal symbols, it represents an index in the actions table.
// For 'large' parse states, this is a direct lookup. For 'small' parse
// states, this requires searching through the symbol groups to find
// the given symbol.
static inline uint16_t ts_language_lookup(
const TSLanguage *self,
TSStateId state,
@ -73,8 +96,8 @@ static inline uint16_t ts_language_lookup(
) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
const uint16_t *data = &self->small_parse_table[index];
uint16_t section_count = *(data++);
for (unsigned i = 0; i < section_count; i++) {
uint16_t group_count = *(data++);
for (unsigned i = 0; i < group_count; i++) {
uint16_t section_value = *(data++);
uint16_t symbol_count = *(data++);
for (unsigned i = 0; i < symbol_count; i++) {
@ -87,6 +110,85 @@ static inline uint16_t ts_language_lookup(
}
}
// Create an iterator over the symbols that have entries in the parse table
// for the given state. The returned iterator is positioned *before* the first
// entry; call ts_lookahead_iterator_next() to advance it.
//
// 'Large' parse states are stored as a dense row of the parse table, so the
// iterator will scan every symbol and skip the empty (zero) entries.
// 'Small' parse states are stored as a group count followed by groups of
// symbols, so the iterator only ever visits the symbols that are listed.
static inline LookaheadIterator ts_language_lookaheads(
  const TSLanguage *self,
  TSStateId state
) {
  // Fields that are not named here (state, table_value, section_index,
  // actions, action_count) are zero-initialized.
  LookaheadIterator iterator = {
    .language = self,
    .data = NULL,
    .group_end = NULL,
    .group_count = 0,
    .is_small_state = false,
    // The large-state path increments the symbol before using it, so it
    // starts at the maximum value (presumably wrapping to symbol 0, assuming
    // TSSymbol is a 16-bit unsigned type).
    .symbol = UINT16_MAX,
    .next_state = 0,
  };

  if (
    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
    state >= self->large_state_count
  ) {
    uint32_t table_offset = self->small_parse_table_map[state - self->large_state_count];
    const uint16_t *group_header = self->small_parse_table + table_offset;

    // The first value of a small state is its number of groups. The cursor is
    // left on that value and `group_end` just past it, so that the first
    // advance immediately steps into the first group.
    iterator.is_small_state = true;
    iterator.group_count = group_header[0];
    iterator.data = group_header;
    iterator.group_end = group_header + 1;
  } else {
    // The cursor is pre-incremented on every advance, so it starts one
    // element before the state's row.
    // NOTE(review): for state 0 this points one element before the start of
    // the parse table (assuming `parse_table` is the array base), which ISO C
    // does not define even though it is never dereferenced - confirm this is
    // acceptable on all supported targets.
    iterator.data = self->parse_table + state * self->symbol_count - 1;
  }

  return iterator;
}
// Advance the iterator to the next symbol that has a parse-table entry.
//
// Returns true if the iterator now rests on a symbol, in which case `symbol`
// and `table_value` describe it, and either `actions`/`action_count` (for
// terminals) or `next_state` (for non-terminals) is populated. Returns false
// once there are no more symbols.
static inline bool ts_lookahead_iterator_next(LookaheadIterator *self) {
  if (self->is_small_state) {
    // Small parse states list their valid symbols explicitly, grouped by
    // table value.
    self->data += 1;

    // Still inside the current group: every symbol of a group shares one
    // table value, so only the symbol changes and the previously decoded
    // actions / successor state are left untouched.
    if (self->data != self->group_end) {
      self->symbol = *self->data;
      return true;
    }

    // The current group is used up; stop if no further group follows.
    if (self->group_count == 0) return false;
    self->group_count -= 1;

    // A group is laid out as: table value, symbol count, then the symbols.
    const uint16_t *group = self->data;
    unsigned symbol_count = group[1];
    self->table_value = group[0];
    self->data = group + 2;
    self->group_end = self->data + symbol_count;
    self->symbol = *self->data;
  } else {
    // Large parse states have one table slot per symbol; step through the
    // slots until a non-zero one is found or the symbols run out.
    for (;;) {
      self->data += 1;
      self->symbol += 1;
      if (self->symbol >= self->language->symbol_count) return false;
      self->table_value = *self->data;
      if (self->table_value != 0) break;
    }
  }

  // Decode the table value. For a non-terminal it is the successor state;
  // for a terminal it is an index into the language's parse actions.
  const TSLanguage *language = self->language;
  if (self->symbol >= language->token_count) {
    self->action_count = 0;
    self->next_state = self->table_value;
    return true;
  }

  // The indexed entry holds the action count; the actions themselves begin
  // at the entry that follows it.
  const TSParseActionEntry *header = language->parse_actions + self->table_value;
  self->action_count = header->entry.count;
  self->actions = (const TSParseAction *)(header + 1);
  self->next_state = 0;
  return true;
}
static inline TSStateId ts_language_next_state(
const TSLanguage *self,
TSStateId state,

View file

@ -599,7 +599,7 @@ static inline int analysis_state__compare(
if (self->stack[i].parse_state > other->stack[i].parse_state) return 1;
if (self->stack[i].field_id < other->stack[i].field_id) return -1;
if (self->stack[i].field_id > other->stack[i].field_id) return 1;
}
}
if (self->step_index < other->step_index) return -1;
if (self->step_index > other->step_index) return 1;
return 0;
@ -769,47 +769,44 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
// 3) A list of predecessor states for each state.
StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language);
for (TSStateId state = 1; state < self->language->state_count; state++) {
unsigned subgraph_index = 0, exists;
for (TSSymbol sym = 0; sym < self->language->token_count; sym++) {
unsigned count;
const TSParseAction *actions = ts_language_actions(self->language, state, sym, &count);
for (unsigned i = 0; i < count; i++) {
const TSParseAction *action = &actions[i];
if (action->type == TSParseActionTypeReduce) {
TSSymbol symbol = self->language->public_symbol_map[action->params.reduce.symbol];
array_search_sorted_by(
&subgraphs,
0,
.symbol,
symbol,
&subgraph_index,
&exists
);
if (exists) {
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
.state = state,
.production_id = action->params.reduce.production_id,
.child_index = action->params.reduce.child_count,
.done = true,
}));
unsigned subgraph_index, exists;
LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state);
while (ts_lookahead_iterator_next(&lookahead_iterator)) {
if (lookahead_iterator.action_count) {
for (unsigned i = 0; i < lookahead_iterator.action_count; i++) {
const TSParseAction *action = &lookahead_iterator.actions[i];
if (action->type == TSParseActionTypeReduce) {
TSSymbol symbol = self->language->public_symbol_map[action->params.reduce.symbol];
array_search_sorted_by(
&subgraphs,
0,
.symbol,
symbol,
&subgraph_index,
&exists
);
if (exists) {
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
.state = state,
.production_id = action->params.reduce.production_id,
.child_index = action->params.reduce.child_count,
.done = true,
}));
}
}
} else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
TSStateId next_state = action->params.shift.state;
state_predecessor_map_add(&predecessor_map, next_state, state);
}
} else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
TSStateId next_state = action->params.shift.state;
state_predecessor_map_add(&predecessor_map, next_state, state);
}
}
}
for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) {
TSStateId next_state = ts_language_next_state(self->language, state, sym);
if (next_state != 0 && next_state != state) {
state_predecessor_map_add(&predecessor_map, next_state, state);
TSSymbol symbol = self->language->public_symbol_map[sym];
} else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != state) {
state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
TSSymbol symbol = self->language->public_symbol_map[lookahead_iterator.symbol];
array_search_sorted_by(
&subgraphs,
subgraph_index,
0,
.symbol,
symbol,
&subgraph_index,
@ -871,6 +868,12 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
for (unsigned i = 0; i < subgraphs.size; i++) {
AnalysisSubgraph *subgraph = &subgraphs.contents[i];
printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol));
for (unsigned j = 0; j < subgraph->start_states.size; j++) {
printf(
" {state: %u}\n",
subgraph->start_states.contents[j]
);
}
for (unsigned j = 0; j < subgraph->nodes.size; j++) {
AnalysisSubgraphNode *node = &subgraph->nodes.contents[j];
printf(
@ -985,122 +988,137 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
// Follow every possible path in the parse table, but only visit states that
// are part of the subgraph for the current symbol.
for (TSSymbol sym = 0; sym < self->language->symbol_count; sym++) {
LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state);
while (ts_lookahead_iterator_next(&lookahead_iterator)) {
TSSymbol sym = lookahead_iterator.symbol;
TSStateId next_parse_state;
if (lookahead_iterator.action_count) {
const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
next_parse_state = action->params.shift.state;
} else {
continue;
}
} else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != parse_state) {
next_parse_state = lookahead_iterator.next_state;
} else {
continue;
}
AnalysisSubgraphNode successor = {
.state = ts_language_next_state(self->language, parse_state, sym),
.state = next_parse_state,
.child_index = child_index + 1,
};
if (successor.state && successor.state != parse_state) {
unsigned node_index;
array_search_sorted_with(
&subgraph->nodes, 0,
analysis_subgraph_node__compare, &successor,
&node_index, &exists
);
while (node_index < subgraph->nodes.size) {
AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
if (node->state != successor.state || node->child_index != successor.child_index) break;
unsigned node_index;
array_search_sorted_with(
&subgraph->nodes, 0,
analysis_subgraph_node__compare, &successor,
&node_index, &exists
);
while (node_index < subgraph->nodes.size) {
AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
if (node->state != successor.state || node->child_index != successor.child_index) break;
// Use the subgraph to determine what alias and field will eventually be applied
// to this child node.
TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
TSSymbol visible_symbol = alias
? alias
: self->language->symbol_metadata[sym].visible
? self->language->public_symbol_map[sym]
: 0;
TSFieldId field_id = parent_field_id;
if (!field_id) {
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
for (; field_map != field_map_end; field_map++) {
if (field_map->child_index == child_index) {
field_id = field_map->field_id;
break;
}
}
}
AnalysisState next_state = *state;
analysis_state__top(&next_state)->child_index++;
analysis_state__top(&next_state)->parse_state = successor.state;
if (node->done) analysis_state__top(&next_state)->done = true;
// Determine if this hypothetical child node would match the current step
// of the query pattern.
bool does_match = false;
if (visible_symbol) {
does_match = true;
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
} else if (step->symbol != WILDCARD_SYMBOL) {
if (step->symbol != visible_symbol) does_match = false;
}
if (step->field && step->field != field_id) {
does_match = false;
}
}
// If this is a hidden child, then push a new entry to the stack, in order to
// walk through the children of this child.
else if (sym >= self->language->token_count && next_state.depth < MAX_ANALYSIS_STATE_DEPTH) {
next_state.depth++;
analysis_state__top(&next_state)->parse_state = parse_state;
analysis_state__top(&next_state)->child_index = 0;
analysis_state__top(&next_state)->parent_symbol = sym;
analysis_state__top(&next_state)->field_id = field_id;
analysis_state__top(&next_state)->done = false;
} else {
continue;
}
// Pop from the stack when this state reached the end of its current syntax node.
while (next_state.depth > 0 && analysis_state__top(&next_state)->done) {
next_state.depth--;
}
// If this hypothetical child did match the current step of the query pattern,
// then advance to the next step at the current depth. This involves skipping
// over any descendant steps of the current child.
const QueryStep *next_step = step;
if (does_match) {
for (;;) {
next_state.step_index++;
next_step = &self->steps.contents[next_state.step_index];
if (
next_step->depth == PATTERN_DONE_MARKER ||
next_step->depth <= parent_depth + 1
) break;
}
}
for (;;) {
// If this state can make further progress, then add it to the states for the next iteration.
// Otherwise, record the fact that matching can fail at this step of the pattern.
if (!next_step->is_dead_end) {
bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
if (did_finish_pattern) can_finish_pattern = true;
if (next_state.depth > 0 && !did_finish_pattern) {
array_insert_sorted_with(&next_states, 0, analysis_state__compare, next_state);
} else {
array_insert_sorted_by(&final_step_indices, 0, , next_state.step_index);
}
}
// If the state has advanced to a step with an alternative step, then add another state at
// that alternative step to the next iteration.
if (
does_match &&
next_step->alternative_index != NONE &&
next_step->alternative_index > next_state.step_index
) {
next_state.step_index = next_step->alternative_index;
next_step = &self->steps.contents[next_state.step_index];
} else {
// Use the subgraph to determine what alias and field will eventually be applied
// to this child node.
TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
TSSymbol visible_symbol = alias
? alias
: self->language->symbol_metadata[sym].visible
? self->language->public_symbol_map[sym]
: 0;
TSFieldId field_id = parent_field_id;
if (!field_id) {
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
for (; field_map != field_map_end; field_map++) {
if (field_map->child_index == child_index) {
field_id = field_map->field_id;
break;
}
}
}
AnalysisState next_state = *state;
analysis_state__top(&next_state)->child_index++;
analysis_state__top(&next_state)->parse_state = successor.state;
if (node->done) analysis_state__top(&next_state)->done = true;
// Determine if this hypothetical child node would match the current step
// of the query pattern.
bool does_match = false;
if (visible_symbol) {
does_match = true;
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
} else if (step->symbol != WILDCARD_SYMBOL) {
if (step->symbol != visible_symbol) does_match = false;
}
if (step->field && step->field != field_id) {
does_match = false;
}
}
// If this is a hidden child, then push a new entry to the stack, in order to
// walk through the children of this child.
else if (sym >= self->language->token_count && next_state.depth < MAX_ANALYSIS_STATE_DEPTH) {
next_state.depth++;
analysis_state__top(&next_state)->parse_state = parse_state;
analysis_state__top(&next_state)->child_index = 0;
analysis_state__top(&next_state)->parent_symbol = sym;
analysis_state__top(&next_state)->field_id = field_id;
analysis_state__top(&next_state)->done = false;
} else {
continue;
}
// Pop from the stack when this state reached the end of its current syntax node.
while (next_state.depth > 0 && analysis_state__top(&next_state)->done) {
next_state.depth--;
}
// If this hypothetical child did match the current step of the query pattern,
// then advance to the next step at the current depth. This involves skipping
// over any descendant steps of the current child.
const QueryStep *next_step = step;
if (does_match) {
for (;;) {
next_state.step_index++;
next_step = &self->steps.contents[next_state.step_index];
if (
next_step->depth == PATTERN_DONE_MARKER ||
next_step->depth <= parent_depth + 1
) break;
}
}
for (;;) {
// If this state can make further progress, then add it to the states for the next iteration.
// Otherwise, record the fact that matching can fail at this step of the pattern.
if (!next_step->is_dead_end) {
bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
if (did_finish_pattern) can_finish_pattern = true;
if (next_state.depth > 0 && !did_finish_pattern) {
array_insert_sorted_with(&next_states, 0, analysis_state__compare, next_state);
} else {
array_insert_sorted_by(&final_step_indices, 0, , next_state.step_index);
}
}
// If the state has advanced to a step with an alternative step, then add another state at
// that alternative step to the next iteration.
if (
does_match &&
next_step->alternative_index != NONE &&
next_step->alternative_index > next_state.step_index
) {
next_state.step_index = next_step->alternative_index;
next_step = &self->steps.contents[next_state.step_index];
} else {
break;
}
}
}
}
}