Optimize iteration over state successors during query analysis
This commit is contained in:
parent
bd42729a41
commit
aac75e35b1
2 changed files with 267 additions and 147 deletions
|
|
@ -20,6 +20,22 @@ typedef struct {
|
|||
bool is_reusable;
|
||||
} TableEntry;
|
||||
|
||||
typedef struct {
|
||||
const TSLanguage *language;
|
||||
const uint16_t *data;
|
||||
const uint16_t *group_end;
|
||||
TSStateId state;
|
||||
uint16_t table_value;
|
||||
uint16_t section_index;
|
||||
uint16_t group_count;
|
||||
bool is_small_state;
|
||||
|
||||
const TSParseAction *actions;
|
||||
TSSymbol symbol;
|
||||
TSStateId next_state;
|
||||
uint16_t action_count;
|
||||
} LookaheadIterator;
|
||||
|
||||
void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *);
|
||||
|
||||
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
|
||||
|
|
@ -62,6 +78,13 @@ static inline bool ts_language_has_reduce_action(
|
|||
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
|
||||
}
|
||||
|
||||
// Look up the table value for a given symbol and state.
|
||||
//
|
||||
// For non-terminal symbols, the table value represents a successor state.
|
||||
// For terminal symbols, it represents an index in the actions table.
|
||||
// For 'large' parse states, this is a direct lookup. For 'small' parse
|
||||
// states, this requires searching through the symbol groups to find
|
||||
// the given symbol.
|
||||
static inline uint16_t ts_language_lookup(
|
||||
const TSLanguage *self,
|
||||
TSStateId state,
|
||||
|
|
@ -73,8 +96,8 @@ static inline uint16_t ts_language_lookup(
|
|||
) {
|
||||
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
|
||||
const uint16_t *data = &self->small_parse_table[index];
|
||||
uint16_t section_count = *(data++);
|
||||
for (unsigned i = 0; i < section_count; i++) {
|
||||
uint16_t group_count = *(data++);
|
||||
for (unsigned i = 0; i < group_count; i++) {
|
||||
uint16_t section_value = *(data++);
|
||||
uint16_t symbol_count = *(data++);
|
||||
for (unsigned i = 0; i < symbol_count; i++) {
|
||||
|
|
@ -87,6 +110,85 @@ static inline uint16_t ts_language_lookup(
|
|||
}
|
||||
}
|
||||
|
||||
// Iterate over all of the symbols that are valid in the given state.
|
||||
//
|
||||
// For 'large' parse states, this just requires iterating through
|
||||
// all possible symbols and checking the parse table for each one.
|
||||
// For 'small' parse states, this exploits the structure of the
|
||||
// table to only visit the valid symbols.
|
||||
static inline LookaheadIterator ts_language_lookaheads(
|
||||
const TSLanguage *self,
|
||||
TSStateId state
|
||||
) {
|
||||
bool is_small_state =
|
||||
self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
|
||||
state >= self->large_state_count;
|
||||
const uint16_t *data;
|
||||
const uint16_t *group_end = NULL;
|
||||
uint16_t group_count = 0;
|
||||
if (is_small_state) {
|
||||
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
|
||||
data = &self->small_parse_table[index];
|
||||
group_end = data + 1;
|
||||
group_count = *data;
|
||||
} else {
|
||||
data = &self->parse_table[state * self->symbol_count] - 1;
|
||||
}
|
||||
return (LookaheadIterator) {
|
||||
.language = self,
|
||||
.data = data,
|
||||
.group_end = group_end,
|
||||
.group_count = group_count,
|
||||
.is_small_state = is_small_state,
|
||||
.symbol = UINT16_MAX,
|
||||
.next_state = 0,
|
||||
};
|
||||
}
|
||||
|
||||
static inline bool ts_lookahead_iterator_next(LookaheadIterator *self) {
|
||||
// For small parse states, valid symbols are listed explicitly,
|
||||
// grouped by their value. There's no need to look up the actions
|
||||
// again until moving to the next group.
|
||||
if (self->is_small_state) {
|
||||
self->data++;
|
||||
if (self->data == self->group_end) {
|
||||
if (self->group_count == 0) return false;
|
||||
self->group_count--;
|
||||
self->table_value = *(self->data++);
|
||||
unsigned symbol_count = *(self->data++);
|
||||
self->group_end = self->data + symbol_count;
|
||||
self->symbol = *self->data;
|
||||
} else {
|
||||
self->symbol = *self->data;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// For large parse states, iterate through every symbol until one
|
||||
// is found that has valid actions.
|
||||
else {
|
||||
do {
|
||||
self->data++;
|
||||
self->symbol++;
|
||||
if (self->symbol >= self->language->symbol_count) return false;
|
||||
self->table_value = *self->data;
|
||||
} while (!self->table_value);
|
||||
}
|
||||
|
||||
// Depending on if the symbols is terminal or non-terminal, the table value either
|
||||
// represents a list of actions or a successor state.
|
||||
if (self->symbol < self->language->token_count) {
|
||||
const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value];
|
||||
self->action_count = entry->entry.count;
|
||||
self->actions = (const TSParseAction *)(entry + 1);
|
||||
self->next_state = 0;
|
||||
} else {
|
||||
self->action_count = 0;
|
||||
self->next_state = self->table_value;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline TSStateId ts_language_next_state(
|
||||
const TSLanguage *self,
|
||||
TSStateId state,
|
||||
|
|
|
|||
308
lib/src/query.c
308
lib/src/query.c
|
|
@ -599,7 +599,7 @@ static inline int analysis_state__compare(
|
|||
if (self->stack[i].parse_state > other->stack[i].parse_state) return 1;
|
||||
if (self->stack[i].field_id < other->stack[i].field_id) return -1;
|
||||
if (self->stack[i].field_id > other->stack[i].field_id) return 1;
|
||||
}
|
||||
}
|
||||
if (self->step_index < other->step_index) return -1;
|
||||
if (self->step_index > other->step_index) return 1;
|
||||
return 0;
|
||||
|
|
@ -769,47 +769,44 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
|
|||
// 3) A list of predecessor states for each state.
|
||||
StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language);
|
||||
for (TSStateId state = 1; state < self->language->state_count; state++) {
|
||||
unsigned subgraph_index = 0, exists;
|
||||
for (TSSymbol sym = 0; sym < self->language->token_count; sym++) {
|
||||
unsigned count;
|
||||
const TSParseAction *actions = ts_language_actions(self->language, state, sym, &count);
|
||||
for (unsigned i = 0; i < count; i++) {
|
||||
const TSParseAction *action = &actions[i];
|
||||
if (action->type == TSParseActionTypeReduce) {
|
||||
TSSymbol symbol = self->language->public_symbol_map[action->params.reduce.symbol];
|
||||
array_search_sorted_by(
|
||||
&subgraphs,
|
||||
0,
|
||||
.symbol,
|
||||
symbol,
|
||||
&subgraph_index,
|
||||
&exists
|
||||
);
|
||||
if (exists) {
|
||||
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
|
||||
if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
|
||||
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
|
||||
.state = state,
|
||||
.production_id = action->params.reduce.production_id,
|
||||
.child_index = action->params.reduce.child_count,
|
||||
.done = true,
|
||||
}));
|
||||
unsigned subgraph_index, exists;
|
||||
LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state);
|
||||
while (ts_lookahead_iterator_next(&lookahead_iterator)) {
|
||||
if (lookahead_iterator.action_count) {
|
||||
for (unsigned i = 0; i < lookahead_iterator.action_count; i++) {
|
||||
const TSParseAction *action = &lookahead_iterator.actions[i];
|
||||
if (action->type == TSParseActionTypeReduce) {
|
||||
TSSymbol symbol = self->language->public_symbol_map[action->params.reduce.symbol];
|
||||
array_search_sorted_by(
|
||||
&subgraphs,
|
||||
0,
|
||||
.symbol,
|
||||
symbol,
|
||||
&subgraph_index,
|
||||
&exists
|
||||
);
|
||||
if (exists) {
|
||||
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
|
||||
if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
|
||||
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
|
||||
.state = state,
|
||||
.production_id = action->params.reduce.production_id,
|
||||
.child_index = action->params.reduce.child_count,
|
||||
.done = true,
|
||||
}));
|
||||
}
|
||||
}
|
||||
} else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
|
||||
TSStateId next_state = action->params.shift.state;
|
||||
state_predecessor_map_add(&predecessor_map, next_state, state);
|
||||
}
|
||||
} else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
|
||||
TSStateId next_state = action->params.shift.state;
|
||||
state_predecessor_map_add(&predecessor_map, next_state, state);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) {
|
||||
TSStateId next_state = ts_language_next_state(self->language, state, sym);
|
||||
if (next_state != 0 && next_state != state) {
|
||||
state_predecessor_map_add(&predecessor_map, next_state, state);
|
||||
TSSymbol symbol = self->language->public_symbol_map[sym];
|
||||
} else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != state) {
|
||||
state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
|
||||
TSSymbol symbol = self->language->public_symbol_map[lookahead_iterator.symbol];
|
||||
array_search_sorted_by(
|
||||
&subgraphs,
|
||||
subgraph_index,
|
||||
0,
|
||||
.symbol,
|
||||
symbol,
|
||||
&subgraph_index,
|
||||
|
|
@ -871,6 +868,12 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
|
|||
for (unsigned i = 0; i < subgraphs.size; i++) {
|
||||
AnalysisSubgraph *subgraph = &subgraphs.contents[i];
|
||||
printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol));
|
||||
for (unsigned j = 0; j < subgraph->start_states.size; j++) {
|
||||
printf(
|
||||
" {state: %u}\n",
|
||||
subgraph->start_states.contents[j]
|
||||
);
|
||||
}
|
||||
for (unsigned j = 0; j < subgraph->nodes.size; j++) {
|
||||
AnalysisSubgraphNode *node = &subgraph->nodes.contents[j];
|
||||
printf(
|
||||
|
|
@ -985,122 +988,137 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index
|
|||
|
||||
// Follow every possible path in the parse table, but only visit states that
|
||||
// are part of the subgraph for the current symbol.
|
||||
for (TSSymbol sym = 0; sym < self->language->symbol_count; sym++) {
|
||||
LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state);
|
||||
while (ts_lookahead_iterator_next(&lookahead_iterator)) {
|
||||
TSSymbol sym = lookahead_iterator.symbol;
|
||||
|
||||
TSStateId next_parse_state;
|
||||
if (lookahead_iterator.action_count) {
|
||||
const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
|
||||
if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
|
||||
next_parse_state = action->params.shift.state;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
} else if (lookahead_iterator.next_state != 0 && lookahead_iterator.next_state != parse_state) {
|
||||
next_parse_state = lookahead_iterator.next_state;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
AnalysisSubgraphNode successor = {
|
||||
.state = ts_language_next_state(self->language, parse_state, sym),
|
||||
.state = next_parse_state,
|
||||
.child_index = child_index + 1,
|
||||
};
|
||||
if (successor.state && successor.state != parse_state) {
|
||||
unsigned node_index;
|
||||
array_search_sorted_with(
|
||||
&subgraph->nodes, 0,
|
||||
analysis_subgraph_node__compare, &successor,
|
||||
&node_index, &exists
|
||||
);
|
||||
while (node_index < subgraph->nodes.size) {
|
||||
AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
|
||||
if (node->state != successor.state || node->child_index != successor.child_index) break;
|
||||
unsigned node_index;
|
||||
array_search_sorted_with(
|
||||
&subgraph->nodes, 0,
|
||||
analysis_subgraph_node__compare, &successor,
|
||||
&node_index, &exists
|
||||
);
|
||||
while (node_index < subgraph->nodes.size) {
|
||||
AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
|
||||
if (node->state != successor.state || node->child_index != successor.child_index) break;
|
||||
|
||||
// Use the subgraph to determine what alias and field will eventually be applied
|
||||
// to this child node.
|
||||
TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
|
||||
TSSymbol visible_symbol = alias
|
||||
? alias
|
||||
: self->language->symbol_metadata[sym].visible
|
||||
? self->language->public_symbol_map[sym]
|
||||
: 0;
|
||||
TSFieldId field_id = parent_field_id;
|
||||
if (!field_id) {
|
||||
const TSFieldMapEntry *field_map, *field_map_end;
|
||||
ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
|
||||
for (; field_map != field_map_end; field_map++) {
|
||||
if (field_map->child_index == child_index) {
|
||||
field_id = field_map->field_id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AnalysisState next_state = *state;
|
||||
analysis_state__top(&next_state)->child_index++;
|
||||
analysis_state__top(&next_state)->parse_state = successor.state;
|
||||
if (node->done) analysis_state__top(&next_state)->done = true;
|
||||
|
||||
// Determine if this hypothetical child node would match the current step
|
||||
// of the query pattern.
|
||||
bool does_match = false;
|
||||
if (visible_symbol) {
|
||||
does_match = true;
|
||||
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
|
||||
if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
|
||||
} else if (step->symbol != WILDCARD_SYMBOL) {
|
||||
if (step->symbol != visible_symbol) does_match = false;
|
||||
}
|
||||
if (step->field && step->field != field_id) {
|
||||
does_match = false;
|
||||
}
|
||||
}
|
||||
|
||||
// If this is a hidden child, then push a new entry to the stack, in order to
|
||||
// walk through the children of this child.
|
||||
else if (sym >= self->language->token_count && next_state.depth < MAX_ANALYSIS_STATE_DEPTH) {
|
||||
next_state.depth++;
|
||||
analysis_state__top(&next_state)->parse_state = parse_state;
|
||||
analysis_state__top(&next_state)->child_index = 0;
|
||||
analysis_state__top(&next_state)->parent_symbol = sym;
|
||||
analysis_state__top(&next_state)->field_id = field_id;
|
||||
analysis_state__top(&next_state)->done = false;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Pop from the stack when this state reached the end of its current syntax node.
|
||||
while (next_state.depth > 0 && analysis_state__top(&next_state)->done) {
|
||||
next_state.depth--;
|
||||
}
|
||||
|
||||
// If this hypothetical child did match the current step of the query pattern,
|
||||
// then advance to the next step at the current depth. This involves skipping
|
||||
// over any descendant steps of the current child.
|
||||
const QueryStep *next_step = step;
|
||||
if (does_match) {
|
||||
for (;;) {
|
||||
next_state.step_index++;
|
||||
next_step = &self->steps.contents[next_state.step_index];
|
||||
if (
|
||||
next_step->depth == PATTERN_DONE_MARKER ||
|
||||
next_step->depth <= parent_depth + 1
|
||||
) break;
|
||||
}
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
// If this state can make further progress, then add it to the states for the next iteration.
|
||||
// Otherwise, record the fact that matching can fail at this step of the pattern.
|
||||
if (!next_step->is_dead_end) {
|
||||
bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
|
||||
if (did_finish_pattern) can_finish_pattern = true;
|
||||
if (next_state.depth > 0 && !did_finish_pattern) {
|
||||
array_insert_sorted_with(&next_states, 0, analysis_state__compare, next_state);
|
||||
} else {
|
||||
array_insert_sorted_by(&final_step_indices, 0, , next_state.step_index);
|
||||
}
|
||||
}
|
||||
|
||||
// If the state has advanced to a step with an alternative step, then add another state at
|
||||
// that alternative step to the next iteration.
|
||||
if (
|
||||
does_match &&
|
||||
next_step->alternative_index != NONE &&
|
||||
next_step->alternative_index > next_state.step_index
|
||||
) {
|
||||
next_state.step_index = next_step->alternative_index;
|
||||
next_step = &self->steps.contents[next_state.step_index];
|
||||
} else {
|
||||
// Use the subgraph to determine what alias and field will eventually be applied
|
||||
// to this child node.
|
||||
TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
|
||||
TSSymbol visible_symbol = alias
|
||||
? alias
|
||||
: self->language->symbol_metadata[sym].visible
|
||||
? self->language->public_symbol_map[sym]
|
||||
: 0;
|
||||
TSFieldId field_id = parent_field_id;
|
||||
if (!field_id) {
|
||||
const TSFieldMapEntry *field_map, *field_map_end;
|
||||
ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
|
||||
for (; field_map != field_map_end; field_map++) {
|
||||
if (field_map->child_index == child_index) {
|
||||
field_id = field_map->field_id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AnalysisState next_state = *state;
|
||||
analysis_state__top(&next_state)->child_index++;
|
||||
analysis_state__top(&next_state)->parse_state = successor.state;
|
||||
if (node->done) analysis_state__top(&next_state)->done = true;
|
||||
|
||||
// Determine if this hypothetical child node would match the current step
|
||||
// of the query pattern.
|
||||
bool does_match = false;
|
||||
if (visible_symbol) {
|
||||
does_match = true;
|
||||
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
|
||||
if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
|
||||
} else if (step->symbol != WILDCARD_SYMBOL) {
|
||||
if (step->symbol != visible_symbol) does_match = false;
|
||||
}
|
||||
if (step->field && step->field != field_id) {
|
||||
does_match = false;
|
||||
}
|
||||
}
|
||||
|
||||
// If this is a hidden child, then push a new entry to the stack, in order to
|
||||
// walk through the children of this child.
|
||||
else if (sym >= self->language->token_count && next_state.depth < MAX_ANALYSIS_STATE_DEPTH) {
|
||||
next_state.depth++;
|
||||
analysis_state__top(&next_state)->parse_state = parse_state;
|
||||
analysis_state__top(&next_state)->child_index = 0;
|
||||
analysis_state__top(&next_state)->parent_symbol = sym;
|
||||
analysis_state__top(&next_state)->field_id = field_id;
|
||||
analysis_state__top(&next_state)->done = false;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Pop from the stack when this state reached the end of its current syntax node.
|
||||
while (next_state.depth > 0 && analysis_state__top(&next_state)->done) {
|
||||
next_state.depth--;
|
||||
}
|
||||
|
||||
// If this hypothetical child did match the current step of the query pattern,
|
||||
// then advance to the next step at the current depth. This involves skipping
|
||||
// over any descendant steps of the current child.
|
||||
const QueryStep *next_step = step;
|
||||
if (does_match) {
|
||||
for (;;) {
|
||||
next_state.step_index++;
|
||||
next_step = &self->steps.contents[next_state.step_index];
|
||||
if (
|
||||
next_step->depth == PATTERN_DONE_MARKER ||
|
||||
next_step->depth <= parent_depth + 1
|
||||
) break;
|
||||
}
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
// If this state can make further progress, then add it to the states for the next iteration.
|
||||
// Otherwise, record the fact that matching can fail at this step of the pattern.
|
||||
if (!next_step->is_dead_end) {
|
||||
bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
|
||||
if (did_finish_pattern) can_finish_pattern = true;
|
||||
if (next_state.depth > 0 && !did_finish_pattern) {
|
||||
array_insert_sorted_with(&next_states, 0, analysis_state__compare, next_state);
|
||||
} else {
|
||||
array_insert_sorted_by(&final_step_indices, 0, , next_state.step_index);
|
||||
}
|
||||
}
|
||||
|
||||
// If the state has advanced to a step with an alternative step, then add another state at
|
||||
// that alternative step to the next iteration.
|
||||
if (
|
||||
does_match &&
|
||||
next_step->alternative_index != NONE &&
|
||||
next_step->alternative_index > next_state.step_index
|
||||
) {
|
||||
next_state.step_index = next_step->alternative_index;
|
||||
next_step = &self->steps.contents[next_state.step_index];
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue