Don't start with duplicate states in ts_query__analyze_patterns

This change exposes a new `primary_state_ids` field on the `TSLanguage`
struct, and populates it by tracking the first encountered state with a
given `core_id`. (For posterity: the initial change just exposed
`core_id` and deduplicated within `ts_analyze_query`.)

With this `primary_state_ids` field in place, the
`ts_query__analyze_patterns` function only needs to populate its
subgraphs with starting states that are _primary_, since non-primary
states behave identically to primary ones. This leads to large savings
across the board, since most states are not primary.
This commit is contained in:
Alex Pinkus 2022-01-11 21:57:06 -08:00
parent bf210f0c9e
commit eaf9b170f1
6 changed files with 64 additions and 23 deletions

View file

@ -106,6 +106,7 @@ impl Generator {
}
self.add_non_terminal_alias_map();
self.add_primary_state_id_list();
let mut main_lex_table = LexTable::default();
swap(&mut main_lex_table, &mut self.main_lex_table);
@ -565,6 +566,29 @@ impl Generator {
add_line!(self, "");
}
/// Emits the `ts_primary_state_ids` table, mapping every parse state to its "primary state".
///
/// A state's primary state is the earliest state in the parse table that shares its
/// `core_id`; states with the same `core_id` are treated as behaving identically with respect
/// to query analysis. The first state seen with a given `core_id` therefore maps to itself.
fn add_primary_state_id_list(&mut self) {
    add_line!(
        self,
        "static const TSStateId ts_primary_state_ids[STATE_COUNT] = {{"
    );
    indent!(self);

    // Maps each `core_id` to the index of the first state encountered with it.
    let mut primary_by_core_id = HashMap::new();
    for (state_id, parse_state) in self.parse_table.states.iter().enumerate() {
        // Either the previously recorded primary state, or this state if it is the first.
        let primary_id = *primary_by_core_id
            .entry(parse_state.core_id)
            .or_insert(state_id);
        add_line!(self, "[{}] = {},", state_id, primary_id);
    }

    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
fn add_field_sequences(&mut self) {
let mut flat_field_maps = vec![];
let mut next_flat_field_map_index = 0;
@ -1369,6 +1393,7 @@ impl Generator {
if !self.parse_table.production_infos.is_empty() {
add_line!(self, ".alias_sequences = &ts_alias_sequences[0][0],");
}
add_line!(self, ".ts_primary_state_ids = ts_primary_state_ids,");
// Lexing
add_line!(self, ".lex_modes = ts_lex_modes,");

View file

@ -855,5 +855,5 @@ extern "C" {
);
}
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 13;
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 14;
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 13;

View file

@ -21,7 +21,7 @@ extern "C" {
* The Tree-sitter library is generally backwards-compatible with languages
* generated using older CLI versions, but is not forwards-compatible.
*/
#define TREE_SITTER_LANGUAGE_VERSION 13
#define TREE_SITTER_LANGUAGE_VERSION 14
/**
* The earliest ABI version that is supported by the current version of the

View file

@ -110,6 +110,7 @@ struct TSLanguage {
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSStateId *ts_primary_state_ids;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);

View file

@ -200,6 +200,19 @@ static inline TSStateId ts_language_next_state(
}
}
// Reports whether `state` is a "primary state", meaning it is its own entry in the language's
// `ts_primary_state_ids` table. A false result indicates that some other state behaves
// identically to this one with respect to query analysis. For languages whose version is
// below 14 the table is not read, and every state is reported as primary.
static inline bool ts_language_state_is_primary(
  const TSLanguage *self,
  TSStateId state
) {
  if (self->version < 14) return true;
  return self->ts_primary_state_ids[state] == state;
}
static inline const bool *ts_language_enabled_external_tokens(
const TSLanguage *self,
unsigned external_scanner_state

View file

@ -960,28 +960,30 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
if (lookahead_iterator.next_state != state) {
state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
}
const TSSymbol *aliases, *aliases_end;
ts_language_aliases_for_symbol(
self->language,
lookahead_iterator.symbol,
&aliases,
&aliases_end
);
for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
array_search_sorted_by(
&subgraphs,
.symbol,
*symbol,
&subgraph_index,
&exists
if (ts_language_state_is_primary(self->language, state)) {
const TSSymbol *aliases, *aliases_end;
ts_language_aliases_for_symbol(
self->language,
lookahead_iterator.symbol,
&aliases,
&aliases_end
);
if (exists) {
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
if (
subgraph->start_states.size == 0 ||
*array_back(&subgraph->start_states) != state
)
array_push(&subgraph->start_states, state);
for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
array_search_sorted_by(
&subgraphs,
.symbol,
*symbol,
&subgraph_index,
&exists
);
if (exists) {
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
if (
subgraph->start_states.size == 0 ||
*array_back(&subgraph->start_states) != state
)
array_push(&subgraph->start_states, state);
}
}
}
}