Don't start with duplicate states in ts_query__analyze_patterns
This change exposes a new `ts_primary_state_ids` field on the `TSLanguage` struct and populates it by tracking the first encountered state with a given `core_id`. (For posterity: the initial change just exposed `core_id` and deduplicated within `ts_query__analyze_patterns`.) With this field in place, the `ts_query__analyze_patterns` function only needs to populate its subgraphs with starting states that are _primary_, since non-primary states behave identically to primary ones with respect to query analysis. This leads to large savings across the board, since most states are not primary.
This commit is contained in:
parent
bf210f0c9e
commit
eaf9b170f1
6 changed files with 64 additions and 23 deletions
|
|
@ -106,6 +106,7 @@ impl Generator {
|
|||
}
|
||||
|
||||
self.add_non_terminal_alias_map();
|
||||
self.add_primary_state_id_list();
|
||||
|
||||
let mut main_lex_table = LexTable::default();
|
||||
swap(&mut main_lex_table, &mut self.main_lex_table);
|
||||
|
|
@ -565,6 +566,29 @@ impl Generator {
|
|||
add_line!(self, "");
|
||||
}
|
||||
|
||||
/// Produces a list of the "primary state" for every state in the grammar.
|
||||
///
|
||||
/// The "primary state" for a given state is the first encountered state that behaves
|
||||
/// identically with respect to query analysis. We derive this by keeping track of the `core_id`
|
||||
/// for each state and treating the first state with a given `core_id` as primary.
|
||||
fn add_primary_state_id_list(&mut self) {
|
||||
add_line!(
|
||||
self,
|
||||
"static const TSStateId ts_primary_state_ids[STATE_COUNT] = {{"
|
||||
);
|
||||
indent!(self);
|
||||
let mut first_state_for_each_core_id = HashMap::new();
|
||||
for (idx, state) in self.parse_table.states.iter().enumerate() {
|
||||
let primary_state = first_state_for_each_core_id
|
||||
.entry(state.core_id)
|
||||
.or_insert(idx);
|
||||
add_line!(self, "[{}] = {},", idx, primary_state);
|
||||
}
|
||||
dedent!(self);
|
||||
add_line!(self, "}};");
|
||||
add_line!(self, "");
|
||||
}
|
||||
|
||||
fn add_field_sequences(&mut self) {
|
||||
let mut flat_field_maps = vec![];
|
||||
let mut next_flat_field_map_index = 0;
|
||||
|
|
@ -1369,6 +1393,7 @@ impl Generator {
|
|||
if !self.parse_table.production_infos.is_empty() {
|
||||
add_line!(self, ".alias_sequences = &ts_alias_sequences[0][0],");
|
||||
}
|
||||
add_line!(self, ".ts_primary_state_ids = ts_primary_state_ids,");
|
||||
|
||||
// Lexing
|
||||
add_line!(self, ".lex_modes = ts_lex_modes,");
|
||||
|
|
|
|||
|
|
@ -855,5 +855,5 @@ extern "C" {
|
|||
);
|
||||
}
|
||||
|
||||
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 13;
|
||||
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 14;
|
||||
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 13;
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ extern "C" {
|
|||
* The Tree-sitter library is generally backwards-compatible with languages
|
||||
* generated using older CLI versions, but is not forwards-compatible.
|
||||
*/
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 13
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 14
|
||||
|
||||
/**
|
||||
* The earliest ABI version that is supported by the current version of the
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ struct TSLanguage {
|
|||
const TSSymbol *public_symbol_map;
|
||||
const uint16_t *alias_map;
|
||||
const TSSymbol *alias_sequences;
|
||||
const TSStateId *ts_primary_state_ids;
|
||||
const TSLexMode *lex_modes;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
|
||||
|
|
|
|||
|
|
@ -200,6 +200,19 @@ static inline TSStateId ts_language_next_state(
|
|||
}
|
||||
}
|
||||
|
||||
// Whether the state is a "primary state". If this returns false, it indicates that there exists
|
||||
// another state that behaves identically to this one with respect to query analysis.
|
||||
static inline bool ts_language_state_is_primary(
|
||||
const TSLanguage *self,
|
||||
TSStateId state
|
||||
) {
|
||||
if (self->version >= 14) {
|
||||
return state == self->ts_primary_state_ids[state];
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static inline const bool *ts_language_enabled_external_tokens(
|
||||
const TSLanguage *self,
|
||||
unsigned external_scanner_state
|
||||
|
|
|
|||
|
|
@ -960,28 +960,30 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
|
|||
if (lookahead_iterator.next_state != state) {
|
||||
state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
|
||||
}
|
||||
const TSSymbol *aliases, *aliases_end;
|
||||
ts_language_aliases_for_symbol(
|
||||
self->language,
|
||||
lookahead_iterator.symbol,
|
||||
&aliases,
|
||||
&aliases_end
|
||||
);
|
||||
for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
|
||||
array_search_sorted_by(
|
||||
&subgraphs,
|
||||
.symbol,
|
||||
*symbol,
|
||||
&subgraph_index,
|
||||
&exists
|
||||
if (ts_language_state_is_primary(self->language, state)) {
|
||||
const TSSymbol *aliases, *aliases_end;
|
||||
ts_language_aliases_for_symbol(
|
||||
self->language,
|
||||
lookahead_iterator.symbol,
|
||||
&aliases,
|
||||
&aliases_end
|
||||
);
|
||||
if (exists) {
|
||||
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
|
||||
if (
|
||||
subgraph->start_states.size == 0 ||
|
||||
*array_back(&subgraph->start_states) != state
|
||||
)
|
||||
array_push(&subgraph->start_states, state);
|
||||
for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
|
||||
array_search_sorted_by(
|
||||
&subgraphs,
|
||||
.symbol,
|
||||
*symbol,
|
||||
&subgraph_index,
|
||||
&exists
|
||||
);
|
||||
if (exists) {
|
||||
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
|
||||
if (
|
||||
subgraph->start_states.size == 0 ||
|
||||
*array_back(&subgraph->start_states) != state
|
||||
)
|
||||
array_push(&subgraph->start_states, state);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue