// tree-sitter/crates/generate/src/build_tables.rs

mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod minimize_parse_table;
mod token_conflicts;

use std::collections::{BTreeSet, HashMap};

pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT;
use build_parse_table::BuildTableResult;
pub use build_parse_table::ParseTableBuilderError;
use log::{debug, info};

use self::{
    build_lex_table::build_lex_table,
    build_parse_table::{build_parse_table, ParseStateInfo},
    coincident_tokens::CoincidentTokenIndex,
    item_set_builder::ParseItemSetBuilder,
    minimize_parse_table::minimize_parse_table,
    token_conflicts::TokenConflictMap,
};
use crate::{
    grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
    nfa::{CharacterSet, NfaCursor},
    node_types::VariableInfo,
    rules::{AliasMap, Symbol, SymbolType, TokenSet},
    tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
    OptLevel,
};
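
/// The complete set of tables produced for a grammar: the parse table, the
/// main and keyword lex tables, and the large character sets that are stored
/// separately from the lex tables themselves.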
pub struct Tables {
    pub parse_table: ParseTable,
    pub main_lex_table: LexTable,
    pub keyword_lex_table: LexTable,
    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}
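
/// Build the parse table and lex tables for a grammar.
///
/// This is the entry point for table generation: it builds the parse table,
/// identifies keywords, populates the error-recovery state, minimizes the
/// parse table, builds the main and keyword lex tables, assigns external lex
/// states, and marks fragile tokens. It fails if parse-table construction
/// reports an error, or if the table would need more states than fit in a
/// `u16`.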
pub fn build_tables(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    simple_aliases: &AliasMap,
    variable_info: &[VariableInfo],
    inlines: &InlinedProductionMap,
    report_symbol_name: Option<&str>,
    optimizations: OptLevel,
) -> BuildTableResult<Tables> {
    let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
    let following_tokens =
        get_following_tokens(syntax_grammar, lexical_grammar, inlines, &item_set_builder);
    let (mut parse_table, parse_state_info) = build_parse_table(
        syntax_grammar,
        lexical_grammar,
        item_set_builder,
        variable_info,
    )?;
    let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
    let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
    let keywords = identify_keywords(
        lexical_grammar,
        &parse_table,
        syntax_grammar.word_token,
        &token_conflict_map,
        &coincident_token_index,
    );
    populate_error_state(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &coincident_token_index,
        &token_conflict_map,
        &keywords,
    );
    populate_used_symbols(&mut parse_table, syntax_grammar, lexical_grammar);
    minimize_parse_table(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        simple_aliases,
        &token_conflict_map,
        &keywords,
        optimizations,
    );
    let lex_tables = build_lex_table(
        &mut parse_table,
        syntax_grammar,
        lexical_grammar,
        &keywords,
        &coincident_token_index,
        &token_conflict_map,
    );
    populate_external_lex_states(&mut parse_table, syntax_grammar);
    mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map);

    if let Some(report_symbol_name) = report_symbol_name {
        report_state_info(
            syntax_grammar,
            lexical_grammar,
            &parse_table,
            &parse_state_info,
            report_symbol_name,
        );
    }

    if parse_table.states.len() > u16::MAX as usize {
        Err(ParseTableBuilderError::StateCount(parse_table.states.len()))?;
    }

    Ok(Tables {
        parse_table,
        main_lex_table: lex_tables.main_lex_table,
        keyword_lex_table: lex_tables.keyword_lex_table,
        large_character_sets: lex_tables.large_character_sets,
    })
}
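
/// For every token in the lexical grammar, compute the set of tokens that can
/// immediately follow it in some production. Tokens marked as `extra` can
/// follow any token, and any token can follow them.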
fn get_following_tokens(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    inlines: &InlinedProductionMap,
    builder: &ParseItemSetBuilder<'_>,
) -> Vec<TokenSet> {
    let mut result = vec![TokenSet::new(); lexical_grammar.variables.len()];
    let productions = syntax_grammar
        .variables
        .iter()
        .flat_map(|v| &v.productions)
        .chain(&inlines.productions);
    let all_tokens = (0..result.len())
        .map(Symbol::terminal)
        .collect::<TokenSet>();
    for production in productions {
        for i in 1..production.steps.len() {
            let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
            let right_tokens = builder.first_set(&production.steps[i].symbol);
            let right_reserved_tokens = builder.reserved_first_set(&production.steps[i].symbol);
            for left_token in left_tokens.iter() {
                if left_token.is_terminal() {
                    result[left_token.index].insert_all_terminals(right_tokens);
                    if let Some(reserved_tokens) = right_reserved_tokens {
                        result[left_token.index].insert_all_terminals(reserved_tokens);
                    }
                }
            }
        }
    }

    // Tokens marked as `extra` can appear anywhere, so they can follow any
    // token, and any token can follow them.
    for extra in &syntax_grammar.extra_symbols {
        if extra.is_terminal() {
            for entry in &mut result {
                entry.insert(*extra);
            }
            result[extra.index] = all_tokens.clone();
        }
    }
    result
}
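
/// Populate the parse table's state 0, which is used for error recovery.
/// Error recovery should be able to skip over most tokens, so this state
/// accepts every token that does not lexically conflict with one of the
/// unambiguous, conflict-free tokens, plus all independent external tokens
/// and the end-of-input symbol.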
fn populate_error_state(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    coincident_token_index: &CoincidentTokenIndex<'_>,
    token_conflict_map: &TokenConflictMap<'_>,
    keywords: &TokenSet,
) {
    let state = &mut parse_table.states[0];
    let n = lexical_grammar.variables.len();

    // First identify the *conflict-free tokens*: tokens that do not overlap with
    // any other token in any way, besides matching exactly the same string.
    let conflict_free_tokens = (0..n)
        .filter_map(|i| {
            let conflicts_with_other_tokens = (0..n).any(|j| {
                j != i
                    && !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
                    && token_conflict_map.does_match_shorter_or_longer(i, j)
            });
            if conflicts_with_other_tokens {
                None
            } else {
                debug!(
                    "error recovery - token {} has no conflicts",
                    lexical_grammar.variables[i].name
                );
                Some(Symbol::terminal(i))
            }
        })
        .collect::<TokenSet>();

    let recover_entry = ParseTableEntry {
        reusable: false,
        actions: vec![ParseAction::Recover],
    };

    // Exclude from the error-recovery state any token that conflicts with one of
    // the *conflict-free tokens* identified above.
    for i in 0..n {
        let symbol = Symbol::terminal(i);
        if !conflict_free_tokens.contains(&symbol)
            && !keywords.contains(&symbol)
            && syntax_grammar.word_token != Some(symbol)
        {
            if let Some(t) = conflict_free_tokens.iter().find(|t| {
                !coincident_token_index.contains(symbol, *t)
                    && token_conflict_map.does_conflict(symbol.index, t.index)
            }) {
                debug!(
                    "error recovery - exclude token {} because of conflict with {}",
                    lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
                );
                continue;
            }
        }
        debug!(
            "error recovery - include token {}",
            lexical_grammar.variables[i].name
        );
        state
            .terminal_entries
            .entry(symbol)
            .or_insert_with(|| recover_entry.clone());
    }

    // External tokens that have no corresponding internal token are also
    // valid during error recovery, as is the end-of-input symbol.
    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
        if external_token.corresponding_internal_token.is_none() {
            state
                .terminal_entries
                .entry(Symbol::external(i))
                .or_insert_with(|| recover_entry.clone());
        }
    }
    state.terminal_entries.insert(Symbol::end(), recover_entry);
}
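
/// Record which terminal, external, and non-terminal symbols are actually
/// used by the parse table, filling `parse_table.symbols` in the order:
/// end-of-input, then terminals, then externals, then non-terminals.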
fn populate_used_symbols(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
) {
    let mut terminal_usages = vec![false; lexical_grammar.variables.len()];
    let mut non_terminal_usages = vec![false; syntax_grammar.variables.len()];
    let mut external_usages = vec![false; syntax_grammar.external_tokens.len()];
    for state in &parse_table.states {
        for symbol in state.terminal_entries.keys() {
            match symbol.kind {
                SymbolType::Terminal => terminal_usages[symbol.index] = true,
                SymbolType::External => external_usages[symbol.index] = true,
                _ => {}
            }
        }
        for symbol in state.nonterminal_entries.keys() {
            non_terminal_usages[symbol.index] = true;
        }
    }
    parse_table.symbols.push(Symbol::end());
    for (i, value) in terminal_usages.into_iter().enumerate() {
        if value {
            // Assign the grammar's word token a low numerical index. This ensures that
            // it can be stored in a subtree with no heap allocations, even for grammars with
            // very large numbers of tokens. This is an optimization, but it's also important to
            // ensure that a subtree's symbol can be successfully reassigned to the word token
            // without having to move the subtree to the heap.
            // See https://github.com/tree-sitter/tree-sitter/issues/258
            if syntax_grammar.word_token.is_some_and(|t| t.index == i) {
                parse_table.symbols.insert(1, Symbol::terminal(i));
            } else {
                parse_table.symbols.push(Symbol::terminal(i));
            }
        }
    }
    for (i, value) in external_usages.into_iter().enumerate() {
        if value {
            parse_table.symbols.push(Symbol::external(i));
        }
    }
    for (i, value) in non_terminal_usages.into_iter().enumerate() {
        if value {
            parse_table.symbols.push(Symbol::non_terminal(i));
        }
    }
}
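
/// Assign each parse state an external lex state id that identifies the set
/// of external tokens valid in that state. Id 0 is reserved for the empty
/// set; states with identical sets of valid external tokens share an id.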
fn populate_external_lex_states(parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar) {
    let mut external_tokens_by_corresponding_internal_token = HashMap::new();
    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
        if let Some(symbol) = external_token.corresponding_internal_token {
            external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
        }
    }

    // Ensure that external lex state 0 represents the absence of any
    // external tokens.
    parse_table.external_lex_states.push(TokenSet::new());
    for i in 0..parse_table.states.len() {
        let mut external_tokens = TokenSet::new();
        for token in parse_table.states[i].terminal_entries.keys() {
            if token.is_external() {
                external_tokens.insert(*token);
            } else if token.is_terminal() {
                if let Some(index) =
                    external_tokens_by_corresponding_internal_token.get(&token.index)
                {
                    external_tokens.insert(Symbol::external(*index));
                }
            }
        }
        parse_table.states[i].external_lex_state_id = parse_table
            .external_lex_states
            .iter()
            .position(|tokens| *tokens == external_tokens)
            .unwrap_or_else(|| {
                parse_table.external_lex_states.push(external_tokens);
                parse_table.external_lex_states.len() - 1
            });
    }
}
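
/// Identify tokens that can be treated as keywords: word-like tokens that
/// match only strings the grammar's word token also matches, and for which
/// substituting the word token never introduces new lexical conflicts.
/// Returns an empty set if the grammar has no word token.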
fn identify_keywords(
    lexical_grammar: &LexicalGrammar,
    parse_table: &ParseTable,
    word_token: Option<Symbol>,
    token_conflict_map: &TokenConflictMap<'_>,
    coincident_token_index: &CoincidentTokenIndex<'_>,
) -> TokenSet {
    let Some(word_token) = word_token else {
        return TokenSet::new();
    };

    let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());

    // First find all of the candidate keyword tokens: tokens that start with
    // letters or underscore and can match the same string as a word token.
    let keyword_candidates = lexical_grammar
        .variables
        .iter()
        .enumerate()
        .filter_map(|(i, variable)| {
            cursor.reset(vec![variable.start_state]);
            if all_chars_are_alphabetical(&cursor)
                && token_conflict_map.does_match_same_string(i, word_token.index)
                && !token_conflict_map.does_match_different_string(i, word_token.index)
            {
                debug!(
                    "Keywords - add candidate {}",
                    lexical_grammar.variables[i].name
                );
                Some(Symbol::terminal(i))
            } else {
                None
            }
        })
        .collect::<TokenSet>();

    // Exclude keyword candidates that shadow another keyword candidate.
    let keywords = keyword_candidates
        .iter()
        .filter(|token| {
            for other_token in keyword_candidates.iter() {
                if other_token != *token
                    && token_conflict_map.does_match_same_string(other_token.index, token.index)
                {
                    debug!(
                        "Keywords - exclude {} because it matches the same string as {}",
                        lexical_grammar.variables[token.index].name,
                        lexical_grammar.variables[other_token.index].name
                    );
                    return false;
                }
            }
            true
        })
        .collect::<TokenSet>();

    // Exclude keyword candidates for which substituting the keyword capture
    // token would introduce new lexical conflicts with other tokens.
    keywords
        .iter()
        .filter(|token| {
            for other_index in 0..lexical_grammar.variables.len() {
                if keyword_candidates.contains(&Symbol::terminal(other_index)) {
                    continue;
                }

                // If the word token was already valid in every state containing
                // this keyword candidate, then substituting the word token won't
                // introduce any new lexical conflicts.
                if coincident_token_index
                    .states_with(*token, Symbol::terminal(other_index))
                    .iter()
                    .all(|state_id| {
                        parse_table.states[*state_id]
                            .terminal_entries
                            .contains_key(&word_token)
                    })
                {
                    continue;
                }

                if !token_conflict_map.has_same_conflict_status(
                    token.index,
                    word_token.index,
                    other_index,
                ) {
                    debug!(
                        "Keywords - exclude {} because of conflict with {}",
                        lexical_grammar.variables[token.index].name,
                        lexical_grammar.variables[other_index].name
                    );
                    return false;
                }
            }
            debug!(
                "Keywords - include {}",
                lexical_grammar.variables[token.index].name,
            );
            true
        })
        .collect()
}
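
/// Mark parse table entries as non-reusable ("fragile") when their token
/// overlaps with another token that is valid in the same state, since the
/// result of scanning such a token can depend on which other tokens were
/// valid at the time.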
fn mark_fragile_tokens(
    parse_table: &mut ParseTable,
    lexical_grammar: &LexicalGrammar,
    token_conflict_map: &TokenConflictMap<'_>,
) {
    let n = lexical_grammar.variables.len();
    let mut valid_tokens_mask = Vec::with_capacity(n);
    for state in &mut parse_table.states {
        valid_tokens_mask.clear();
        valid_tokens_mask.resize(n, false);
        for token in state.terminal_entries.keys() {
            if token.is_terminal() {
                valid_tokens_mask[token.index] = true;
            }
        }
        for (token, entry) in &mut state.terminal_entries {
            if token.is_terminal() {
                for (i, is_valid) in valid_tokens_mask.iter().enumerate() {
                    if *is_valid && token_conflict_map.does_overlap(i, token.index) {
                        entry.reusable = false;
                        break;
                    }
                }
            }
        }
    }
}
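
/// Log a summary of the parse table: the number of states associated with
/// each non-terminal, followed by the symbol sequence and item set of every
/// state associated with `report_symbol_name` (or of all states, if the
/// reported name is `"*"`).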
fn report_state_info<'a>(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    parse_table: &ParseTable,
    parse_state_info: &[ParseStateInfo<'a>],
    report_symbol_name: &'a str,
) {
    let mut all_state_indices = BTreeSet::new();
    let mut symbols_with_state_indices = (0..syntax_grammar.variables.len())
        .map(|i| (Symbol::non_terminal(i), BTreeSet::new()))
        .collect::<Vec<_>>();

    for (i, state) in parse_table.states.iter().enumerate() {
        all_state_indices.insert(i);
        let item_set = &parse_state_info[state.id];
        for entry in &item_set.1.entries {
            if !entry.item.is_augmented() {
                symbols_with_state_indices[entry.item.variable_index as usize]
                    .1
                    .insert(i);
            }
        }
    }
    symbols_with_state_indices.sort_unstable_by_key(|(_, states)| -(states.len() as i32));

    let max_symbol_name_length = syntax_grammar
        .variables
        .iter()
        .map(|v| v.name.len())
        .max()
        .unwrap();
    for (symbol, states) in &symbols_with_state_indices {
        info!(
            "{:width$}\t{}",
            syntax_grammar.variables[symbol.index].name,
            states.len(),
            width = max_symbol_name_length
        );
    }
    info!("");

    let state_indices = if report_symbol_name == "*" {
        Some(&all_state_indices)
    } else {
        symbols_with_state_indices
            .iter()
            .find_map(|(symbol, state_indices)| {
                if syntax_grammar.variables[symbol.index].name == report_symbol_name {
                    Some(state_indices)
                } else {
                    None
                }
            })
    };

    if let Some(state_indices) = state_indices {
        let mut state_indices = state_indices.iter().copied().collect::<Vec<_>>();
        state_indices.sort_unstable_by_key(|i| (parse_table.states[*i].core_id, *i));

        for state_index in state_indices {
            let id = parse_table.states[state_index].id;
            let (preceding_symbols, item_set) = &parse_state_info[id];
            info!("state index: {state_index}");
            info!("state id: {id}");
            info!(
                "symbol sequence: {}",
                preceding_symbols
                    .iter()
                    .map(|symbol| {
                        if symbol.is_terminal() {
                            lexical_grammar.variables[symbol.index].name.clone()
                        } else if symbol.is_external() {
                            syntax_grammar.external_tokens[symbol.index].name.clone()
                        } else {
                            syntax_grammar.variables[symbol.index].name.clone()
                        }
                    })
                    .collect::<Vec<_>>()
                    .join(" ")
            );
            info!(
                "\nitems:\n{}",
                item::ParseItemSetDisplay(item_set, syntax_grammar, lexical_grammar),
            );
        }
    }
}
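
/// Check whether every non-separator character on a transition out of the
/// cursor's current states is alphabetic or an underscore, i.e. whether the
/// token could plausibly be a keyword.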
fn all_chars_are_alphabetical(cursor: &NfaCursor<'_>) -> bool {
    cursor.transition_chars().all(|(chars, is_sep)| {
        if is_sep {
            true
        } else {
            chars.chars().all(|c| c.is_alphabetic() || c == '_')
        }
    })
}