fix: Avoid generating unused character set constants

This commit is contained in:
Max Brunsfeld 2024-04-14 09:56:44 -07:00
parent a7a47d561d
commit 295344b142

View file

@ -64,7 +64,7 @@ struct Generator {
main_lex_table: LexTable,
keyword_lex_table: LexTable,
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
large_character_set_constant_names: Vec<String>,
large_character_set_info: Vec<LargeCharacterSetInfo>,
large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
@ -81,6 +81,11 @@ struct Generator {
abi_version: usize,
}
struct LargeCharacterSetInfo {
constant_name: String,
is_used: bool,
}
impl Generator {
fn generate(mut self) -> String {
self.init();
@ -108,11 +113,7 @@ impl Generator {
self.add_primary_state_id_list();
}
// Generate a helper function for each large character set.
// let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
for ix in 0..self.large_character_sets.len() {
self.add_character_set(ix);
}
let buffer_offset_before_lex_functions = self.buffer.len();
let mut main_lex_table = LexTable::default();
swap(&mut main_lex_table, &mut self.main_lex_table);
@ -124,6 +125,16 @@ impl Generator {
self.add_lex_function("ts_lex_keywords", keyword_lex_table);
}
// Once the lex functions are generated, and we've determined which large
// character sets are actually used, we can generate the large character set
// constants. Insert them into the output buffer before the lex functions.
let lex_functions = self.buffer[buffer_offset_before_lex_functions..].to_string();
self.buffer.truncate(buffer_offset_before_lex_functions);
for ix in 0..self.large_character_sets.len() {
self.add_character_set(ix);
}
self.buffer.push_str(&lex_functions);
self.add_lex_modes_list();
self.add_parse_table();
@ -238,6 +249,23 @@ impl Generator {
}
}
for (ix, (symbol, _)) in self.large_character_sets.iter().enumerate() {
let count = self.large_character_sets[0..ix]
.iter()
.filter(|(sym, _)| sym == symbol)
.count()
+ 1;
let constant_name = if let Some(symbol) = symbol {
format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
} else {
format!("extras_character_set_{}", count)
};
self.large_character_set_info.push(LargeCharacterSetInfo {
constant_name,
is_used: false,
});
}
// Determine which states should use the "small state" representation, and which should
// use the normal array representation.
let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2);
@ -829,10 +857,12 @@ impl Generator {
add!(self, "(!eof && ")
}
let char_set_info = &mut self.large_character_set_info[large_char_set_ix];
char_set_info.is_used = true;
add!(
self,
"set_contains({}, {}, lookahead)",
&self.large_character_set_constant_names[large_char_set_ix],
&char_set_info.constant_name,
large_set.range_count(),
);
if check_eof {
@ -940,20 +970,17 @@ impl Generator {
}
fn add_character_set(&mut self, ix: usize) {
let (symbol, characters) = self.large_character_sets[ix].clone();
let count = self.large_character_sets[0..ix]
.iter()
.filter(|(sym, _)| *sym == symbol)
.count()
+ 1;
let characters = self.large_character_sets[ix].1.clone();
let info = &self.large_character_set_info[ix];
if !info.is_used {
return;
}
let constant_name = if let Some(symbol) = symbol {
format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
} else {
format!("extras_character_set_{}", count)
};
add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
self.large_character_set_constant_names.push(constant_name);
add_line!(
self,
"static TSCharacterRange {}[] = {{",
info.constant_name
);
indent!(self);
for (ix, range) in characters.ranges().enumerate() {
@ -1694,7 +1721,7 @@ pub fn render_c_code(
keyword_lex_table: tables.keyword_lex_table,
keyword_capture_token: tables.word_token,
large_character_sets: tables.large_character_sets,
large_character_set_constant_names: Vec::new(),
large_character_set_info: Vec::new(),
syntax_grammar,
lexical_grammar,
default_aliases,