From 295344b142a101284ea13d733f7d87f66cf0ebbd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 14 Apr 2024 09:56:44 -0700 Subject: [PATCH] fix: Avoid generating unused character set constants --- cli/src/generate/render.rs | 69 ++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 3401f63c..12e0c457 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -64,7 +64,7 @@ struct Generator { main_lex_table: LexTable, keyword_lex_table: LexTable, large_character_sets: Vec<(Option, CharacterSet)>, - large_character_set_constant_names: Vec, + large_character_set_info: Vec, large_state_count: usize, keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, @@ -81,6 +81,11 @@ struct Generator { abi_version: usize, } +struct LargeCharacterSetInfo { + constant_name: String, + is_used: bool, +} + impl Generator { fn generate(mut self) -> String { self.init(); @@ -108,11 +113,7 @@ impl Generator { self.add_primary_state_id_list(); } - // Generate a helper function for each large character set. - // let mut sorted_large_char_sets = self.large_character_sets.iter().collect::>(); - for ix in 0..self.large_character_sets.len() { - self.add_character_set(ix); - } + let buffer_offset_before_lex_functions = self.buffer.len(); let mut main_lex_table = LexTable::default(); swap(&mut main_lex_table, &mut self.main_lex_table); @@ -124,6 +125,16 @@ impl Generator { self.add_lex_function("ts_lex_keywords", keyword_lex_table); } + // Once the lex functions are generated, and we've determined which large + // character sets are actually used, we can generate the large character set + // constants. Insert them into the output buffer before the lex functions. + let lex_functions = self.buffer[buffer_offset_before_lex_functions..].to_string(); + self.buffer.truncate(buffer_offset_before_lex_functions); + for ix in 0..self.large_character_sets.len() { + self.add_character_set(ix); + } + self.buffer.push_str(&lex_functions); + self.add_lex_modes_list(); self.add_parse_table(); @@ -238,6 +249,23 @@ impl Generator { } } + for (ix, (symbol, _)) in self.large_character_sets.iter().enumerate() { + let count = self.large_character_sets[0..ix] + .iter() + .filter(|(sym, _)| sym == symbol) + .count() + + 1; + let constant_name = if let Some(symbol) = symbol { + format!("{}_character_set_{}", self.symbol_ids[&symbol], count) + } else { + format!("extras_character_set_{}", count) + }; + self.large_character_set_info.push(LargeCharacterSetInfo { + constant_name, + is_used: false, + }); + } + // Determine which states should use the "small state" representation, and which should // use the normal array representation. let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2); @@ -829,10 +857,12 @@ impl Generator { add!(self, "(!eof && ") } + let char_set_info = &mut self.large_character_set_info[large_char_set_ix]; + char_set_info.is_used = true; add!( self, "set_contains({}, {}, lookahead)", - &self.large_character_set_constant_names[large_char_set_ix], + &char_set_info.constant_name, large_set.range_count(), ); if check_eof { @@ -940,20 +970,17 @@ impl Generator { } fn add_character_set(&mut self, ix: usize) { - let (symbol, characters) = self.large_character_sets[ix].clone(); - let count = self.large_character_sets[0..ix] - .iter() - .filter(|(sym, _)| *sym == symbol) - .count() - + 1; + let characters = self.large_character_sets[ix].1.clone(); + let info = &self.large_character_set_info[ix]; + if !info.is_used { + return; + } - let constant_name = if let Some(symbol) = symbol { - format!("{}_character_set_{}", self.symbol_ids[&symbol], count) - } else { - format!("extras_character_set_{}", count) - }; - add_line!(self, "static TSCharacterRange {}[] = {{", constant_name); - self.large_character_set_constant_names.push(constant_name); + add_line!( + self, + "static TSCharacterRange {}[] = {{", + info.constant_name + ); indent!(self); for (ix, range) in characters.ranges().enumerate() { @@ -1694,7 +1721,7 @@ pub fn render_c_code( keyword_lex_table: tables.keyword_lex_table, keyword_capture_token: tables.word_token, large_character_sets: tables.large_character_sets, - large_character_set_constant_names: Vec::new(), + large_character_set_info: Vec::new(), syntax_grammar, lexical_grammar, default_aliases,