From 3b8a653167f8107547b80dba6e071100b9c37a5c Mon Sep 17 00:00:00 2001 From: bglgwyng Date: Fri, 7 Nov 2025 11:57:39 +0900 Subject: [PATCH] refactor: extract symbol ID generation and helper functions - Moved symbol ID generation logic out of renderer initialization into standalone function - Extracted sanitize_identifier and metadata_for_symbol as reusable helper functions - Symbol IDs now computed before rendering and passed to renderer constructor --- crates/generate/src/generate.rs | 14 +- crates/generate/src/render.rs | 448 +++++++++++++++++++++----------- 2 files changed, 314 insertions(+), 148 deletions(-) diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index c4a1ec84..0679bdc9 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ -41,7 +41,7 @@ use parse_grammar::parse_grammar; pub use parse_grammar::ParseGrammarError; use prepare_grammar::prepare_grammar; pub use prepare_grammar::PrepareGrammarError; -use render::render_c_code; +use render::{generate_symbol_ids, render_c_code}; pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN}; static JSON_COMMENT_REGEX: LazyLock = LazyLock::new(|| { @@ -373,12 +373,24 @@ fn generate_parser_for_grammar_with_opts( report_symbol_name, optimizations, )?; + + // Generate symbol IDs before rendering C code + let (symbol_ids, alias_ids, unique_aliases) = generate_symbol_ids( + &tables.parse_table, + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + ); + let c_code = render_c_code( &input_grammar.name, tables, syntax_grammar, lexical_grammar, simple_aliases, + symbol_ids, + alias_ids, + unique_aliases, abi_version, semantic_version, supertype_symbol_map, diff --git a/crates/generate/src/render.rs b/crates/generate/src/render.rs index bcfc832e..8a55b47d 100644 --- a/crates/generate/src/render.rs +++ b/crates/generate/src/render.rs @@ -175,15 +175,7 @@ impl Generator { } fn init(&mut self) { - let mut symbol_identifiers = HashSet::new(); - for i in 0..self.parse_table.symbols.len() { - self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); - } - self.symbol_ids.insert( - Symbol::end_of_nonterminal_extra(), - self.symbol_ids[&Symbol::end()].clone(), - ); - + // symbol_ids and alias_ids are now passed in from the constructor self.symbol_map = HashMap::new(); for symbol in &self.parse_table.symbols { @@ -1708,54 +1700,13 @@ impl Generator { ) } - fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { - let mut id; - if symbol == Symbol::end() { - id = "ts_builtin_sym_end".to_string(); - } else { - let (name, kind) = self.metadata_for_symbol(symbol); - id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), - VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), - VariableType::Hidden | VariableType::Named => { - format!("sym_{}", self.sanitize_identifier(name)) - } - }; - - let mut suffix_number = 1; - let mut suffix = String::new(); - while used_identifiers.contains(&id) { - id.drain(id.len() - suffix.len()..); - suffix_number += 1; - suffix = suffix_number.to_string(); - id += &suffix; - } - } - - used_identifiers.insert(id.clone()); - self.symbol_ids.insert(symbol, id); - } fn field_id(&self, field_name: &str) -> String { format!("field_{field_name}") } fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { - match symbol.kind { - SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), - SymbolType::NonTerminal => { - let variable = &self.syntax_grammar.variables[symbol.index]; - (&variable.name, variable.kind) - } - SymbolType::Terminal => { - let variable = &self.lexical_grammar.variables[symbol.index]; - (&variable.name, variable.kind) - } - SymbolType::External => { - let token = &self.syntax_grammar.external_tokens[symbol.index]; - (&token.name, token.kind) - } - } + metadata_for_symbol(symbol, &self.syntax_grammar, &self.lexical_grammar) } fn symbols_for_alias(&self, alias: &Alias) -> Vec { @@ -1776,98 +1727,7 @@ impl Generator { } fn sanitize_identifier(&self, name: &str) -> String { - let mut result = String::with_capacity(name.len()); - for c in name.chars() { - if c.is_ascii_alphanumeric() || c == '_' { - result.push(c); - } else { - 'special_chars: { - let replacement = match c { - ' ' if name.len() == 1 => "SPACE", - '~' => "TILDE", - '`' => "BQUOTE", - '!' => "BANG", - '@' => "AT", - '#' => "POUND", - '$' => "DOLLAR", - '%' => "PERCENT", - '^' => "CARET", - '&' => "AMP", - '*' => "STAR", - '(' => "LPAREN", - ')' => "RPAREN", - '-' => "DASH", - '+' => "PLUS", - '=' => "EQ", - '{' => "LBRACE", - '}' => "RBRACE", - '[' => "LBRACK", - ']' => "RBRACK", - '\\' => "BSLASH", - '|' => "PIPE", - ':' => "COLON", - ';' => "SEMI", - '"' => "DQUOTE", - '\'' => "SQUOTE", - '<' => "LT", - '>' => "GT", - ',' => "COMMA", - '.' => "DOT", - '?' => "QMARK", - '/' => "SLASH", - '\n' => "LF", - '\r' => "CR", - '\t' => "TAB", - '\0' => "NULL", - '\u{0001}' => "SOH", - '\u{0002}' => "STX", - '\u{0003}' => "ETX", - '\u{0004}' => "EOT", - '\u{0005}' => "ENQ", - '\u{0006}' => "ACK", - '\u{0007}' => "BEL", - '\u{0008}' => "BS", - '\u{000b}' => "VTAB", - '\u{000c}' => "FF", - '\u{000e}' => "SO", - '\u{000f}' => "SI", - '\u{0010}' => "DLE", - '\u{0011}' => "DC1", - '\u{0012}' => "DC2", - '\u{0013}' => "DC3", - '\u{0014}' => "DC4", - '\u{0015}' => "NAK", - '\u{0016}' => "SYN", - '\u{0017}' => "ETB", - '\u{0018}' => "CAN", - '\u{0019}' => "EM", - '\u{001a}' => "SUB", - '\u{001b}' => "ESC", - '\u{001c}' => "FS", - '\u{001d}' => "GS", - '\u{001e}' => "RS", - '\u{001f}' => "US", - '\u{007F}' => "DEL", - '\u{FEFF}' => "BOM", - '\u{0080}'..='\u{FFFF}' => { - write!(result, "u{:04x}", c as u32).unwrap(); - break 'special_chars; - } - '\u{10000}'..='\u{10FFFF}' => { - write!(result, "U{:08x}", c as u32).unwrap(); - break 'special_chars; - } - '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), - ' ' => break 'special_chars, - }; - if !result.is_empty() && !result.ends_with('_') { - result.push('_'); - } - result += replacement; - } - } - } - result + sanitize_identifier(name) } fn sanitize_string(&self, name: &str) -> String { @@ -1915,23 +1775,311 @@ impl Generator { } } +/// Helper function to sanitize identifiers for C code generation. +fn sanitize_identifier(name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if c.is_ascii_alphanumeric() || c == '_' { + result.push(c); + } else { + 'special_chars: { + let replacement = match c { + ' ' if name.len() == 1 => "SPACE", + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' => "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + '\0' => "NULL", + '\u{0001}' => "SOH", + '\u{0002}' => "STX", + '\u{0003}' => "ETX", + '\u{0004}' => "EOT", + '\u{0005}' => "ENQ", + '\u{0006}' => "ACK", + '\u{0007}' => "BEL", + '\u{0008}' => "BS", + '\u{000b}' => "VTAB", + '\u{000c}' => "FF", + '\u{000e}' => "SO", + '\u{000f}' => "SI", + '\u{0010}' => "DLE", + '\u{0011}' => "DC1", + '\u{0012}' => "DC2", + '\u{0013}' => "DC3", + '\u{0014}' => "DC4", + '\u{0015}' => "NAK", + '\u{0016}' => "SYN", + '\u{0017}' => "ETB", + '\u{0018}' => "CAN", + '\u{0019}' => "EM", + '\u{001a}' => "SUB", + '\u{001b}' => "ESC", + '\u{001c}' => "FS", + '\u{001d}' => "GS", + '\u{001e}' => "RS", + '\u{001f}' => "US", + '\u{007F}' => "DEL", + '\u{FEFF}' => "BOM", + '\u{0080}'..='\u{FFFF}' => { + write!(result, "u{:04x}", c as u32).unwrap(); + break 'special_chars; + } + '\u{10000}'..='\u{10FFFF}' => { + write!(result, "U{:08x}", c as u32).unwrap(); + break 'special_chars; + } + '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), + ' ' => break 'special_chars, + }; + if !result.is_empty() && !result.ends_with('_') { + result.push('_'); + } + result += replacement; + } + } + } + result +} + +/// Helper function to get metadata for a symbol. +fn metadata_for_symbol<'a>( + symbol: Symbol, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, +) -> (&'a str, VariableType) { + match symbol.kind { + SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), + SymbolType::NonTerminal => { + let variable = &syntax_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::Terminal => { + let variable = &lexical_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::External => { + let token = &syntax_grammar.external_tokens[symbol.index]; + (&token.name as &str, token.kind) + } + } +} + +/// Helper function to assign a symbol ID. +fn assign_symbol_id( + symbol: Symbol, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + symbol_ids: &mut HashMap, + used_identifiers: &mut HashSet, +) { + let mut id; + if symbol == Symbol::end() { + id = "ts_builtin_sym_end".to_string(); + } else { + let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar); + id = match kind { + VariableType::Auxiliary => format!("aux_sym_{}", sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", sanitize_identifier(name)), + VariableType::Hidden | VariableType::Named => { + format!("sym_{}", sanitize_identifier(name)) + } + }; + + let mut suffix_number = 1; + let mut suffix = String::new(); + while used_identifiers.contains(&id) { + id.drain(id.len() - suffix.len()..); + suffix_number += 1; + suffix = suffix_number.to_string(); + id += &suffix; + } + } + + used_identifiers.insert(id.clone()); + symbol_ids.insert(symbol, id); +} + +/// Generates symbol IDs and alias IDs for the given parse table and grammars. +/// +/// This function must be called before `render_c_code` to generate the symbol mappings +/// that will be used in the generated C code. +/// +/// # Arguments +/// +/// * `parse_table` - The generated parse table for the language +/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar +/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar +/// * `default_aliases` - A map describing the global rename rules that should apply +/// +/// # Returns +/// +/// A tuple containing: +/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string +/// * `alias_ids` - HashMap mapping each Alias to its C identifier string +/// * `unique_aliases` - Sorted vector of unique aliases +pub fn generate_symbol_ids( + parse_table: &ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + default_aliases: &AliasMap, +) -> (HashMap, HashMap, Vec) { + let mut symbol_ids = HashMap::new(); + let mut alias_ids = HashMap::new(); + let mut unique_aliases = Vec::new(); + let mut symbol_identifiers = HashSet::new(); + + // Generate symbol IDs + for i in 0..parse_table.symbols.len() { + assign_symbol_id( + parse_table.symbols[i], + syntax_grammar, + lexical_grammar, + &mut symbol_ids, + &mut symbol_identifiers, + ); + } + + symbol_ids.insert( + Symbol::end_of_nonterminal_extra(), + symbol_ids[&Symbol::end()].clone(), + ); + + // Build symbol map to find canonical symbols for aliases + let mut symbol_map = HashMap::new(); + for symbol in &parse_table.symbols { + let mut mapping = symbol; + + if let Some(alias) = default_aliases.get(symbol) { + let kind = alias.kind(); + for other_symbol in &parse_table.symbols { + if let Some(other_alias) = default_aliases.get(other_symbol) { + if other_symbol < mapping && other_alias == alias { + mapping = other_symbol; + } + } else { + let (other_name, other_kind) = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if (other_name, other_kind) == (alias.value.as_str(), kind) { + mapping = other_symbol; + break; + } + } + } + } else if symbol.is_terminal() { + let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + for other_symbol in &parse_table.symbols { + let other_metadata = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if other_metadata == metadata { + if let Some(mapped) = symbol_map.get(other_symbol) { + if mapped == symbol { + break; + } + } + mapping = other_symbol; + break; + } + } + } + + symbol_map.insert(*symbol, *mapping); + } + + // Generate alias IDs + for production_info in &parse_table.production_infos { + for alias in &production_info.alias_sequence { + if let Some(alias) = &alias { + // Find symbols that match this alias + let matching_symbols: Vec = parse_table + .symbols + .iter() + .copied() + .filter(|symbol| { + default_aliases.get(symbol).map_or_else( + || { + let (name, kind) = + metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + name == alias.value && kind == alias.kind() + }, + |default_alias| default_alias == alias, + ) + }) + .collect(); + + // Some aliases match an existing symbol in the grammar. + let alias_id = if let Some(existing_symbol) = matching_symbols.first() { + symbol_ids[&symbol_map[existing_symbol]].clone() + } + // Other aliases don't match any existing symbol, and need their own identifiers. + else { + if let Err(i) = unique_aliases.binary_search(alias) { + unique_aliases.insert(i, alias.clone()); + } + + if alias.is_named { + format!("alias_sym_{}", sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", sanitize_identifier(&alias.value)) + } + }; + + alias_ids.entry(alias.clone()).or_insert(alias_id); + } + } + } + + (symbol_ids, alias_ids, unique_aliases) +} + /// Returns a String of C code for the given components of a parser. /// /// # Arguments /// /// * `name` - A string slice containing the name of the language -/// * `parse_table` - The generated parse table for the language -/// * `main_lex_table` - The generated lexing table for the language -/// * `keyword_lex_table` - The generated keyword lexing table for the language -/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any. +/// * `tables` - The generated tables for the language /// * `syntax_grammar` - The syntax grammar extracted from the language's grammar /// * `lexical_grammar` - The lexical grammar extracted from the language's grammar /// * `default_aliases` - A map describing the global rename rules that should apply. the keys are /// symbols that are *always* aliased in the same way, and the values are the aliases that are /// applied to those symbols. +/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string +/// * `alias_ids` - HashMap mapping each Alias to its C identifier string +/// * `unique_aliases` - Sorted vector of unique aliases /// * `abi_version` - The language ABI version that should be generated. Usually you want /// Tree-sitter's current version, but right after making an ABI change, it may be useful to /// generate code with the previous ABI. +/// * `semantic_version` - Optional semantic version of the parser +/// * `supertype_symbol_map` - Map of supertype symbols #[allow(clippy::too_many_arguments)] pub fn render_c_code( name: &str, @@ -1939,6 +2087,9 @@ pub fn render_c_code( syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, + symbol_ids: HashMap, + alias_ids: HashMap, + unique_aliases: Vec, abi_version: usize, semantic_version: Option<(u8, u8, u8)>, supertype_symbol_map: BTreeMap>, @@ -1958,6 +2109,9 @@ pub fn render_c_code( syntax_grammar, lexical_grammar, default_aliases, + symbol_ids, + alias_ids, + unique_aliases, abi_version, metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata { major_version,