refactor: extract symbol ID generation and helper functions

- Moved symbol ID generation logic out of renderer initialization into standalone function
- Extracted sanitize_identifier and metadata_for_symbol as reusable helper functions
- Symbol IDs now computed before rendering and passed to renderer constructor
This commit is contained in:
bglgwyng 2025-11-07 11:57:39 +09:00
parent 13d4db8bb4
commit 3b8a653167
2 changed files with 314 additions and 148 deletions

View file

@ -41,7 +41,7 @@ use parse_grammar::parse_grammar;
pub use parse_grammar::ParseGrammarError;
use prepare_grammar::prepare_grammar;
pub use prepare_grammar::PrepareGrammarError;
use render::render_c_code;
use render::{generate_symbol_ids, render_c_code};
pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
@ -373,12 +373,24 @@ fn generate_parser_for_grammar_with_opts(
report_symbol_name,
optimizations,
)?;
// Generate symbol IDs before rendering C code
let (symbol_ids, alias_ids, unique_aliases) = generate_symbol_ids(
&tables.parse_table,
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
);
let c_code = render_c_code(
&input_grammar.name,
tables,
syntax_grammar,
lexical_grammar,
simple_aliases,
symbol_ids,
alias_ids,
unique_aliases,
abi_version,
semantic_version,
supertype_symbol_map,

View file

@ -175,15 +175,7 @@ impl Generator {
}
fn init(&mut self) {
let mut symbol_identifiers = HashSet::new();
for i in 0..self.parse_table.symbols.len() {
self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers);
}
self.symbol_ids.insert(
Symbol::end_of_nonterminal_extra(),
self.symbol_ids[&Symbol::end()].clone(),
);
// symbol_ids and alias_ids are now passed in from the constructor
self.symbol_map = HashMap::new();
for symbol in &self.parse_table.symbols {
@ -1708,54 +1700,13 @@ impl Generator {
)
}
/// Chooses a unique C identifier for `symbol` and stores it in `self.symbol_ids`.
///
/// The end symbol always gets the fixed name `ts_builtin_sym_end`. Every other symbol gets a
/// kind-specific prefix followed by its sanitized name; if that identifier is already present in
/// `used_identifiers`, the numeric suffixes `2`, `3`, ... are tried until a free one is found.
/// The chosen identifier is also added to `used_identifiers`.
fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet<String>) {
    let id = if symbol == Symbol::end() {
        String::from("ts_builtin_sym_end")
    } else {
        let (name, kind) = self.metadata_for_symbol(symbol);
        let stem = self.sanitize_identifier(name);
        let base = match kind {
            VariableType::Auxiliary => format!("aux_sym_{stem}"),
            VariableType::Anonymous => format!("anon_sym_{stem}"),
            VariableType::Hidden | VariableType::Named => format!("sym_{stem}"),
        };

        // The first duplicate is numbered 2: the unsuffixed name counts as number 1.
        let mut attempt = 1;
        let mut candidate = base.clone();
        loop {
            if !used_identifiers.contains(&candidate) {
                break candidate;
            }
            attempt += 1;
            candidate = format!("{base}{attempt}");
        }
    };

    used_identifiers.insert(id.clone());
    self.symbol_ids.insert(symbol, id);
}
/// Returns the C identifier for a field: the field name prefixed with `field_`.
fn field_id(&self, field_name: &str) -> String {
    let mut id = String::from("field_");
    id.push_str(field_name);
    id
}
fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
match symbol.kind {
SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
SymbolType::NonTerminal => {
let variable = &self.syntax_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::External => {
let token = &self.syntax_grammar.external_tokens[symbol.index];
(&token.name, token.kind)
}
}
metadata_for_symbol(symbol, &self.syntax_grammar, &self.lexical_grammar)
}
fn symbols_for_alias(&self, alias: &Alias) -> Vec<Symbol> {
@ -1776,98 +1727,7 @@ impl Generator {
}
fn sanitize_identifier(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
if c.is_ascii_alphanumeric() || c == '_' {
result.push(c);
} else {
'special_chars: {
let replacement = match c {
' ' if name.len() == 1 => "SPACE",
'~' => "TILDE",
'`' => "BQUOTE",
'!' => "BANG",
'@' => "AT",
'#' => "POUND",
'$' => "DOLLAR",
'%' => "PERCENT",
'^' => "CARET",
'&' => "AMP",
'*' => "STAR",
'(' => "LPAREN",
')' => "RPAREN",
'-' => "DASH",
'+' => "PLUS",
'=' => "EQ",
'{' => "LBRACE",
'}' => "RBRACE",
'[' => "LBRACK",
']' => "RBRACK",
'\\' => "BSLASH",
'|' => "PIPE",
':' => "COLON",
';' => "SEMI",
'"' => "DQUOTE",
'\'' => "SQUOTE",
'<' => "LT",
'>' => "GT",
',' => "COMMA",
'.' => "DOT",
'?' => "QMARK",
'/' => "SLASH",
'\n' => "LF",
'\r' => "CR",
'\t' => "TAB",
'\0' => "NULL",
'\u{0001}' => "SOH",
'\u{0002}' => "STX",
'\u{0003}' => "ETX",
'\u{0004}' => "EOT",
'\u{0005}' => "ENQ",
'\u{0006}' => "ACK",
'\u{0007}' => "BEL",
'\u{0008}' => "BS",
'\u{000b}' => "VTAB",
'\u{000c}' => "FF",
'\u{000e}' => "SO",
'\u{000f}' => "SI",
'\u{0010}' => "DLE",
'\u{0011}' => "DC1",
'\u{0012}' => "DC2",
'\u{0013}' => "DC3",
'\u{0014}' => "DC4",
'\u{0015}' => "NAK",
'\u{0016}' => "SYN",
'\u{0017}' => "ETB",
'\u{0018}' => "CAN",
'\u{0019}' => "EM",
'\u{001a}' => "SUB",
'\u{001b}' => "ESC",
'\u{001c}' => "FS",
'\u{001d}' => "GS",
'\u{001e}' => "RS",
'\u{001f}' => "US",
'\u{007F}' => "DEL",
'\u{FEFF}' => "BOM",
'\u{0080}'..='\u{FFFF}' => {
write!(result, "u{:04x}", c as u32).unwrap();
break 'special_chars;
}
'\u{10000}'..='\u{10FFFF}' => {
write!(result, "U{:08x}", c as u32).unwrap();
break 'special_chars;
}
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(),
' ' => break 'special_chars,
};
if !result.is_empty() && !result.ends_with('_') {
result.push('_');
}
result += replacement;
}
}
}
result
sanitize_identifier(name)
}
fn sanitize_string(&self, name: &str) -> String {
@ -1915,23 +1775,311 @@ impl Generator {
}
}
/// Converts an arbitrary symbol name into text that can be embedded in a C identifier.
///
/// * ASCII letters, digits and `_` are copied through unchanged.
/// * ASCII punctuation and control characters are spelled out as an upper-case mnemonic (`+`
///   becomes `PLUS`), preceded by `_` unless the output is still empty or already ends in `_`.
/// * A space is dropped, except when `name` is exactly one byte long, where it becomes `SPACE`.
/// * U+FEFF is spelled `BOM`, following the same underscore rule as the mnemonics.
/// * Any other non-ASCII character is appended as `u` plus four hex digits (up to U+FFFF) or `U`
///   plus eight hex digits, with no separating underscore.
fn sanitize_identifier(name: &str) -> String {
    // Mnemonics for U+0000..=U+001F, indexed by code point.
    const CONTROL_NAMES: [&str; 32] = [
        "NULL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", //
        "BS", "TAB", "LF", "VTAB", "FF", "CR", "SO", "SI", //
        "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", //
        "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
    ];

    // Mnemonic for an ASCII punctuation character or DEL. Alphanumerics, `_` and space are
    // filtered out by the caller before this is reached.
    fn punctuation_name(c: char) -> &'static str {
        match c {
            '!' => "BANG",
            '"' => "DQUOTE",
            '#' => "POUND",
            '$' => "DOLLAR",
            '%' => "PERCENT",
            '&' => "AMP",
            '\'' => "SQUOTE",
            '(' => "LPAREN",
            ')' => "RPAREN",
            '*' => "STAR",
            '+' => "PLUS",
            ',' => "COMMA",
            '-' => "DASH",
            '.' => "DOT",
            '/' => "SLASH",
            ':' => "COLON",
            ';' => "SEMI",
            '<' => "LT",
            '=' => "EQ",
            '>' => "GT",
            '?' => "QMARK",
            '@' => "AT",
            '[' => "LBRACK",
            '\\' => "BSLASH",
            ']' => "RBRACK",
            '^' => "CARET",
            '`' => "BQUOTE",
            '{' => "LBRACE",
            '|' => "PIPE",
            '}' => "RBRACE",
            '~' => "TILDE",
            '\u{007F}' => "DEL",
            _ => unreachable!(),
        }
    }

    let is_single_byte = name.len() == 1;
    let mut out = String::with_capacity(name.len());

    for ch in name.chars() {
        if ch == '_' || ch.is_ascii_alphanumeric() {
            out.push(ch);
            continue;
        }

        let code = ch as u32;
        let word = match code {
            0x00..=0x1f => CONTROL_NAMES[code as usize],
            0x20 if is_single_byte => "SPACE",
            0x20 => continue,
            0x21..=0x7f => punctuation_name(ch),
            0xfeff => "BOM",
            0x80..=0xffff => {
                write!(out, "u{code:04x}").unwrap();
                continue;
            }
            _ => {
                write!(out, "U{code:08x}").unwrap();
                continue;
            }
        };

        // Separate the mnemonic from preceding text, but never start with or double up `_`.
        if !(out.is_empty() || out.ends_with('_')) {
            out.push('_');
        }
        out.push_str(word);
    }

    out
}
/// Looks up the name and variable kind of `symbol`.
///
/// Both end markers report the fixed name `"end"` with `VariableType::Hidden`. Non-terminals and
/// external tokens are resolved by `symbol.index` in `syntax_grammar`, terminals in
/// `lexical_grammar`. The returned name borrows from whichever grammar it was found in.
fn metadata_for_symbol<'a>(
    symbol: Symbol,
    syntax_grammar: &'a SyntaxGrammar,
    lexical_grammar: &'a LexicalGrammar,
) -> (&'a str, VariableType) {
    let index = symbol.index;
    match symbol.kind {
        SymbolType::NonTerminal => {
            let entry = &syntax_grammar.variables[index];
            (entry.name.as_str(), entry.kind)
        }
        SymbolType::Terminal => {
            let entry = &lexical_grammar.variables[index];
            (entry.name.as_str(), entry.kind)
        }
        SymbolType::External => {
            let entry = &syntax_grammar.external_tokens[index];
            (entry.name.as_str(), entry.kind)
        }
        SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
    }
}
/// Chooses a unique C identifier for `symbol` and records it in `symbol_ids`.
///
/// The end symbol always gets `ts_builtin_sym_end`. Any other symbol gets a prefix selected by its
/// variable kind (`aux_sym_`, `anon_sym_` or `sym_`) followed by its sanitized name. When that
/// identifier is already in `used_identifiers`, the suffixes `2`, `3`, ... are appended in turn
/// until an unused identifier is found. The result is added to `used_identifiers` as well.
fn assign_symbol_id(
    symbol: Symbol,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    symbol_ids: &mut HashMap<Symbol, String>,
    used_identifiers: &mut HashSet<String>,
) {
    let id = if symbol == Symbol::end() {
        String::from("ts_builtin_sym_end")
    } else {
        let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar);
        let prefix = match kind {
            VariableType::Auxiliary => "aux_sym_",
            VariableType::Anonymous => "anon_sym_",
            VariableType::Hidden | VariableType::Named => "sym_",
        };
        let base = format!("{prefix}{}", sanitize_identifier(name));

        // The unsuffixed name counts as number 1, so the first duplicate becomes `<base>2`.
        let mut counter = 1;
        let mut candidate = base.clone();
        while used_identifiers.contains(&candidate) {
            counter += 1;
            candidate = format!("{base}{counter}");
        }
        candidate
    };

    used_identifiers.insert(id.clone());
    symbol_ids.insert(symbol, id);
}
/// Generates symbol IDs and alias IDs for the given parse table and grammars.
///
/// This function must be called before `render_c_code` to generate the symbol mappings
/// that will be used in the generated C code.
///
/// # Arguments
///
/// * `parse_table` - The generated parse table for the language
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
/// * `default_aliases` - A map describing the global rename rules that should apply
///
/// # Returns
///
/// A tuple containing:
/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
/// * `unique_aliases` - Sorted vector of the aliases that do not correspond to any existing symbol
///
/// # Panics
///
/// Panics if `parse_table.symbols` does not contain `Symbol::end()`, because the identifier of
/// the end symbol is looked up unconditionally.
pub fn generate_symbol_ids(
    parse_table: &ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    default_aliases: &AliasMap,
) -> (HashMap<Symbol, String>, HashMap<Alias, String>, Vec<Alias>) {
    let mut symbol_ids: HashMap<Symbol, String> = HashMap::new();
    let mut alias_ids: HashMap<Alias, String> = HashMap::new();
    let mut unique_aliases: Vec<Alias> = Vec::new();
    let mut symbol_identifiers = HashSet::new();

    // Generate symbol IDs, in parse-table order so that duplicate names are numbered stably.
    for &symbol in &parse_table.symbols {
        assign_symbol_id(
            symbol,
            syntax_grammar,
            lexical_grammar,
            &mut symbol_ids,
            &mut symbol_identifiers,
        );
    }

    // The end-of-nonterminal-extra marker shares the identifier of the regular end symbol.
    let end_id = symbol_ids[&Symbol::end()].clone();
    symbol_ids.insert(Symbol::end_of_nonterminal_extra(), end_id);

    // Build symbol map to find canonical symbols for aliases
    let mut symbol_map: HashMap<Symbol, Symbol> = HashMap::new();
    for symbol in &parse_table.symbols {
        let mut mapping = symbol;
        if let Some(alias) = default_aliases.get(symbol) {
            // A symbol with a default alias maps to the smallest symbol carrying the same default
            // alias, or to the first un-aliased symbol whose own name and kind equal the alias.
            let kind = alias.kind();
            for other_symbol in &parse_table.symbols {
                if let Some(other_alias) = default_aliases.get(other_symbol) {
                    if other_symbol < mapping && other_alias == alias {
                        mapping = other_symbol;
                    }
                } else {
                    let (other_name, other_kind) =
                        metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
                    if (other_name, other_kind) == (alias.value.as_str(), kind) {
                        mapping = other_symbol;
                        break;
                    }
                }
            }
        } else if symbol.is_terminal() {
            // An un-aliased terminal maps to the first symbol with identical name and kind,
            // unless that symbol has already been mapped onto this one.
            let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
            for other_symbol in &parse_table.symbols {
                let other_metadata =
                    metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
                if other_metadata == metadata {
                    if let Some(mapped) = symbol_map.get(other_symbol) {
                        if mapped == symbol {
                            break;
                        }
                    }
                    mapping = other_symbol;
                    break;
                }
            }
        }
        symbol_map.insert(*symbol, *mapping);
    }

    // Generate alias IDs
    for production_info in &parse_table.production_infos {
        for alias in production_info.alias_sequence.iter().flatten() {
            // The first occurrence of an alias decides its identifier; repeated occurrences
            // would recompute the same value, so skip the symbol scan for them.
            if alias_ids.contains_key(alias) {
                continue;
            }

            // Find the first symbol that this alias matches: either the symbol's default alias
            // equals it, or (for symbols without one) the symbol's own name and kind do.
            let existing_symbol = parse_table.symbols.iter().copied().find(|symbol| {
                match default_aliases.get(symbol) {
                    Some(default_alias) => default_alias == alias,
                    None => {
                        let (name, kind) =
                            metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
                        name == alias.value && kind == alias.kind()
                    }
                }
            });

            let alias_id = if let Some(existing_symbol) = existing_symbol {
                // Some aliases match an existing symbol in the grammar.
                symbol_ids[&symbol_map[&existing_symbol]].clone()
            } else {
                // Other aliases don't match any existing symbol, and need their own identifiers.
                // `unique_aliases` is kept sorted so that membership can be binary-searched.
                if let Err(position) = unique_aliases.binary_search(alias) {
                    unique_aliases.insert(position, alias.clone());
                }
                let prefix = if alias.is_named {
                    "alias_sym_"
                } else {
                    "anon_alias_sym_"
                };
                format!("{prefix}{}", sanitize_identifier(&alias.value))
            };

            alias_ids.insert(alias.clone(), alias_id);
        }
    }

    (symbol_ids, alias_ids, unique_aliases)
}
/// Returns a String of C code for the given components of a parser.
///
/// # Arguments
///
/// * `name` - A string slice containing the name of the language
/// * `parse_table` - The generated parse table for the language
/// * `main_lex_table` - The generated lexing table for the language
/// * `keyword_lex_table` - The generated keyword lexing table for the language
/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any.
/// * `tables` - The generated tables for the language
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
/// * `default_aliases` - A map describing the global rename rules that should apply. The keys are
/// symbols that are *always* aliased in the same way, and the values are the aliases that are
/// applied to those symbols.
/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
/// * `unique_aliases` - Sorted vector of unique aliases
/// * `abi_version` - The language ABI version that should be generated. Usually you want
/// Tree-sitter's current version, but right after making an ABI change, it may be useful to
/// generate code with the previous ABI.
/// * `semantic_version` - Optional semantic version of the parser
/// * `supertype_symbol_map` - Map of supertype symbols
#[allow(clippy::too_many_arguments)]
pub fn render_c_code(
name: &str,
@ -1939,6 +2087,9 @@ pub fn render_c_code(
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
symbol_ids: HashMap<Symbol, String>,
alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<Alias>,
abi_version: usize,
semantic_version: Option<(u8, u8, u8)>,
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
@ -1958,6 +2109,9 @@ pub fn render_c_code(
syntax_grammar,
lexical_grammar,
default_aliases,
symbol_ids,
alias_ids,
unique_aliases,
abi_version,
metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata {
major_version,