From 0ad40ec26367fa77a61e03bc309aff2aed68e6e5 Mon Sep 17 00:00:00 2001 From: bglgwyng Date: Sun, 9 Nov 2025 18:58:27 +0900 Subject: [PATCH] refactor: extract symbol ID generation into dedicated module --- crates/generate/src/introspect_grammar.rs | 306 ++++++++++++++++++++- crates/generate/src/render.rs | 315 +--------------------- 2 files changed, 310 insertions(+), 311 deletions(-) diff --git a/crates/generate/src/introspect_grammar.rs b/crates/generate/src/introspect_grammar.rs index a654c687..f90b82e4 100644 --- a/crates/generate/src/introspect_grammar.rs +++ b/crates/generate/src/introspect_grammar.rs @@ -1,12 +1,15 @@ -use std::collections::{BTreeMap, HashMap}; +use std::{ + collections::{BTreeMap, HashMap, HashSet}, + fmt::Write, +}; use crate::{ build_tables::{build_tables, Tables}, - grammars::{InputGrammar, LexicalGrammar, SyntaxGrammar}, + grammars::{InputGrammar, LexicalGrammar, SyntaxGrammar, VariableType}, node_types::{self, ChildType, VariableInfo}, prepare_grammar::prepare_grammar, - render::generate_symbol_ids, - rules::{Alias, Symbol}, + rules::{Alias, AliasMap, Symbol, SymbolType}, + tables::ParseTable, GenerateError, OptLevel, }; @@ -64,3 +67,298 @@ pub fn introspect_grammar( unique_aliases, }) } + +/// Generates symbol IDs and alias IDs for the given parse table and grammars. +/// +/// This function must be called before `render_c_code` to generate the symbol mappings +/// that will be used in the generated C code. 
+/// +/// # Arguments +/// +/// * `parse_table` - The generated parse table for the language +/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar +/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar +/// * `default_aliases` - A map describing the global rename rules that should apply +/// +/// # Returns +/// +/// A tuple containing: +/// * `symbol_ids` - HashMap mapping each Symbol to (C identifier string, numeric ID) +/// * `alias_ids` - HashMap mapping each Alias to its C identifier string +/// * `unique_aliases` - Sorted vector of unique aliases +pub fn generate_symbol_ids( + parse_table: &ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + default_aliases: &AliasMap, +) -> ( + HashMap<Symbol, (String, u16)>, + HashMap<Alias, String>, + Vec<Alias>, +) { + let mut symbol_ids = HashMap::new(); + let mut alias_ids = HashMap::new(); + let mut unique_aliases = Vec::new(); + let mut symbol_identifiers = HashSet::new(); + + // Generate symbol IDs with numeric IDs + // Symbol::end() gets 0, then other symbols get 1, 2, 3... 
+ let mut numeric_id = 0u16; + for &symbol in &parse_table.symbols { + assign_symbol_id( + symbol, + syntax_grammar, + lexical_grammar, + &mut symbol_ids, + &mut symbol_identifiers, + numeric_id, + ); + numeric_id += 1; + } + + symbol_ids.insert( + Symbol::end_of_nonterminal_extra(), + symbol_ids[&Symbol::end()].clone(), + ); + + // Build symbol map to find canonical symbols for aliases + let mut symbol_map = HashMap::new(); + for symbol in &parse_table.symbols { + let mut mapping = symbol; + + if let Some(alias) = default_aliases.get(symbol) { + let kind = alias.kind(); + for other_symbol in &parse_table.symbols { + if let Some(other_alias) = default_aliases.get(other_symbol) { + if other_symbol < mapping && other_alias == alias { + mapping = other_symbol; + } + } else { + let (other_name, other_kind) = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if (other_name, other_kind) == (alias.value.as_str(), kind) { + mapping = other_symbol; + break; + } + } + } + } else if symbol.is_terminal() { + let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + for other_symbol in &parse_table.symbols { + let other_metadata = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if other_metadata == metadata { + if let Some(mapped) = symbol_map.get(other_symbol) { + if mapped == symbol { + break; + } + } + mapping = other_symbol; + break; + } + } + } + + symbol_map.insert(*symbol, *mapping); + } + + // Generate alias IDs + for production_info in &parse_table.production_infos { + for alias in &production_info.alias_sequence { + if let Some(alias) = &alias { + // Find symbols that match this alias + let matching_symbols: Vec<Symbol> = parse_table + .symbols + .iter() + .copied() + .filter(|symbol| { + default_aliases.get(symbol).map_or_else( + || { + let (name, kind) = + metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + name == alias.value && kind == alias.kind() + }, + |default_alias| default_alias == 
alias, + ) + }) + .collect(); + + // Some aliases match an existing symbol in the grammar. + let alias_id = if let Some(existing_symbol) = matching_symbols.first() { + symbol_ids[&symbol_map[existing_symbol]].0.clone() + } + // Other aliases don't match any existing symbol, and need their own identifiers. + else { + if let Err(i) = unique_aliases.binary_search(alias) { + unique_aliases.insert(i, alias.clone()); + } + + if alias.is_named { + format!("alias_sym_{}", sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", sanitize_identifier(&alias.value)) + } + }; + + alias_ids.entry(alias.clone()).or_insert(alias_id); + } + } + } + + (symbol_ids, alias_ids, unique_aliases) +} + +/// Helper function to sanitize identifiers for C code generation. +pub fn sanitize_identifier(name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if c.is_ascii_alphanumeric() || c == '_' { + result.push(c); + } else { + 'special_chars: { + let replacement = match c { + ' ' if name.len() == 1 => "SPACE", + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' 
=> "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + '\0' => "NULL", + '\u{0001}' => "SOH", + '\u{0002}' => "STX", + '\u{0003}' => "ETX", + '\u{0004}' => "EOT", + '\u{0005}' => "ENQ", + '\u{0006}' => "ACK", + '\u{0007}' => "BEL", + '\u{0008}' => "BS", + '\u{000b}' => "VTAB", + '\u{000c}' => "FF", + '\u{000e}' => "SO", + '\u{000f}' => "SI", + '\u{0010}' => "DLE", + '\u{0011}' => "DC1", + '\u{0012}' => "DC2", + '\u{0013}' => "DC3", + '\u{0014}' => "DC4", + '\u{0015}' => "NAK", + '\u{0016}' => "SYN", + '\u{0017}' => "ETB", + '\u{0018}' => "CAN", + '\u{0019}' => "EM", + '\u{001a}' => "SUB", + '\u{001b}' => "ESC", + '\u{001c}' => "FS", + '\u{001d}' => "GS", + '\u{001e}' => "RS", + '\u{001f}' => "US", + '\u{007F}' => "DEL", + '\u{FEFF}' => "BOM", + '\u{0080}'..='\u{FFFF}' => { + write!(result, "u{:04x}", c as u32).unwrap(); + break 'special_chars; + } + '\u{10000}'..='\u{10FFFF}' => { + write!(result, "U{:08x}", c as u32).unwrap(); + break 'special_chars; + } + '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), + ' ' => break 'special_chars, + }; + if !result.is_empty() && !result.ends_with('_') { + result.push('_'); + } + result += replacement; + } + } + } + result +} + +/// Helper function to get metadata for a symbol. +pub fn metadata_for_symbol<'a>( + symbol: Symbol, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, +) -> (&'a str, VariableType) { + match symbol.kind { + SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), + SymbolType::NonTerminal => { + let variable = &syntax_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::Terminal => { + let variable = &lexical_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::External => { + let token = &syntax_grammar.external_tokens[symbol.index]; + (&token.name as &str, token.kind) + } + } +} + +/// Helper function to assign a symbol ID. 
+fn assign_symbol_id( + symbol: Symbol, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + symbol_ids: &mut HashMap<Symbol, (String, u16)>, + used_identifiers: &mut HashSet<String>, + numeric_id: u16, +) { + let mut id; + if symbol == Symbol::end() { + id = "ts_builtin_sym_end".to_string(); + } else { + let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar); + id = match kind { + VariableType::Auxiliary => format!("aux_sym_{}", sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", sanitize_identifier(name)), + VariableType::Hidden | VariableType::Named => { + format!("sym_{}", sanitize_identifier(name)) + } + }; + + let mut suffix_number = 1; + let mut suffix = String::new(); + while used_identifiers.contains(&id) { + id.drain(id.len() - suffix.len()..); + suffix_number += 1; + suffix = suffix_number.to_string(); + id += &suffix; + } + } + + used_identifiers.insert(id.clone()); + symbol_ids.insert(symbol, (id, numeric_id)); +} diff --git a/crates/generate/src/render.rs b/crates/generate/src/render.rs index d831a16d..bfdc0bd4 100644 --- a/crates/generate/src/render.rs +++ b/crates/generate/src/render.rs @@ -1,11 +1,14 @@ use std::{ cmp, - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap}, fmt::Write, mem::swap, }; -use crate::LANGUAGE_VERSION; +use crate::{ + introspect_grammar::{metadata_for_symbol, sanitize_identifier}, + LANGUAGE_VERSION, +}; use indoc::indoc; use super::{ @@ -245,9 +248,9 @@ impl Generator { } if alias.is_named { - format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + format!("alias_sym_{}", sanitize_identifier(&alias.value)) } else { - format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + format!("anon_alias_sym_{}", sanitize_identifier(&alias.value)) } }; @@ -1700,10 +1703,7 @@ impl Generator { } fn external_token_id(&self, token: &ExternalToken) -> String { - format!( - "ts_external_token_{}", - 
self.sanitize_identifier(&token.name) - ) + format!("ts_external_token_{}", sanitize_identifier(&token.name)) } fn field_id(&self, field_name: &str) -> String { @@ -1731,10 +1731,6 @@ impl Generator { .collect() } - fn sanitize_identifier(&self, name: &str) -> String { - sanitize_identifier(name) - } - fn sanitize_string(&self, name: &str) -> String { let mut result = String::with_capacity(name.len()); for c in name.chars() { @@ -1780,301 +1776,6 @@ impl Generator { } } -/// Helper function to sanitize identifiers for C code generation. -fn sanitize_identifier(name: &str) -> String { - let mut result = String::with_capacity(name.len()); - for c in name.chars() { - if c.is_ascii_alphanumeric() || c == '_' { - result.push(c); - } else { - 'special_chars: { - let replacement = match c { - ' ' if name.len() == 1 => "SPACE", - '~' => "TILDE", - '`' => "BQUOTE", - '!' => "BANG", - '@' => "AT", - '#' => "POUND", - '$' => "DOLLAR", - '%' => "PERCENT", - '^' => "CARET", - '&' => "AMP", - '*' => "STAR", - '(' => "LPAREN", - ')' => "RPAREN", - '-' => "DASH", - '+' => "PLUS", - '=' => "EQ", - '{' => "LBRACE", - '}' => "RBRACE", - '[' => "LBRACK", - ']' => "RBRACK", - '\\' => "BSLASH", - '|' => "PIPE", - ':' => "COLON", - ';' => "SEMI", - '"' => "DQUOTE", - '\'' => "SQUOTE", - '<' => "LT", - '>' => "GT", - ',' => "COMMA", - '.' => "DOT", - '?' 
=> "QMARK", - '/' => "SLASH", - '\n' => "LF", - '\r' => "CR", - '\t' => "TAB", - '\0' => "NULL", - '\u{0001}' => "SOH", - '\u{0002}' => "STX", - '\u{0003}' => "ETX", - '\u{0004}' => "EOT", - '\u{0005}' => "ENQ", - '\u{0006}' => "ACK", - '\u{0007}' => "BEL", - '\u{0008}' => "BS", - '\u{000b}' => "VTAB", - '\u{000c}' => "FF", - '\u{000e}' => "SO", - '\u{000f}' => "SI", - '\u{0010}' => "DLE", - '\u{0011}' => "DC1", - '\u{0012}' => "DC2", - '\u{0013}' => "DC3", - '\u{0014}' => "DC4", - '\u{0015}' => "NAK", - '\u{0016}' => "SYN", - '\u{0017}' => "ETB", - '\u{0018}' => "CAN", - '\u{0019}' => "EM", - '\u{001a}' => "SUB", - '\u{001b}' => "ESC", - '\u{001c}' => "FS", - '\u{001d}' => "GS", - '\u{001e}' => "RS", - '\u{001f}' => "US", - '\u{007F}' => "DEL", - '\u{FEFF}' => "BOM", - '\u{0080}'..='\u{FFFF}' => { - write!(result, "u{:04x}", c as u32).unwrap(); - break 'special_chars; - } - '\u{10000}'..='\u{10FFFF}' => { - write!(result, "U{:08x}", c as u32).unwrap(); - break 'special_chars; - } - '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), - ' ' => break 'special_chars, - }; - if !result.is_empty() && !result.ends_with('_') { - result.push('_'); - } - result += replacement; - } - } - } - result -} - -/// Helper function to get metadata for a symbol. -fn metadata_for_symbol<'a>( - symbol: Symbol, - syntax_grammar: &'a SyntaxGrammar, - lexical_grammar: &'a LexicalGrammar, -) -> (&'a str, VariableType) { - match symbol.kind { - SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), - SymbolType::NonTerminal => { - let variable = &syntax_grammar.variables[symbol.index]; - (&variable.name as &str, variable.kind) - } - SymbolType::Terminal => { - let variable = &lexical_grammar.variables[symbol.index]; - (&variable.name as &str, variable.kind) - } - SymbolType::External => { - let token = &syntax_grammar.external_tokens[symbol.index]; - (&token.name as &str, token.kind) - } - } -} - -/// Helper function to assign a symbol ID. 
-fn assign_symbol_id( - symbol: Symbol, - syntax_grammar: &SyntaxGrammar, - lexical_grammar: &LexicalGrammar, - symbol_ids: &mut HashMap<Symbol, (String, u16)>, - used_identifiers: &mut HashSet<String>, - numeric_id: u16, -) { - let mut id; - if symbol == Symbol::end() { - id = "ts_builtin_sym_end".to_string(); - } else { - let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar); - id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", sanitize_identifier(name)), - VariableType::Anonymous => format!("anon_sym_{}", sanitize_identifier(name)), - VariableType::Hidden | VariableType::Named => { - format!("sym_{}", sanitize_identifier(name)) - } - }; - - let mut suffix_number = 1; - let mut suffix = String::new(); - while used_identifiers.contains(&id) { - id.drain(id.len() - suffix.len()..); - suffix_number += 1; - suffix = suffix_number.to_string(); - id += &suffix; - } - } - - used_identifiers.insert(id.clone()); - symbol_ids.insert(symbol, (id, numeric_id)); -} - -/// Generates symbol IDs and alias IDs for the given parse table and grammars. -/// -/// This function must be called before `render_c_code` to generate the symbol mappings -/// that will be used in the generated C code. 
-/// -/// # Arguments -/// -/// * `parse_table` - The generated parse table for the language -/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar -/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar -/// * `default_aliases` - A map describing the global rename rules that should apply -/// -/// # Returns -/// -/// A tuple containing: -/// * `symbol_ids` - HashMap mapping each Symbol to (C identifier string, numeric ID) -/// * `alias_ids` - HashMap mapping each Alias to its C identifier string -/// * `unique_aliases` - Sorted vector of unique aliases -pub fn generate_symbol_ids( - parse_table: &ParseTable, - syntax_grammar: &SyntaxGrammar, - lexical_grammar: &LexicalGrammar, - default_aliases: &AliasMap, -) -> ( - HashMap<Symbol, (String, u16)>, - HashMap<Alias, String>, - Vec<Alias>, -) { - let mut symbol_ids = HashMap::new(); - let mut alias_ids = HashMap::new(); - let mut unique_aliases = Vec::new(); - let mut symbol_identifiers = HashSet::new(); - - // Generate symbol IDs with numeric IDs - // Symbol::end() gets 0, then other symbols get 1, 2, 3... 
- let mut numeric_id = 0u16; - for &symbol in &parse_table.symbols { - assign_symbol_id( - symbol, - syntax_grammar, - lexical_grammar, - &mut symbol_ids, - &mut symbol_identifiers, - numeric_id, - ); - numeric_id += 1; - } - - symbol_ids.insert( - Symbol::end_of_nonterminal_extra(), - symbol_ids[&Symbol::end()].clone(), - ); - - // Build symbol map to find canonical symbols for aliases - let mut symbol_map = HashMap::new(); - for symbol in &parse_table.symbols { - let mut mapping = symbol; - - if let Some(alias) = default_aliases.get(symbol) { - let kind = alias.kind(); - for other_symbol in &parse_table.symbols { - if let Some(other_alias) = default_aliases.get(other_symbol) { - if other_symbol < mapping && other_alias == alias { - mapping = other_symbol; - } - } else { - let (other_name, other_kind) = - metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); - if (other_name, other_kind) == (alias.value.as_str(), kind) { - mapping = other_symbol; - break; - } - } - } - } else if symbol.is_terminal() { - let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); - for other_symbol in &parse_table.symbols { - let other_metadata = - metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); - if other_metadata == metadata { - if let Some(mapped) = symbol_map.get(other_symbol) { - if mapped == symbol { - break; - } - } - mapping = other_symbol; - break; - } - } - } - - symbol_map.insert(*symbol, *mapping); - } - - // Generate alias IDs - for production_info in &parse_table.production_infos { - for alias in &production_info.alias_sequence { - if let Some(alias) = &alias { - // Find symbols that match this alias - let matching_symbols: Vec<Symbol> = parse_table - .symbols - .iter() - .copied() - .filter(|symbol| { - default_aliases.get(symbol).map_or_else( - || { - let (name, kind) = - metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); - name == alias.value && kind == alias.kind() - }, - |default_alias| default_alias == 
alias, - ) - }) - .collect(); - - // Some aliases match an existing symbol in the grammar. - let alias_id = if let Some(existing_symbol) = matching_symbols.first() { - symbol_ids[&symbol_map[existing_symbol]].0.clone() - } - // Other aliases don't match any existing symbol, and need their own identifiers. - else { - if let Err(i) = unique_aliases.binary_search(alias) { - unique_aliases.insert(i, alias.clone()); - } - - if alias.is_named { - format!("alias_sym_{}", sanitize_identifier(&alias.value)) - } else { - format!("anon_alias_sym_{}", sanitize_identifier(&alias.value)) - } - }; - - alias_ids.entry(alias.clone()).or_insert(alias_id); - } - } - } - - (symbol_ids, alias_ids, unique_aliases) -} - /// Returns a String of C code for the given components of a parser. /// /// # Arguments