refactor: extract symbol ID generation and helper functions
- Moved symbol ID generation logic out of renderer initialization into a standalone function
- Extracted sanitize_identifier and metadata_for_symbol as reusable helper functions
- Symbol IDs are now computed before rendering and passed to the renderer constructor
This commit is contained in:
parent
13d4db8bb4
commit
3b8a653167
2 changed files with 314 additions and 148 deletions
|
|
@ -41,7 +41,7 @@ use parse_grammar::parse_grammar;
|
|||
pub use parse_grammar::ParseGrammarError;
|
||||
use prepare_grammar::prepare_grammar;
|
||||
pub use prepare_grammar::PrepareGrammarError;
|
||||
use render::render_c_code;
|
||||
use render::{generate_symbol_ids, render_c_code};
|
||||
pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
|
||||
|
||||
static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
||||
|
|
@ -373,12 +373,24 @@ fn generate_parser_for_grammar_with_opts(
|
|||
report_symbol_name,
|
||||
optimizations,
|
||||
)?;
|
||||
|
||||
// Generate symbol IDs before rendering C code
|
||||
let (symbol_ids, alias_ids, unique_aliases) = generate_symbol_ids(
|
||||
&tables.parse_table,
|
||||
&syntax_grammar,
|
||||
&lexical_grammar,
|
||||
&simple_aliases,
|
||||
);
|
||||
|
||||
let c_code = render_c_code(
|
||||
&input_grammar.name,
|
||||
tables,
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
simple_aliases,
|
||||
symbol_ids,
|
||||
alias_ids,
|
||||
unique_aliases,
|
||||
abi_version,
|
||||
semantic_version,
|
||||
supertype_symbol_map,
|
||||
|
|
|
|||
|
|
@ -175,15 +175,7 @@ impl Generator {
|
|||
}
|
||||
|
||||
fn init(&mut self) {
|
||||
let mut symbol_identifiers = HashSet::new();
|
||||
for i in 0..self.parse_table.symbols.len() {
|
||||
self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers);
|
||||
}
|
||||
self.symbol_ids.insert(
|
||||
Symbol::end_of_nonterminal_extra(),
|
||||
self.symbol_ids[&Symbol::end()].clone(),
|
||||
);
|
||||
|
||||
// symbol_ids and alias_ids are now passed in from the constructor
|
||||
self.symbol_map = HashMap::new();
|
||||
|
||||
for symbol in &self.parse_table.symbols {
|
||||
|
|
@ -1708,54 +1700,13 @@ impl Generator {
|
|||
)
|
||||
}
|
||||
|
||||
fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet<String>) {
|
||||
let mut id;
|
||||
if symbol == Symbol::end() {
|
||||
id = "ts_builtin_sym_end".to_string();
|
||||
} else {
|
||||
let (name, kind) = self.metadata_for_symbol(symbol);
|
||||
id = match kind {
|
||||
VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)),
|
||||
VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)),
|
||||
VariableType::Hidden | VariableType::Named => {
|
||||
format!("sym_{}", self.sanitize_identifier(name))
|
||||
}
|
||||
};
|
||||
|
||||
let mut suffix_number = 1;
|
||||
let mut suffix = String::new();
|
||||
while used_identifiers.contains(&id) {
|
||||
id.drain(id.len() - suffix.len()..);
|
||||
suffix_number += 1;
|
||||
suffix = suffix_number.to_string();
|
||||
id += &suffix;
|
||||
}
|
||||
}
|
||||
|
||||
used_identifiers.insert(id.clone());
|
||||
self.symbol_ids.insert(symbol, id);
|
||||
}
|
||||
|
||||
fn field_id(&self, field_name: &str) -> String {
|
||||
format!("field_{field_name}")
|
||||
}
|
||||
|
||||
fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
|
||||
match symbol.kind {
|
||||
SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
|
||||
SymbolType::NonTerminal => {
|
||||
let variable = &self.syntax_grammar.variables[symbol.index];
|
||||
(&variable.name, variable.kind)
|
||||
}
|
||||
SymbolType::Terminal => {
|
||||
let variable = &self.lexical_grammar.variables[symbol.index];
|
||||
(&variable.name, variable.kind)
|
||||
}
|
||||
SymbolType::External => {
|
||||
let token = &self.syntax_grammar.external_tokens[symbol.index];
|
||||
(&token.name, token.kind)
|
||||
}
|
||||
}
|
||||
metadata_for_symbol(symbol, &self.syntax_grammar, &self.lexical_grammar)
|
||||
}
|
||||
|
||||
fn symbols_for_alias(&self, alias: &Alias) -> Vec<Symbol> {
|
||||
|
|
@ -1776,98 +1727,7 @@ impl Generator {
|
|||
}
|
||||
|
||||
fn sanitize_identifier(&self, name: &str) -> String {
|
||||
let mut result = String::with_capacity(name.len());
|
||||
for c in name.chars() {
|
||||
if c.is_ascii_alphanumeric() || c == '_' {
|
||||
result.push(c);
|
||||
} else {
|
||||
'special_chars: {
|
||||
let replacement = match c {
|
||||
' ' if name.len() == 1 => "SPACE",
|
||||
'~' => "TILDE",
|
||||
'`' => "BQUOTE",
|
||||
'!' => "BANG",
|
||||
'@' => "AT",
|
||||
'#' => "POUND",
|
||||
'$' => "DOLLAR",
|
||||
'%' => "PERCENT",
|
||||
'^' => "CARET",
|
||||
'&' => "AMP",
|
||||
'*' => "STAR",
|
||||
'(' => "LPAREN",
|
||||
')' => "RPAREN",
|
||||
'-' => "DASH",
|
||||
'+' => "PLUS",
|
||||
'=' => "EQ",
|
||||
'{' => "LBRACE",
|
||||
'}' => "RBRACE",
|
||||
'[' => "LBRACK",
|
||||
']' => "RBRACK",
|
||||
'\\' => "BSLASH",
|
||||
'|' => "PIPE",
|
||||
':' => "COLON",
|
||||
';' => "SEMI",
|
||||
'"' => "DQUOTE",
|
||||
'\'' => "SQUOTE",
|
||||
'<' => "LT",
|
||||
'>' => "GT",
|
||||
',' => "COMMA",
|
||||
'.' => "DOT",
|
||||
'?' => "QMARK",
|
||||
'/' => "SLASH",
|
||||
'\n' => "LF",
|
||||
'\r' => "CR",
|
||||
'\t' => "TAB",
|
||||
'\0' => "NULL",
|
||||
'\u{0001}' => "SOH",
|
||||
'\u{0002}' => "STX",
|
||||
'\u{0003}' => "ETX",
|
||||
'\u{0004}' => "EOT",
|
||||
'\u{0005}' => "ENQ",
|
||||
'\u{0006}' => "ACK",
|
||||
'\u{0007}' => "BEL",
|
||||
'\u{0008}' => "BS",
|
||||
'\u{000b}' => "VTAB",
|
||||
'\u{000c}' => "FF",
|
||||
'\u{000e}' => "SO",
|
||||
'\u{000f}' => "SI",
|
||||
'\u{0010}' => "DLE",
|
||||
'\u{0011}' => "DC1",
|
||||
'\u{0012}' => "DC2",
|
||||
'\u{0013}' => "DC3",
|
||||
'\u{0014}' => "DC4",
|
||||
'\u{0015}' => "NAK",
|
||||
'\u{0016}' => "SYN",
|
||||
'\u{0017}' => "ETB",
|
||||
'\u{0018}' => "CAN",
|
||||
'\u{0019}' => "EM",
|
||||
'\u{001a}' => "SUB",
|
||||
'\u{001b}' => "ESC",
|
||||
'\u{001c}' => "FS",
|
||||
'\u{001d}' => "GS",
|
||||
'\u{001e}' => "RS",
|
||||
'\u{001f}' => "US",
|
||||
'\u{007F}' => "DEL",
|
||||
'\u{FEFF}' => "BOM",
|
||||
'\u{0080}'..='\u{FFFF}' => {
|
||||
write!(result, "u{:04x}", c as u32).unwrap();
|
||||
break 'special_chars;
|
||||
}
|
||||
'\u{10000}'..='\u{10FFFF}' => {
|
||||
write!(result, "U{:08x}", c as u32).unwrap();
|
||||
break 'special_chars;
|
||||
}
|
||||
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(),
|
||||
' ' => break 'special_chars,
|
||||
};
|
||||
if !result.is_empty() && !result.ends_with('_') {
|
||||
result.push('_');
|
||||
}
|
||||
result += replacement;
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
sanitize_identifier(name)
|
||||
}
|
||||
|
||||
fn sanitize_string(&self, name: &str) -> String {
|
||||
|
|
@ -1915,23 +1775,311 @@ impl Generator {
|
|||
}
|
||||
}
|
||||
|
||||
/// Converts an arbitrary symbol name into text usable inside a C identifier.
///
/// * ASCII letters, digits and `_` are copied through unchanged.
/// * ASCII punctuation and control characters are spelled out as an
///   upper-case word (`+` becomes `PLUS`, `\n` becomes `LF`, ...). A single
///   `_` is placed before the word unless the output is still empty or
///   already ends in `_`.
/// * A space is spelled `SPACE` only when it is the entire name; spaces inside
///   longer names are dropped.
/// * Any other character is written as its hex code point: `uXXXX` up to
///   U+FFFF (except U+FEFF, which becomes `BOM`) and `UXXXXXXXX` above that,
///   with no separating underscore.
fn sanitize_identifier(name: &str) -> String {
    // Only a name that consists of exactly one space gets the word "SPACE".
    let lone_space = name.len() == 1;
    let mut ident = String::with_capacity(name.len());

    for ch in name.chars() {
        // Arms that finish handling the character themselves `continue`;
        // every other arm yields the word that spells the character out.
        let word = match ch {
            '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => {
                ident.push(ch);
                continue;
            }
            ' ' if lone_space => "SPACE",
            ' ' => continue,
            '~' => "TILDE",
            '`' => "BQUOTE",
            '!' => "BANG",
            '@' => "AT",
            '#' => "POUND",
            '$' => "DOLLAR",
            '%' => "PERCENT",
            '^' => "CARET",
            '&' => "AMP",
            '*' => "STAR",
            '(' => "LPAREN",
            ')' => "RPAREN",
            '-' => "DASH",
            '+' => "PLUS",
            '=' => "EQ",
            '{' => "LBRACE",
            '}' => "RBRACE",
            '[' => "LBRACK",
            ']' => "RBRACK",
            '\\' => "BSLASH",
            '|' => "PIPE",
            ':' => "COLON",
            ';' => "SEMI",
            '"' => "DQUOTE",
            '\'' => "SQUOTE",
            '<' => "LT",
            '>' => "GT",
            ',' => "COMMA",
            '.' => "DOT",
            '?' => "QMARK",
            '/' => "SLASH",
            '\n' => "LF",
            '\r' => "CR",
            '\t' => "TAB",
            '\0' => "NULL",
            '\u{0001}' => "SOH",
            '\u{0002}' => "STX",
            '\u{0003}' => "ETX",
            '\u{0004}' => "EOT",
            '\u{0005}' => "ENQ",
            '\u{0006}' => "ACK",
            '\u{0007}' => "BEL",
            '\u{0008}' => "BS",
            '\u{000b}' => "VTAB",
            '\u{000c}' => "FF",
            '\u{000e}' => "SO",
            '\u{000f}' => "SI",
            '\u{0010}' => "DLE",
            '\u{0011}' => "DC1",
            '\u{0012}' => "DC2",
            '\u{0013}' => "DC3",
            '\u{0014}' => "DC4",
            '\u{0015}' => "NAK",
            '\u{0016}' => "SYN",
            '\u{0017}' => "ETB",
            '\u{0018}' => "CAN",
            '\u{0019}' => "EM",
            '\u{001a}' => "SUB",
            '\u{001b}' => "ESC",
            '\u{001c}' => "FS",
            '\u{001d}' => "GS",
            '\u{001e}' => "RS",
            '\u{001f}' => "US",
            '\u{007F}' => "DEL",
            // Must stay ahead of the BMP range below, which also contains it.
            '\u{FEFF}' => "BOM",
            '\u{0080}'..='\u{FFFF}' => {
                write!(ident, "u{:04x}", ch as u32).unwrap();
                continue;
            }
            '\u{10000}'..='\u{10FFFF}' => {
                write!(ident, "U{:08x}", ch as u32).unwrap();
                continue;
            }
        };

        // Keep spelled-out words apart from whatever precedes them.
        if !(ident.is_empty() || ident.ends_with('_')) {
            ident.push('_');
        }
        ident.push_str(word);
    }

    ident
}
|
||||
|
||||
/// Helper function to get metadata for a symbol.
|
||||
fn metadata_for_symbol<'a>(
|
||||
symbol: Symbol,
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
) -> (&'a str, VariableType) {
|
||||
match symbol.kind {
|
||||
SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
|
||||
SymbolType::NonTerminal => {
|
||||
let variable = &syntax_grammar.variables[symbol.index];
|
||||
(&variable.name as &str, variable.kind)
|
||||
}
|
||||
SymbolType::Terminal => {
|
||||
let variable = &lexical_grammar.variables[symbol.index];
|
||||
(&variable.name as &str, variable.kind)
|
||||
}
|
||||
SymbolType::External => {
|
||||
let token = &syntax_grammar.external_tokens[symbol.index];
|
||||
(&token.name as &str, token.kind)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to assign a symbol ID.
|
||||
fn assign_symbol_id(
|
||||
symbol: Symbol,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
symbol_ids: &mut HashMap<Symbol, String>,
|
||||
used_identifiers: &mut HashSet<String>,
|
||||
) {
|
||||
let mut id;
|
||||
if symbol == Symbol::end() {
|
||||
id = "ts_builtin_sym_end".to_string();
|
||||
} else {
|
||||
let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar);
|
||||
id = match kind {
|
||||
VariableType::Auxiliary => format!("aux_sym_{}", sanitize_identifier(name)),
|
||||
VariableType::Anonymous => format!("anon_sym_{}", sanitize_identifier(name)),
|
||||
VariableType::Hidden | VariableType::Named => {
|
||||
format!("sym_{}", sanitize_identifier(name))
|
||||
}
|
||||
};
|
||||
|
||||
let mut suffix_number = 1;
|
||||
let mut suffix = String::new();
|
||||
while used_identifiers.contains(&id) {
|
||||
id.drain(id.len() - suffix.len()..);
|
||||
suffix_number += 1;
|
||||
suffix = suffix_number.to_string();
|
||||
id += &suffix;
|
||||
}
|
||||
}
|
||||
|
||||
used_identifiers.insert(id.clone());
|
||||
symbol_ids.insert(symbol, id);
|
||||
}
|
||||
|
||||
/// Generates symbol IDs and alias IDs for the given parse table and grammars.
|
||||
///
|
||||
/// This function must be called before `render_c_code` to generate the symbol mappings
|
||||
/// that will be used in the generated C code.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `parse_table` - The generated parse table for the language
|
||||
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
|
||||
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
|
||||
/// * `default_aliases` - A map describing the global rename rules that should apply
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A tuple containing:
|
||||
/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string
|
||||
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
|
||||
/// * `unique_aliases` - Sorted vector of unique aliases
|
||||
pub fn generate_symbol_ids(
|
||||
parse_table: &ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
default_aliases: &AliasMap,
|
||||
) -> (HashMap<Symbol, String>, HashMap<Alias, String>, Vec<Alias>) {
|
||||
let mut symbol_ids = HashMap::new();
|
||||
let mut alias_ids = HashMap::new();
|
||||
let mut unique_aliases = Vec::new();
|
||||
let mut symbol_identifiers = HashSet::new();
|
||||
|
||||
// Generate symbol IDs
|
||||
for i in 0..parse_table.symbols.len() {
|
||||
assign_symbol_id(
|
||||
parse_table.symbols[i],
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
&mut symbol_ids,
|
||||
&mut symbol_identifiers,
|
||||
);
|
||||
}
|
||||
|
||||
symbol_ids.insert(
|
||||
Symbol::end_of_nonterminal_extra(),
|
||||
symbol_ids[&Symbol::end()].clone(),
|
||||
);
|
||||
|
||||
// Build symbol map to find canonical symbols for aliases
|
||||
let mut symbol_map = HashMap::new();
|
||||
for symbol in &parse_table.symbols {
|
||||
let mut mapping = symbol;
|
||||
|
||||
if let Some(alias) = default_aliases.get(symbol) {
|
||||
let kind = alias.kind();
|
||||
for other_symbol in &parse_table.symbols {
|
||||
if let Some(other_alias) = default_aliases.get(other_symbol) {
|
||||
if other_symbol < mapping && other_alias == alias {
|
||||
mapping = other_symbol;
|
||||
}
|
||||
} else {
|
||||
let (other_name, other_kind) =
|
||||
metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
|
||||
if (other_name, other_kind) == (alias.value.as_str(), kind) {
|
||||
mapping = other_symbol;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if symbol.is_terminal() {
|
||||
let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
|
||||
for other_symbol in &parse_table.symbols {
|
||||
let other_metadata =
|
||||
metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
|
||||
if other_metadata == metadata {
|
||||
if let Some(mapped) = symbol_map.get(other_symbol) {
|
||||
if mapped == symbol {
|
||||
break;
|
||||
}
|
||||
}
|
||||
mapping = other_symbol;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
symbol_map.insert(*symbol, *mapping);
|
||||
}
|
||||
|
||||
// Generate alias IDs
|
||||
for production_info in &parse_table.production_infos {
|
||||
for alias in &production_info.alias_sequence {
|
||||
if let Some(alias) = &alias {
|
||||
// Find symbols that match this alias
|
||||
let matching_symbols: Vec<Symbol> = parse_table
|
||||
.symbols
|
||||
.iter()
|
||||
.copied()
|
||||
.filter(|symbol| {
|
||||
default_aliases.get(symbol).map_or_else(
|
||||
|| {
|
||||
let (name, kind) =
|
||||
metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
|
||||
name == alias.value && kind == alias.kind()
|
||||
},
|
||||
|default_alias| default_alias == alias,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Some aliases match an existing symbol in the grammar.
|
||||
let alias_id = if let Some(existing_symbol) = matching_symbols.first() {
|
||||
symbol_ids[&symbol_map[existing_symbol]].clone()
|
||||
}
|
||||
// Other aliases don't match any existing symbol, and need their own identifiers.
|
||||
else {
|
||||
if let Err(i) = unique_aliases.binary_search(alias) {
|
||||
unique_aliases.insert(i, alias.clone());
|
||||
}
|
||||
|
||||
if alias.is_named {
|
||||
format!("alias_sym_{}", sanitize_identifier(&alias.value))
|
||||
} else {
|
||||
format!("anon_alias_sym_{}", sanitize_identifier(&alias.value))
|
||||
}
|
||||
};
|
||||
|
||||
alias_ids.entry(alias.clone()).or_insert(alias_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(symbol_ids, alias_ids, unique_aliases)
|
||||
}
|
||||
|
||||
/// Returns a String of C code for the given components of a parser.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `name` - A string slice containing the name of the language
|
||||
/// * `parse_table` - The generated parse table for the language
|
||||
/// * `main_lex_table` - The generated lexing table for the language
|
||||
/// * `keyword_lex_table` - The generated keyword lexing table for the language
|
||||
/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any.
|
||||
/// * `tables` - The generated tables for the language
|
||||
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
|
||||
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
|
||||
/// * `default_aliases` - A map describing the global rename rules that should apply. The keys are
|
||||
/// symbols that are *always* aliased in the same way, and the values are the aliases that are
|
||||
/// applied to those symbols.
|
||||
/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string
|
||||
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
|
||||
/// * `unique_aliases` - Sorted vector of unique aliases
|
||||
/// * `abi_version` - The language ABI version that should be generated. Usually you want
|
||||
/// Tree-sitter's current version, but right after making an ABI change, it may be useful to
|
||||
/// generate code with the previous ABI.
|
||||
/// * `semantic_version` - Optional semantic version of the parser
|
||||
/// * `supertype_symbol_map` - Map of supertype symbols
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn render_c_code(
|
||||
name: &str,
|
||||
|
|
@ -1939,6 +2087,9 @@ pub fn render_c_code(
|
|||
syntax_grammar: SyntaxGrammar,
|
||||
lexical_grammar: LexicalGrammar,
|
||||
default_aliases: AliasMap,
|
||||
symbol_ids: HashMap<Symbol, String>,
|
||||
alias_ids: HashMap<Alias, String>,
|
||||
unique_aliases: Vec<Alias>,
|
||||
abi_version: usize,
|
||||
semantic_version: Option<(u8, u8, u8)>,
|
||||
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
|
||||
|
|
@ -1958,6 +2109,9 @@ pub fn render_c_code(
|
|||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
default_aliases,
|
||||
symbol_ids,
|
||||
alias_ids,
|
||||
unique_aliases,
|
||||
abi_version,
|
||||
metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata {
|
||||
major_version,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue