From 21c9f9ae4f1c00c747fd08fd6201b1dc1564ad7e Mon Sep 17 00:00:00 2001 From: bglgwyng Date: Fri, 7 Nov 2025 17:29:05 +0900 Subject: [PATCH] refactor: change symbol_ids to store both string and numeric IDs - Modified symbol_ids HashMap to store tuples of (String, u16) instead of just String - Updated symbol ID generation to assign numeric IDs sequentially (0 for end symbol, then 1, 2, 3...) - Changed all symbol_ids access patterns throughout codebase to use tuple destructuring (.0 for string, .1 for numeric) - Updated node_types.json to use numeric u16 symbol_id instead of String --- crates/generate/src/generate.rs | 4 +- crates/generate/src/node_types.rs | 12 ++-- crates/generate/src/render.rs | 95 ++++++++++++++++++------------- 3 files changed, 63 insertions(+), 48 deletions(-) diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index e432b9fe..a5f067de 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ -78,7 +78,7 @@ struct GrammarIntrospection { variable_info: Vec, supertype_symbol_map: BTreeMap>, tables: Tables, - symbol_ids: HashMap, + symbol_ids: HashMap, alias_ids: HashMap, unique_aliases: Vec, } @@ -405,7 +405,7 @@ fn introspect_grammar( optimizations, )?; - // Generate symbol IDs before rendering C code + // Generate symbol IDs (both string and numeric) before rendering C code let (symbol_ids, alias_ids, unique_aliases) = generate_symbol_ids( &tables.parse_table, &syntax_grammar, diff --git a/crates/generate/src/node_types.rs b/crates/generate/src/node_types.rs index cbacf4b2..b559f870 100644 --- a/crates/generate/src/node_types.rs +++ b/crates/generate/src/node_types.rs @@ -46,7 +46,7 @@ pub struct NodeInfoJSON { #[serde(skip_serializing_if = "Option::is_none")] subtypes: Option>, #[serde(skip_serializing_if = "Option::is_none")] - symbol_id: Option, + symbol_id: Option, } #[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -475,7 +475,7 @@ pub fn generate_node_types_json( lexical_grammar: &LexicalGrammar, default_aliases: &AliasMap, variable_info: &[VariableInfo], - symbol_ids: &HashMap, + symbol_ids: &HashMap, ) -> SuperTypeCycleResult> { let mut node_types_json = BTreeMap::new(); @@ -575,7 +575,7 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, - symbol_id: symbol_ids.get(&symbol).cloned(), + symbol_id: symbol_ids.get(&symbol).map(|t| t.1), }); let mut subtypes = info .children @@ -620,7 +620,7 @@ pub fn generate_node_types_json( fields: Some(BTreeMap::new()), children: None, subtypes: None, - symbol_id: symbol_ids.get(&symbol).cloned(), + symbol_id: symbol_ids.get(&symbol).map(|t| t.1), } }); @@ -758,7 +758,7 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, - symbol_id: symbol_ids.get(&symbol).cloned(), + symbol_id: symbol_ids.get(&symbol).map(|t| t.1), }); if let Some(children) = &mut node_type_json.children { children.required = false; @@ -777,7 +777,7 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, - symbol_id: symbol_ids.get(&symbol).cloned(), + symbol_id: symbol_ids.get(&symbol).map(|t| t.1), }), _ => {} } diff --git a/crates/generate/src/render.rs b/crates/generate/src/render.rs index fe38aa02..d831a16d 100644 --- a/crates/generate/src/render.rs +++ b/crates/generate/src/render.rs @@ -78,8 +78,7 @@ struct Generator { syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, - symbol_order: HashMap, - symbol_ids: HashMap, + symbol_ids: HashMap, alias_ids: HashMap, unique_aliases: Vec, symbol_map: HashMap, @@ -236,7 +235,7 @@ impl Generator { // Some aliases match an existing symbol in the grammar. let alias_id = if let Some(existing_symbol) = self.symbols_for_alias(alias).first() { - self.symbol_ids[&self.symbol_map[existing_symbol]].clone() + self.symbol_ids[&self.symbol_map[existing_symbol]].0.clone() } // Other aliases don't match any existing symbol, and need their own // identifiers. @@ -264,7 +263,7 @@ impl Generator { .count() + 1; let constant_name = if let Some(symbol) = symbol { - format!("{}_character_set_{}", self.symbol_ids[symbol], count) + format!("{}_character_set_{}", self.symbol_ids[symbol].0, count) } else { format!("extras_character_set_{count}") }; @@ -294,7 +293,7 @@ impl Generator { for (supertype, subtypes) in &self.supertype_symbol_map { if let Some(supertype) = self.symbol_ids.get(supertype) { self.supertype_map - .entry(supertype.clone()) + .entry(supertype.0.clone()) .or_insert_with(|| subtypes.clone()); } } @@ -416,18 +415,19 @@ impl Generator { fn add_symbol_enum(&mut self) { add_line!(self, "enum ts_symbol_identifiers {{"); indent!(self); - self.symbol_order.insert(Symbol::end(), 0); - let mut i = 1; + // symbol_ids already contains both string ID and numeric ID for symbol in &self.parse_table.symbols { - if *symbol != Symbol::end() { - self.symbol_order.insert(*symbol, i); - add_line!(self, "{} = {i},", self.symbol_ids[symbol]); - i += 1; + if *symbol == Symbol::end() { + continue; } + let (string_id, numeric_id) = &self.symbol_ids[symbol]; + add_line!(self, "{} = {numeric_id},", string_id); } - for alias in &self.unique_aliases { + // Add aliases after all symbols + let alias_start = self.parse_table.symbols.len(); + for (idx, alias) in self.unique_aliases.iter().enumerate() { + let i = alias_start + idx; add_line!(self, "{} = {i},", self.alias_ids[alias]); - i += 1; } dedent!(self); add_line!(self, "}};"); @@ -445,7 +445,7 @@ impl Generator { alias.value.as_str() }), ); - add_line!(self, "[{}] = \"{name}\",", self.symbol_ids[symbol]); + add_line!(self, "[{}] = \"{name}\",", self.symbol_ids[symbol].0); } for alias in &self.unique_aliases { add_line!( @@ -467,8 +467,8 @@ impl Generator { add_line!( self, "[{}] = {},", - self.symbol_ids[symbol], - self.symbol_ids[&self.symbol_map[symbol]], + self.symbol_ids[symbol].0, + self.symbol_ids[&self.symbol_map[symbol]].0, ); } @@ -516,7 +516,7 @@ impl Generator { ); indent!(self); for symbol in &self.parse_table.symbols { - add_line!(self, "[{}] = {{", self.symbol_ids[symbol]); + add_line!(self, "[{}] = {{", self.symbol_ids[symbol].0); indent!(self); if let Some(Alias { is_named, .. }) = self.default_aliases.get(symbol) { add_line!(self, ".visible = true,"); @@ -623,8 +623,8 @@ impl Generator { ); indent!(self); for (symbol, alias_ids) in alias_ids_by_symbol { - let symbol_id = &self.symbol_ids[symbol]; - let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]]; + let symbol_id = &self.symbol_ids[symbol].0; + let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]].0; add_line!(self, "{symbol_id}, {},", 1 + alias_ids.len()); indent!(self); add_line!(self, "{public_symbol_id},"); @@ -761,13 +761,15 @@ impl Generator { subtypes .iter() .flat_map(|s| match s { - ChildType::Normal(symbol) => vec![self.symbol_ids.get(symbol).cloned()], + ChildType::Normal(symbol) => { + vec![self.symbol_ids.get(symbol).map(|t| t.0.clone())] + } ChildType::Aliased(alias) => { self.alias_ids.get(alias).cloned().map_or_else( || { self.symbols_for_alias(alias) .into_iter() - .map(|s| self.symbol_ids.get(&s).cloned()) + .map(|s| self.symbol_ids.get(&s).map(|t| t.0.clone())) .collect() }, |a| vec![Some(a)], @@ -846,7 +848,7 @@ impl Generator { fn add_lex_state(&mut self, _state_ix: usize, state: LexState) { if let Some(accept_action) = state.accept_action { - add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); + add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action].0); } if let Some(eof_action) = state.eof_action { @@ -1190,7 +1192,7 @@ impl Generator { add_line!(self, "[{id}] = {{"); indent!(self); for token in set.iter() { - add_line!(self, "{},", self.symbol_ids[&token]); + add_line!(self, "{},", self.symbol_ids[&token].0); } dedent!(self); add_line!(self, "}},"); @@ -1230,7 +1232,7 @@ impl Generator { self, "[{}] = {},", self.external_token_id(token), - self.symbol_ids[&id_token], + self.symbol_ids[&id_token].0, ); } dedent!(self); @@ -1304,14 +1306,14 @@ impl Generator { nonterminal_entries.clear(); terminal_entries.extend(state.terminal_entries.iter()); nonterminal_entries.extend(state.nonterminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1)); nonterminal_entries.sort_unstable_by_key(|k| k.0); for (symbol, action) in &nonterminal_entries { add_line!( self, "[{}] = STATE({}),", - self.symbol_ids[symbol], + self.symbol_ids[symbol].0, match action { GotoAction::Goto(state) => *state, GotoAction::ShiftExtra => i, @@ -1325,7 +1327,11 @@ impl Generator { &mut parse_table_entries, &mut next_parse_action_list_index, ); - add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]); + add_line!( + self, + "[{}] = ACTIONS({entry_id}),", + self.symbol_ids[symbol].0 + ); } dedent!(self); @@ -1354,7 +1360,7 @@ impl Generator { terminal_entries.clear(); terminal_entries.extend(state.terminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1)); // In a given parse state, many lookahead symbols have the same actions. // So in the "small state" representation, group symbols by their action @@ -1407,7 +1413,7 @@ impl Generator { symbols.sort_unstable(); indent!(self); for symbol in symbols { - add_line!(self, "{},", self.symbol_ids[symbol]); + add_line!(self, "{},", self.symbol_ids[symbol].0); } dedent!(self); } @@ -1483,7 +1489,7 @@ impl Generator { add!( self, "REDUCE({}, {child_count}, {dynamic_precedence}, {production_id})", - self.symbol_ids[&symbol] + self.symbol_ids[&symbol].0 ); } } @@ -1595,7 +1601,7 @@ impl Generator { add_line!( self, ".keyword_capture_token = {},", - self.symbol_ids[&keyword_capture_token] + self.symbol_ids[&keyword_capture_token].0 ); } @@ -1898,8 +1904,9 @@ fn assign_symbol_id( symbol: Symbol, syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - symbol_ids: &mut HashMap, + symbol_ids: &mut HashMap, used_identifiers: &mut HashSet, + numeric_id: u16, ) { let mut id; if symbol == Symbol::end() { @@ -1925,7 +1932,7 @@ fn assign_symbol_id( } used_identifiers.insert(id.clone()); - symbol_ids.insert(symbol, id); + symbol_ids.insert(symbol, (id, numeric_id)); } /// Generates symbol IDs and alias IDs for the given parse table and grammars. @@ -1943,7 +1950,7 @@ fn assign_symbol_id( /// # Returns /// /// A tuple containing: -/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string +/// * `symbol_ids` - HashMap mapping each Symbol to (C identifier string, numeric ID) /// * `alias_ids` - HashMap mapping each Alias to its C identifier string /// * `unique_aliases` - Sorted vector of unique aliases pub fn generate_symbol_ids( @@ -1951,21 +1958,29 @@ pub fn generate_symbol_ids( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, default_aliases: &AliasMap, -) -> (HashMap, HashMap, Vec) { +) -> ( + HashMap, + HashMap, + Vec, +) { let mut symbol_ids = HashMap::new(); let mut alias_ids = HashMap::new(); let mut unique_aliases = Vec::new(); let mut symbol_identifiers = HashSet::new(); - // Generate symbol IDs - for i in 0..parse_table.symbols.len() { + // Generate symbol IDs with numeric IDs + // Symbol::end() gets 0, then other symbols get 1, 2, 3... + let mut numeric_id = 0u16; + for &symbol in &parse_table.symbols { assign_symbol_id( - parse_table.symbols[i], + symbol, syntax_grammar, lexical_grammar, &mut symbol_ids, &mut symbol_identifiers, + numeric_id, ); + numeric_id += 1; } symbol_ids.insert( @@ -2037,7 +2052,7 @@ pub fn generate_symbol_ids( // Some aliases match an existing symbol in the grammar. let alias_id = if let Some(existing_symbol) = matching_symbols.first() { - symbol_ids[&symbol_map[existing_symbol]].clone() + symbol_ids[&symbol_map[existing_symbol]].0.clone() } // Other aliases don't match any existing symbol, and need their own identifiers. else { @@ -2086,7 +2101,7 @@ pub fn render_c_code( syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, - symbol_ids: HashMap, + symbol_ids: HashMap, alias_ids: HashMap, unique_aliases: Vec, abi_version: usize,