diff --git a/crates/cli/src/tests/node_test.rs b/crates/cli/src/tests/node_test.rs index 614bfdb9..85a83ef9 100644 --- a/crates/cli/src/tests/node_test.rs +++ b/crates/cli/src/tests/node_test.rs @@ -1,5 +1,7 @@ +use std::{collections::HashMap, fs}; + use tree_sitter::{InputEdit, Node, Parser, Point, Tree}; -use tree_sitter_generate::load_grammar_file; +use tree_sitter_generate::{load_grammar_file, NodeInfoJSON}; use super::{ get_random_edit, @@ -1243,3 +1245,120 @@ fn parse_json_example() -> Tree { parser.set_language(&get_language("json")).unwrap(); parser.parse(JSON_EXAMPLE, None).unwrap() } + +fn test_parser_and_node_types_compatibility(grammar_name: &str) { + let language = get_language(grammar_name); + let node_types: Vec = { + let node_types_path = fixtures_dir() + .join("grammars") + .join(grammar_name) + .join("src") + .join("node-types.json"); + + let node_types_content = fs::read_to_string(&node_types_path) + .unwrap_or_else(|_| panic!("Failed to read node-types.json at {:?}", node_types_path)); + + serde_json::from_str(&node_types_content) + .unwrap_or_else(|e| panic!("Failed to parse node-types.json: {}", e)) + }; + + let symbol_ids_by_kind_from_node_types: HashMap<(String, bool), Option<&Vec>> = node_types + .iter() + .map(|node_type| { + ( + (node_type.kind.clone(), node_type.named), + node_type.symbol_ids.as_ref(), // .unwrap_or(Vec::new()), + ) + }) + .collect(); + let kind_count = language.node_kind_count(); + + let mut symbol_ids_by_kind_from_language: HashMap<(String, bool), Vec> = HashMap::new(); + for i in 0..kind_count as u16 { + let kind = language.node_kind_for_id(i).unwrap(); + let named = language.node_kind_is_named(i); + + // TODO: find a better way + // In Go grammar, there is a node kind with an empty string. + // In node-types.json, it is "\u0000" but in parser.c, it is "\0" and not distinguishable from "". 
+ let kind = if kind.is_empty() { + "\0".to_string() + } else { + kind.to_string() + }; + + symbol_ids_by_kind_from_language + .entry((kind, named)) + .or_insert_with(Vec::new) + .push(i); + } + + for (key, symbol_ids) in symbol_ids_by_kind_from_node_types { + assert_eq!( + symbol_ids, + symbol_ids_by_kind_from_language.get(&key), + "{:?}", + key + ); + } +} + +#[test] +fn test_bash_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("bash"); +} + +#[test] +fn test_c_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("c"); +} + +#[test] +fn test_cpp_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("cpp"); +} + +#[test] +fn test_go_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("go"); +} + +#[test] +fn test_html_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("html"); +} + +#[test] +fn test_java_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("java"); +} + +#[test] +fn test_javascript_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("javascript"); +} + +#[test] +fn test_jsdoc_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("jsdoc"); +} + +#[test] +fn test_json_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("json"); +} + +#[test] +fn test_python_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("python"); +} + +#[test] +fn test_ruby_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("ruby"); +} + +#[test] +fn test_rust_parser_and_node_types_compatibility() { + test_parser_and_node_types_compatibility("rust"); +} diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index 6a005637..f203648a 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ 
-1,4 +1,4 @@ -use std::{collections::BTreeMap, sync::LazyLock}; +use std::sync::LazyLock; #[cfg(feature = "load")] use std::{ env, fs, @@ -9,9 +9,7 @@ use std::{ use bitflags::bitflags; use log::warn; -use node_types::VariableInfo; use regex::{Regex, RegexBuilder}; -use rules::{Alias, Symbol}; #[cfg(feature = "load")] use semver::Version; #[cfg(feature = "load")] @@ -22,6 +20,7 @@ use thiserror::Error; mod build_tables; mod dedup; mod grammars; +mod introspect_grammar; mod nfa; mod node_types; pub mod parse_grammar; @@ -32,13 +31,13 @@ mod render; mod rules; mod tables; -use build_tables::build_tables; pub use build_tables::ParseTableBuilderError; -use grammars::{InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar}; +use introspect_grammar::{introspect_grammar, GrammarIntrospection}; pub use node_types::{SuperTypeCycleError, VariableInfoError}; +#[cfg(feature = "load")] +pub use node_types::{FieldInfoJSON, NodeInfoJSON, NodeTypeJSON}; use parse_grammar::parse_grammar; pub use parse_grammar::ParseGrammarError; -use prepare_grammar::prepare_grammar; pub use prepare_grammar::PrepareGrammarError; use render::render_c_code; pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN}; @@ -50,22 +49,6 @@ static JSON_COMMENT_REGEX: LazyLock = LazyLock::new(|| { .unwrap() }); -struct JSONOutput { - #[cfg(feature = "load")] - node_types_json: String, - syntax_grammar: SyntaxGrammar, - lexical_grammar: LexicalGrammar, - inlines: InlinedProductionMap, - simple_aliases: BTreeMap, - variable_info: Vec, -} - -struct GeneratedParser { - c_code: String, - #[cfg(feature = "load")] - node_types_json: String, -} - // NOTE: This constant must be kept in sync with the definition of // `TREE_SITTER_LANGUAGE_VERSION` in `lib/include/tree_sitter/api.h`. const LANGUAGE_VERSION: usize = 15; @@ -277,9 +260,34 @@ where // If our job is only to generate `grammar.json` and not `parser.c`, stop here. 
let input_grammar = parse_grammar(&grammar_json)?; + let GrammarIntrospection { + syntax_grammar, + lexical_grammar, + simple_aliases, + variable_info, + supertype_symbol_map, + tables, + symbol_ids, + alias_ids, + unique_aliases, + } = introspect_grammar(&input_grammar, report_symbol_name, optimizations)?; + + #[cfg(feature = "load")] + let node_types_json = node_types::generate_node_types_json( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &unique_aliases, + &variable_info, + &symbol_ids, + )?; + + write_file( + &src_path.join("node-types.json"), + &serde_json::to_string_pretty(&node_types_json).unwrap(), + )?; + if !generate_parser { - let node_types_json = generate_node_types_from_grammar(&input_grammar)?.node_types_json; - write_file(&src_path.join("node-types.json"), node_types_json)?; return Ok(()); } @@ -300,19 +308,21 @@ where } // Generate the parser and related files. - let GeneratedParser { - c_code, - node_types_json, - } = generate_parser_for_grammar_with_opts( - &input_grammar, + let c_code = render_c_code( + &input_grammar.name, + tables, + syntax_grammar, + lexical_grammar, + simple_aliases, + symbol_ids, + alias_ids, + unique_aliases, abi_version, semantic_version.map(|v| (v.major as u8, v.minor as u8, v.patch as u8)), - report_symbol_name, - optimizations, - )?; + supertype_symbol_map, + ); write_file(&src_path.join("parser.c"), c_code)?; - write_file(&src_path.join("node-types.json"), node_types_json)?; fs::create_dir_all(&header_path) .map_err(|e| GenerateError::IO(IoError::new(&e, Some(header_path.as_path()))))?; write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?; @@ -328,82 +338,33 @@ pub fn generate_parser_for_grammar( ) -> GenerateResult<(String, String)> { let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); let input_grammar = parse_grammar(&grammar_json)?; - let parser = generate_parser_for_grammar_with_opts( - &input_grammar, - LANGUAGE_VERSION, - semantic_version, - None, - OptLevel::empty(), - )?; 
- Ok((input_grammar.name, parser.c_code)) -} - -fn generate_node_types_from_grammar(input_grammar: &InputGrammar) -> GenerateResult { - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(input_grammar)?; - let variable_info = - node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?; - - #[cfg(feature = "load")] - let node_types_json = node_types::generate_node_types_json( - &syntax_grammar, - &lexical_grammar, - &simple_aliases, - &variable_info, - )?; - Ok(JSONOutput { - #[cfg(feature = "load")] - node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(), + let GrammarIntrospection { syntax_grammar, lexical_grammar, - inlines, simple_aliases, - variable_info, - }) -} + variable_info: _, + supertype_symbol_map, + tables, + symbol_ids, + alias_ids, + unique_aliases, + } = introspect_grammar(&input_grammar, None, OptLevel::empty())?; -fn generate_parser_for_grammar_with_opts( - input_grammar: &InputGrammar, - abi_version: usize, - semantic_version: Option<(u8, u8, u8)>, - report_symbol_name: Option<&str>, - optimizations: OptLevel, -) -> GenerateResult { - let JSONOutput { - syntax_grammar, - lexical_grammar, - inlines, - simple_aliases, - variable_info, - #[cfg(feature = "load")] - node_types_json, - } = generate_node_types_from_grammar(input_grammar)?; - let supertype_symbol_map = - node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info); - let tables = build_tables( - &syntax_grammar, - &lexical_grammar, - &simple_aliases, - &variable_info, - &inlines, - report_symbol_name, - optimizations, - )?; let c_code = render_c_code( &input_grammar.name, tables, syntax_grammar, lexical_grammar, simple_aliases, - abi_version, + symbol_ids, + alias_ids, + unique_aliases, + LANGUAGE_VERSION, semantic_version, supertype_symbol_map, ); - Ok(GeneratedParser { - c_code, - #[cfg(feature = "load")] - node_types_json, - }) + + Ok((input_grammar.name, c_code)) } /// This will read 
the `tree-sitter.json` config file and attempt to extract the version. diff --git a/crates/generate/src/introspect_grammar.rs b/crates/generate/src/introspect_grammar.rs new file mode 100644 index 00000000..a0c597c8 --- /dev/null +++ b/crates/generate/src/introspect_grammar.rs @@ -0,0 +1,375 @@ +use std::{ + collections::{BTreeMap, HashMap, HashSet}, + fmt::Write, +}; + +use crate::{ + build_tables::{build_tables, Tables}, + grammars::{InputGrammar, LexicalGrammar, SyntaxGrammar, VariableType}, + node_types::{self, ChildType, VariableInfo}, + prepare_grammar::prepare_grammar, + rules::{Alias, AliasMap, Symbol, SymbolType}, + tables::ParseTable, + GenerateError, OptLevel, +}; + +pub struct GrammarIntrospection { + pub syntax_grammar: SyntaxGrammar, + pub lexical_grammar: LexicalGrammar, + pub simple_aliases: BTreeMap, + pub variable_info: Vec, + pub supertype_symbol_map: BTreeMap>, + pub tables: Tables, + pub symbol_ids: HashMap, + pub alias_ids: HashMap, + pub unique_aliases: Vec<(Alias, u16)>, +} + +pub fn introspect_grammar( + input_grammar: &InputGrammar, + report_symbol_name: Option<&str>, + optimizations: OptLevel, +) -> Result { + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(input_grammar)?; + let variable_info = + node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?; + + let supertype_symbol_map = + node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info); + let tables = build_tables( + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + &variable_info, + &inlines, + report_symbol_name, + optimizations, + )?; + + // Generate symbol IDs (both string and numeric) before rendering C code + let (symbol_ids, alias_ids, unique_aliases) = generate_symbol_ids( + &tables.parse_table, + &syntax_grammar, + &lexical_grammar, + &simple_aliases, + ); + + Ok(GrammarIntrospection { + syntax_grammar, + lexical_grammar, + simple_aliases, + variable_info, + 
supertype_symbol_map, + tables, + symbol_ids, + alias_ids, + unique_aliases, + }) +} + +/// Generates symbol IDs and alias IDs for the given parse table and grammars. +/// +/// This function must be called before `render_c_code` to generate the symbol mappings +/// that will be used in the generated C code. +/// +/// # Arguments +/// +/// * `parse_table` - The generated parse table for the language +/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar +/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar +/// * `default_aliases` - A map describing the global rename rules that should apply +/// +/// # Returns +/// +/// A tuple containing: +/// * `symbol_ids` - HashMap mapping each Symbol to (C identifier string, numeric ID) +/// * `alias_ids` - HashMap mapping each Alias to its C identifier string +/// * `unique_aliases` - Sorted vector of unique aliases +pub fn generate_symbol_ids( + parse_table: &ParseTable, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + default_aliases: &AliasMap, +) -> ( + HashMap, + HashMap, + Vec<(Alias, u16)>, +) { + let mut symbol_ids = HashMap::new(); + let mut alias_ids = HashMap::new(); + let mut unique_aliases = Vec::new(); + let mut symbol_identifiers = HashSet::new(); + + // Generate symbol IDs with numeric IDs + // Symbol::end() gets 0, then other symbols get 1, 2, 3... 
+ let mut numeric_id = 0u16; + for &symbol in &parse_table.symbols { + assign_symbol_id( + symbol, + syntax_grammar, + lexical_grammar, + &mut symbol_ids, + &mut symbol_identifiers, + numeric_id, + ); + numeric_id += 1; + } + + symbol_ids.insert( + Symbol::end_of_nonterminal_extra(), + symbol_ids[&Symbol::end()].clone(), + ); + + // Build symbol map to find canonical symbols for aliases + let mut symbol_map = HashMap::new(); + for symbol in &parse_table.symbols { + let mut mapping = symbol; + + if let Some(alias) = default_aliases.get(symbol) { + let kind = alias.kind(); + for other_symbol in &parse_table.symbols { + if let Some(other_alias) = default_aliases.get(other_symbol) { + if other_symbol < mapping && other_alias == alias { + mapping = other_symbol; + } + } else { + let (other_name, other_kind) = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if (other_name, other_kind) == (alias.value.as_str(), kind) { + mapping = other_symbol; + break; + } + } + } + } else if symbol.is_terminal() { + let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + for other_symbol in &parse_table.symbols { + let other_metadata = + metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar); + if other_metadata == metadata { + if let Some(mapped) = symbol_map.get(other_symbol) { + if mapped == symbol { + break; + } + } + mapping = other_symbol; + break; + } + } + } + + symbol_map.insert(*symbol, *mapping); + } + + // Generate alias IDs + for production_info in &parse_table.production_infos { + for alias in &production_info.alias_sequence { + if let Some(alias) = &alias { + // Find symbols that match this alias + let matching_symbols: Vec = parse_table + .symbols + .iter() + .copied() + .filter(|symbol| { + default_aliases.get(symbol).map_or_else( + || { + let (name, kind) = + metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + name == alias.value && kind == alias.kind() + }, + |default_alias| default_alias == 
alias, + ) + }) + .collect(); + + // Some aliases match an existing symbol in the grammar. + let alias_id = if let Some(existing_symbol) = matching_symbols.first() { + symbol_ids[&symbol_map[existing_symbol]].0.clone() + } + // Other aliases don't match any existing symbol, and need their own identifiers. + else { + if let Err(i) = unique_aliases.binary_search(alias) { + unique_aliases.insert(i, alias.clone()); + } + + if alias.is_named { + format!("alias_sym_{}", sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", sanitize_identifier(&alias.value)) + } + }; + + alias_ids.entry(alias.clone()).or_insert(alias_id); + } + } + } + + ( + symbol_ids, + alias_ids, + unique_aliases + .into_iter() + .map(|alias| { + let id = numeric_id; + numeric_id += 1; + (alias, id) + }) + .collect(), + ) +} + +/// Helper function to sanitize identifiers for C code generation. +pub fn sanitize_identifier(name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if c.is_ascii_alphanumeric() || c == '_' { + result.push(c); + } else { + 'special_chars: { + let replacement = match c { + ' ' if name.len() == 1 => "SPACE", + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' 
=> "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + '\0' => "NULL", + '\u{0001}' => "SOH", + '\u{0002}' => "STX", + '\u{0003}' => "ETX", + '\u{0004}' => "EOT", + '\u{0005}' => "ENQ", + '\u{0006}' => "ACK", + '\u{0007}' => "BEL", + '\u{0008}' => "BS", + '\u{000b}' => "VTAB", + '\u{000c}' => "FF", + '\u{000e}' => "SO", + '\u{000f}' => "SI", + '\u{0010}' => "DLE", + '\u{0011}' => "DC1", + '\u{0012}' => "DC2", + '\u{0013}' => "DC3", + '\u{0014}' => "DC4", + '\u{0015}' => "NAK", + '\u{0016}' => "SYN", + '\u{0017}' => "ETB", + '\u{0018}' => "CAN", + '\u{0019}' => "EM", + '\u{001a}' => "SUB", + '\u{001b}' => "ESC", + '\u{001c}' => "FS", + '\u{001d}' => "GS", + '\u{001e}' => "RS", + '\u{001f}' => "US", + '\u{007F}' => "DEL", + '\u{FEFF}' => "BOM", + '\u{0080}'..='\u{FFFF}' => { + write!(result, "u{:04x}", c as u32).unwrap(); + break 'special_chars; + } + '\u{10000}'..='\u{10FFFF}' => { + write!(result, "U{:08x}", c as u32).unwrap(); + break 'special_chars; + } + '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), + ' ' => break 'special_chars, + }; + if !result.is_empty() && !result.ends_with('_') { + result.push('_'); + } + result += replacement; + } + } + } + result +} + +/// Helper function to get metadata for a symbol. +pub fn metadata_for_symbol<'a>( + symbol: Symbol, + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, +) -> (&'a str, VariableType) { + match symbol.kind { + SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), + SymbolType::NonTerminal => { + let variable = &syntax_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::Terminal => { + let variable = &lexical_grammar.variables[symbol.index]; + (&variable.name as &str, variable.kind) + } + SymbolType::External => { + let token = &syntax_grammar.external_tokens[symbol.index]; + (&token.name as &str, token.kind) + } + } +} + +/// Helper function to assign a symbol ID. 
+fn assign_symbol_id( + symbol: Symbol, + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + symbol_ids: &mut HashMap, + used_identifiers: &mut HashSet, + numeric_id: u16, +) { + let mut id; + if symbol == Symbol::end() { + id = "ts_builtin_sym_end".to_string(); + } else { + let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar); + id = match kind { + VariableType::Auxiliary => format!("aux_sym_{}", sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", sanitize_identifier(name)), + VariableType::Hidden | VariableType::Named => { + format!("sym_{}", sanitize_identifier(name)) + } + }; + + let mut suffix_number = 1; + let mut suffix = String::new(); + while used_identifiers.contains(&id) { + id.drain(id.len() - suffix.len()..); + suffix_number += 1; + suffix = suffix_number.to_string(); + id += &suffix; + } + } + + used_identifiers.insert(id.clone()); + symbol_ids.insert(symbol, (id, numeric_id)); +} diff --git a/crates/generate/src/node_types.rs b/crates/generate/src/node_types.rs index 2dde0c49..af369ac3 100644 --- a/crates/generate/src/node_types.rs +++ b/crates/generate/src/node_types.rs @@ -1,10 +1,11 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use super::{ grammars::{LexicalGrammar, SyntaxGrammar, VariableType}, + introspect_grammar::metadata_for_symbol, rules::{Alias, AliasMap, Symbol, SymbolType}, }; @@ -28,38 +29,40 @@ pub struct VariableInfo { pub has_multi_step_production: bool, } -#[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)] +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Default, PartialOrd, Ord)] #[cfg(feature = "load")] pub struct NodeInfoJSON { #[serde(rename = "type")] - kind: String, - named: bool, - #[serde(skip_serializing_if = "std::ops::Not::not")] - root: bool, - #[serde(skip_serializing_if = "std::ops::Not::not")] - extra: bool, + pub kind: 
String, + pub named: bool, + #[serde(skip_serializing_if = "std::ops::Not::not", default)] + pub root: bool, + #[serde(skip_serializing_if = "std::ops::Not::not", default)] + pub extra: bool, #[serde(skip_serializing_if = "Option::is_none")] - fields: Option>, + pub fields: Option>, #[serde(skip_serializing_if = "Option::is_none")] - children: Option, + pub children: Option, #[serde(skip_serializing_if = "Option::is_none")] - subtypes: Option>, + pub subtypes: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub symbol_ids: Option>, } -#[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] #[cfg(feature = "load")] pub struct NodeTypeJSON { #[serde(rename = "type")] - kind: String, - named: bool, + pub kind: String, + pub named: bool, } -#[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] #[cfg(feature = "load")] pub struct FieldInfoJSON { - multiple: bool, - required: bool, - types: Vec, + pub multiple: bool, + pub required: bool, + pub types: Vec, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -471,10 +474,47 @@ pub fn generate_node_types_json( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, default_aliases: &AliasMap, + unique_aliases: &Vec<(Alias, u16)>, variable_info: &[VariableInfo], + symbol_ids: &HashMap, ) -> SuperTypeCycleResult> { let mut node_types_json = BTreeMap::new(); + // Build a map from (kind, is_named) to all symbol IDs that map to that kind + let mut kind_to_symbol_ids: HashMap<(String, bool), Vec> = HashMap::new(); + for (symbol, (_name, numeric_id)) in symbol_ids { + // Get the actual kind name for this symbol (considering aliases) + let (kind, is_named) = if let Some(alias) = default_aliases.get(symbol) { + (alias.value.clone(), alias.is_named) + } else { + match symbol.kind { + // TODO: check if 
`SymbolType::EndOfNonTerminalExtra` is correct + SymbolType::End => continue, + _ => { + let (name, kind) = + metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar); + (name.to_string(), kind == VariableType::Named) + } + } + }; + kind_to_symbol_ids + .entry((kind, is_named)) + .or_insert_with(Vec::new) + .push(*numeric_id); + } + + for unique_alias in unique_aliases { + kind_to_symbol_ids.insert( + (unique_alias.0.value.clone(), unique_alias.0.is_named), + vec![unique_alias.1], + ); + } + + // Sort the symbol IDs for each kind to ensure consistent ordering + for ids in kind_to_symbol_ids.values_mut() { + ids.sort_unstable(); + } + let child_type_to_node_type = |child_type: &ChildType| match child_type { ChildType::Aliased(alias) => NodeTypeJSON { kind: alias.value.clone(), @@ -571,6 +611,9 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, + symbol_ids: kind_to_symbol_ids + .get(&(variable.name.clone(), true)) + .cloned(), }); let mut subtypes = info .children @@ -615,6 +658,7 @@ pub fn generate_node_types_json( fields: Some(BTreeMap::new()), children: None, subtypes: None, + symbol_ids: kind_to_symbol_ids.get(&(kind.clone(), is_named)).cloned(), } }); @@ -705,15 +749,16 @@ pub fn generate_node_types_json( .iter() .enumerate() .flat_map(|(i, variable)| { + let symbol = Symbol::terminal(i); aliases_by_symbol - .get(&Symbol::terminal(i)) + .get(&symbol) .unwrap_or(&empty) .iter() .map(move |alias| { alias .as_ref() - .map_or((&variable.name, variable.kind), |alias| { - (&alias.value, alias.kind()) + .map_or((&variable.name, variable.kind, symbol), |alias| { + (&alias.value, alias.kind(), symbol) }) }) }); @@ -723,18 +768,21 @@ pub fn generate_node_types_json( .iter() .enumerate() .flat_map(|(i, token)| { + let symbol = Symbol::external(i); aliases_by_symbol - .get(&Symbol::external(i)) + .get(&symbol) .unwrap_or(&empty) .iter() .map(move |alias| { - alias.as_ref().map_or((&token.name, token.kind), |alias| { - (&alias.value, 
alias.kind()) - }) + alias + .as_ref() + .map_or((&token.name, token.kind, symbol), |alias| { + (&alias.value, alias.kind(), symbol) + }) }) }); - for (name, kind) in regular_tokens.chain(external_tokens) { + for (name, kind, _symbol) in regular_tokens.chain(external_tokens) { match kind { VariableType::Named => { let node_type_json = @@ -748,6 +796,7 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, + symbol_ids: kind_to_symbol_ids.get(&(name.clone(), true)).cloned(), }); if let Some(children) = &mut node_type_json.children { children.required = false; @@ -766,6 +815,7 @@ pub fn generate_node_types_json( fields: None, children: None, subtypes: None, + symbol_ids: kind_to_symbol_ids.get(&(name.clone(), false)).cloned(), }), _ => {} } @@ -845,8 +895,9 @@ mod tests { grammars::{ InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable, }, - prepare_grammar::prepare_grammar, + introspect_grammar, rules::Rule, + GrammarIntrospection, OptLevel, }; #[test] @@ -916,7 +967,8 @@ mod tests { ] .into_iter() .collect() - ) + ), + symbol_ids: Some(vec![4]), } ); assert_eq!( @@ -928,7 +980,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![1]), } ); assert_eq!( @@ -940,7 +993,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![2]), } ); } @@ -1016,7 +1070,8 @@ mod tests { ] .into_iter() .collect() - ) + ), + symbol_ids: Some(vec![4]), } ); assert_eq!( @@ -1028,7 +1083,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![1]), } ); assert_eq!( @@ -1040,7 +1096,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![2]), } ); assert_eq!( @@ -1052,7 +1109,8 @@ mod tests { extra: true, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: 
Some(vec![3]), } ); } @@ -1083,7 +1141,7 @@ mod tests { Variable { name: "v3".to_string(), kind: VariableType::Named, - rule: Rule::seq(vec![Rule::string("y"), Rule::repeat(Rule::string("z"))]), + rule: Rule::seq(vec![Rule::string("y"), Rule::string("z")]), }, ], ..Default::default() @@ -1128,7 +1186,8 @@ mod tests { ] .into_iter() .collect() - ) + ), + symbol_ids: Some(vec![5]), } ); assert_eq!( @@ -1140,7 +1199,8 @@ mod tests { extra: true, subtypes: None, children: None, - fields: Some(BTreeMap::default()) + fields: Some(BTreeMap::default()), + symbol_ids: Some(vec![6]), } ); assert_eq!( @@ -1152,7 +1212,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![1]), } ); assert_eq!( @@ -1164,7 +1225,8 @@ mod tests { extra: false, subtypes: None, children: None, - fields: None + fields: None, + symbol_ids: Some(vec![2]), } ); } @@ -1226,6 +1288,7 @@ mod tests { named: true, }, ]), + symbol_ids: None, } ); assert_eq!( @@ -1251,7 +1314,8 @@ mod tests { ),] .into_iter() .collect() - ) + ), + symbol_ids: Some(vec![4]), } ); } @@ -1329,7 +1393,8 @@ mod tests { ),] .into_iter() .collect() - ) + ), + symbol_ids: Some(vec![5]), } ); assert_eq!( @@ -1349,6 +1414,7 @@ mod tests { },] }), fields: Some(BTreeMap::new()), + symbol_ids: Some(vec![6]), } ); } @@ -1402,6 +1468,7 @@ mod tests { ] }), fields: Some(BTreeMap::new()), + symbol_ids: Some(vec![3]), } ); } @@ -1450,6 +1517,7 @@ mod tests { rule: Rule::pattern("[\\w-]+", ""), }, ], + expected_conflicts: vec![vec!["type".to_string(), "expression".to_string()]], ..Default::default() }) .unwrap(); @@ -1465,6 +1533,7 @@ mod tests { subtypes: None, children: None, fields: None, + symbol_ids: Some(vec![2, 3]), }) ); assert_eq!( @@ -1477,6 +1546,7 @@ mod tests { subtypes: None, children: None, fields: None, + symbol_ids: Some(vec![7]), }) ); } @@ -1542,6 +1612,7 @@ mod tests { .into_iter() .collect() ), + symbol_ids: Some(vec![3]), } ); } @@ -1570,7 +1641,8 @@ mod tests 
{ extra: false, fields: Some(BTreeMap::new()), children: None, - subtypes: None + subtypes: None, + symbol_ids: Some(vec![3]), }] ); } @@ -1678,6 +1750,7 @@ mod tests { .into_iter() .collect() ), + symbol_ids: Some(vec![7, 8]), }, NodeInfoJSON { kind: "script".to_string(), @@ -1695,6 +1768,7 @@ mod tests { }] }), fields: Some(BTreeMap::new()), + symbol_ids: Some(vec![6]), } ] ); @@ -1751,6 +1825,7 @@ mod tests { }] }), fields: Some(BTreeMap::new()), + symbol_ids: Some(vec![3, 5]), } ); } @@ -2056,15 +2131,25 @@ mod tests { } fn get_node_types(grammar: &InputGrammar) -> SuperTypeCycleResult> { - let (syntax_grammar, lexical_grammar, _, default_aliases) = - prepare_grammar(grammar).unwrap(); - let variable_info = - get_variable_info(&syntax_grammar, &lexical_grammar, &default_aliases).unwrap(); + let GrammarIntrospection { + syntax_grammar, + lexical_grammar, + simple_aliases, + variable_info, + supertype_symbol_map: _, + tables: _, + symbol_ids, + alias_ids: _, + unique_aliases, + } = introspect_grammar(grammar, None, OptLevel::default()).unwrap(); + generate_node_types_json( &syntax_grammar, &lexical_grammar, - &default_aliases, + &simple_aliases, + &unique_aliases, &variable_info, + &symbol_ids, ) } diff --git a/crates/generate/src/render.rs b/crates/generate/src/render.rs index bcfc832e..d10e544e 100644 --- a/crates/generate/src/render.rs +++ b/crates/generate/src/render.rs @@ -1,11 +1,14 @@ use std::{ cmp, - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap}, fmt::Write, mem::swap, }; -use crate::LANGUAGE_VERSION; +use crate::{ + introspect_grammar::{metadata_for_symbol, sanitize_identifier}, + LANGUAGE_VERSION, +}; use indoc::indoc; use super::{ @@ -78,10 +81,9 @@ struct Generator { syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, - symbol_order: HashMap, - symbol_ids: HashMap, + symbol_ids: HashMap, alias_ids: HashMap, - unique_aliases: Vec, + unique_aliases: Vec<(Alias, 
u16)>, symbol_map: HashMap, reserved_word_sets: Vec, reserved_word_set_ids_by_parse_state: Vec, @@ -175,15 +177,7 @@ impl Generator { } fn init(&mut self) { - let mut symbol_identifiers = HashSet::new(); - for i in 0..self.parse_table.symbols.len() { - self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); - } - self.symbol_ids.insert( - Symbol::end_of_nonterminal_extra(), - self.symbol_ids[&Symbol::end()].clone(), - ); - + // symbol_ids and alias_ids are now passed in from the constructor self.symbol_map = HashMap::new(); for symbol in &self.parse_table.symbols { @@ -237,32 +231,6 @@ impl Generator { self.field_names.insert(i, field_name.clone()); } } - - for alias in &production_info.alias_sequence { - // Generate a mapping from aliases to C identifiers. - if let Some(alias) = &alias { - // Some aliases match an existing symbol in the grammar. - let alias_id = - if let Some(existing_symbol) = self.symbols_for_alias(alias).first() { - self.symbol_ids[&self.symbol_map[existing_symbol]].clone() - } - // Other aliases don't match any existing symbol, and need their own - // identifiers. 
- else { - if let Err(i) = self.unique_aliases.binary_search(alias) { - self.unique_aliases.insert(i, alias.clone()); - } - - if alias.is_named { - format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) - } else { - format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) - } - }; - - self.alias_ids.entry(alias.clone()).or_insert(alias_id); - } - } } for (ix, (symbol, _)) in self.large_character_sets.iter().enumerate() { @@ -272,7 +240,7 @@ impl Generator { .count() + 1; let constant_name = if let Some(symbol) = symbol { - format!("{}_character_set_{}", self.symbol_ids[symbol], count) + format!("{}_character_set_{}", self.symbol_ids[symbol].0, count) } else { format!("extras_character_set_{count}") }; @@ -302,7 +270,7 @@ impl Generator { for (supertype, subtypes) in &self.supertype_symbol_map { if let Some(supertype) = self.symbol_ids.get(supertype) { self.supertype_map - .entry(supertype.clone()) + .entry(supertype.0.clone()) .or_insert_with(|| subtypes.clone()); } } @@ -424,18 +392,17 @@ impl Generator { fn add_symbol_enum(&mut self) { add_line!(self, "enum ts_symbol_identifiers {{"); indent!(self); - self.symbol_order.insert(Symbol::end(), 0); - let mut i = 1; + // symbol_ids already contains both string ID and numeric ID for symbol in &self.parse_table.symbols { - if *symbol != Symbol::end() { - self.symbol_order.insert(*symbol, i); - add_line!(self, "{} = {i},", self.symbol_ids[symbol]); - i += 1; + if *symbol == Symbol::end() { + continue; } + let (string_id, numeric_id) = &self.symbol_ids[symbol]; + add_line!(self, "{} = {numeric_id},", string_id); } - for alias in &self.unique_aliases { - add_line!(self, "{} = {i},", self.alias_ids[alias]); - i += 1; + // Add aliases after all symbols + for (alias, numeric_id) in self.unique_aliases.iter() { + add_line!(self, "{} = {},", self.alias_ids[alias], numeric_id); } dedent!(self); add_line!(self, "}};"); @@ -453,14 +420,14 @@ impl Generator { alias.value.as_str() }), ); - add_line!(self, "[{}] 
= \"{name}\",", self.symbol_ids[symbol]); + add_line!(self, "[{}] = \"{name}\",", self.symbol_ids[symbol].0); } for alias in &self.unique_aliases { add_line!( self, "[{}] = \"{}\",", - self.alias_ids[alias], - self.sanitize_string(&alias.value) + self.alias_ids[&alias.0], + self.sanitize_string(&alias.0.value) ); } dedent!(self); @@ -475,8 +442,8 @@ impl Generator { add_line!( self, "[{}] = {},", - self.symbol_ids[symbol], - self.symbol_ids[&self.symbol_map[symbol]], + self.symbol_ids[symbol].0, + self.symbol_ids[&self.symbol_map[symbol]].0, ); } @@ -484,8 +451,8 @@ impl Generator { add_line!( self, "[{}] = {},", - self.alias_ids[alias], - self.alias_ids[alias], + self.alias_ids[&alias.0], + self.alias_ids[&alias.0], ); } @@ -524,7 +491,7 @@ impl Generator { ); indent!(self); for symbol in &self.parse_table.symbols { - add_line!(self, "[{}] = {{", self.symbol_ids[symbol]); + add_line!(self, "[{}] = {{", self.symbol_ids[symbol].0); indent!(self); if let Some(Alias { is_named, .. }) = self.default_aliases.get(symbol) { add_line!(self, ".visible = true,"); @@ -556,10 +523,10 @@ impl Generator { add_line!(self, "}},"); } for alias in &self.unique_aliases { - add_line!(self, "[{}] = {{", self.alias_ids[alias]); + add_line!(self, "[{}] = {{", self.alias_ids[&alias.0]); indent!(self); add_line!(self, ".visible = true,"); - add_line!(self, ".named = {},", alias.is_named); + add_line!(self, ".named = {},", &alias.0.is_named); dedent!(self); add_line!(self, "}},"); } @@ -631,8 +598,8 @@ impl Generator { ); indent!(self); for (symbol, alias_ids) in alias_ids_by_symbol { - let symbol_id = &self.symbol_ids[symbol]; - let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]]; + let symbol_id = &self.symbol_ids[symbol].0; + let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]].0; add_line!(self, "{symbol_id}, {},", 1 + alias_ids.len()); indent!(self); add_line!(self, "{public_symbol_id},"); @@ -769,13 +736,15 @@ impl Generator { subtypes .iter() .flat_map(|s| 
match s { - ChildType::Normal(symbol) => vec![self.symbol_ids.get(symbol).cloned()], + ChildType::Normal(symbol) => { + vec![self.symbol_ids.get(symbol).map(|t| t.0.clone())] + } ChildType::Aliased(alias) => { self.alias_ids.get(alias).cloned().map_or_else( || { self.symbols_for_alias(alias) .into_iter() - .map(|s| self.symbol_ids.get(&s).cloned()) + .map(|s| self.symbol_ids.get(&s).map(|t| t.0.clone())) .collect() }, |a| vec![Some(a)], @@ -854,7 +823,7 @@ impl Generator { fn add_lex_state(&mut self, _state_ix: usize, state: LexState) { if let Some(accept_action) = state.accept_action { - add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); + add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action].0); } if let Some(eof_action) = state.eof_action { @@ -1198,7 +1167,7 @@ impl Generator { add_line!(self, "[{id}] = {{"); indent!(self); for token in set.iter() { - add_line!(self, "{},", self.symbol_ids[&token]); + add_line!(self, "{},", self.symbol_ids[&token].0); } dedent!(self); add_line!(self, "}},"); @@ -1238,7 +1207,7 @@ impl Generator { self, "[{}] = {},", self.external_token_id(token), - self.symbol_ids[&id_token], + self.symbol_ids[&id_token].0, ); } dedent!(self); @@ -1312,14 +1281,14 @@ impl Generator { nonterminal_entries.clear(); terminal_entries.extend(state.terminal_entries.iter()); nonterminal_entries.extend(state.nonterminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1)); nonterminal_entries.sort_unstable_by_key(|k| k.0); for (symbol, action) in &nonterminal_entries { add_line!( self, "[{}] = STATE({}),", - self.symbol_ids[symbol], + self.symbol_ids[symbol].0, match action { GotoAction::Goto(state) => *state, GotoAction::ShiftExtra => i, @@ -1333,7 +1302,11 @@ impl Generator { &mut parse_table_entries, &mut next_parse_action_list_index, ); - add_line!(self, "[{}] = ACTIONS({entry_id}),", 
self.symbol_ids[symbol]); + add_line!( + self, + "[{}] = ACTIONS({entry_id}),", + self.symbol_ids[symbol].0 + ); } dedent!(self); @@ -1362,7 +1335,7 @@ impl Generator { terminal_entries.clear(); terminal_entries.extend(state.terminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1)); // In a given parse state, many lookahead symbols have the same actions. // So in the "small state" representation, group symbols by their action @@ -1415,7 +1388,7 @@ impl Generator { symbols.sort_unstable(); indent!(self); for symbol in symbols { - add_line!(self, "{},", self.symbol_ids[symbol]); + add_line!(self, "{},", self.symbol_ids[symbol].0); } dedent!(self); } @@ -1491,7 +1464,7 @@ impl Generator { add!( self, "REDUCE({}, {child_count}, {dynamic_precedence}, {production_id})", - self.symbol_ids[&symbol] + self.symbol_ids[&symbol].0 ); } } @@ -1603,7 +1576,7 @@ impl Generator { add_line!( self, ".keyword_capture_token = {},", - self.symbol_ids[&keyword_capture_token] + self.symbol_ids[&keyword_capture_token].0 ); } @@ -1702,38 +1675,7 @@ impl Generator { } fn external_token_id(&self, token: &ExternalToken) -> String { - format!( - "ts_external_token_{}", - self.sanitize_identifier(&token.name) - ) - } - - fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { - let mut id; - if symbol == Symbol::end() { - id = "ts_builtin_sym_end".to_string(); - } else { - let (name, kind) = self.metadata_for_symbol(symbol); - id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), - VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), - VariableType::Hidden | VariableType::Named => { - format!("sym_{}", self.sanitize_identifier(name)) - } - }; - - let mut suffix_number = 1; - let mut suffix = String::new(); - while used_identifiers.contains(&id) { - id.drain(id.len() - 
suffix.len()..); - suffix_number += 1; - suffix = suffix_number.to_string(); - id += &suffix; - } - } - - used_identifiers.insert(id.clone()); - self.symbol_ids.insert(symbol, id); + format!("ts_external_token_{}", sanitize_identifier(&token.name)) } fn field_id(&self, field_name: &str) -> String { @@ -1741,21 +1683,7 @@ impl Generator { } fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) { - match symbol.kind { - SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden), - SymbolType::NonTerminal => { - let variable = &self.syntax_grammar.variables[symbol.index]; - (&variable.name, variable.kind) - } - SymbolType::Terminal => { - let variable = &self.lexical_grammar.variables[symbol.index]; - (&variable.name, variable.kind) - } - SymbolType::External => { - let token = &self.syntax_grammar.external_tokens[symbol.index]; - (&token.name, token.kind) - } - } + metadata_for_symbol(symbol, &self.syntax_grammar, &self.lexical_grammar) } fn symbols_for_alias(&self, alias: &Alias) -> Vec { @@ -1775,101 +1703,6 @@ impl Generator { .collect() } - fn sanitize_identifier(&self, name: &str) -> String { - let mut result = String::with_capacity(name.len()); - for c in name.chars() { - if c.is_ascii_alphanumeric() || c == '_' { - result.push(c); - } else { - 'special_chars: { - let replacement = match c { - ' ' if name.len() == 1 => "SPACE", - '~' => "TILDE", - '`' => "BQUOTE", - '!' => "BANG", - '@' => "AT", - '#' => "POUND", - '$' => "DOLLAR", - '%' => "PERCENT", - '^' => "CARET", - '&' => "AMP", - '*' => "STAR", - '(' => "LPAREN", - ')' => "RPAREN", - '-' => "DASH", - '+' => "PLUS", - '=' => "EQ", - '{' => "LBRACE", - '}' => "RBRACE", - '[' => "LBRACK", - ']' => "RBRACK", - '\\' => "BSLASH", - '|' => "PIPE", - ':' => "COLON", - ';' => "SEMI", - '"' => "DQUOTE", - '\'' => "SQUOTE", - '<' => "LT", - '>' => "GT", - ',' => "COMMA", - '.' => "DOT", - '?' 
=> "QMARK", - '/' => "SLASH", - '\n' => "LF", - '\r' => "CR", - '\t' => "TAB", - '\0' => "NULL", - '\u{0001}' => "SOH", - '\u{0002}' => "STX", - '\u{0003}' => "ETX", - '\u{0004}' => "EOT", - '\u{0005}' => "ENQ", - '\u{0006}' => "ACK", - '\u{0007}' => "BEL", - '\u{0008}' => "BS", - '\u{000b}' => "VTAB", - '\u{000c}' => "FF", - '\u{000e}' => "SO", - '\u{000f}' => "SI", - '\u{0010}' => "DLE", - '\u{0011}' => "DC1", - '\u{0012}' => "DC2", - '\u{0013}' => "DC3", - '\u{0014}' => "DC4", - '\u{0015}' => "NAK", - '\u{0016}' => "SYN", - '\u{0017}' => "ETB", - '\u{0018}' => "CAN", - '\u{0019}' => "EM", - '\u{001a}' => "SUB", - '\u{001b}' => "ESC", - '\u{001c}' => "FS", - '\u{001d}' => "GS", - '\u{001e}' => "RS", - '\u{001f}' => "US", - '\u{007F}' => "DEL", - '\u{FEFF}' => "BOM", - '\u{0080}'..='\u{FFFF}' => { - write!(result, "u{:04x}", c as u32).unwrap(); - break 'special_chars; - } - '\u{10000}'..='\u{10FFFF}' => { - write!(result, "U{:08x}", c as u32).unwrap(); - break 'special_chars; - } - '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(), - ' ' => break 'special_chars, - }; - if !result.is_empty() && !result.ends_with('_') { - result.push('_'); - } - result += replacement; - } - } - } - result - } - fn sanitize_string(&self, name: &str) -> String { let mut result = String::with_capacity(name.len()); for c in name.chars() { @@ -1920,18 +1753,20 @@ impl Generator { /// # Arguments /// /// * `name` - A string slice containing the name of the language -/// * `parse_table` - The generated parse table for the language -/// * `main_lex_table` - The generated lexing table for the language -/// * `keyword_lex_table` - The generated keyword lexing table for the language -/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any. 
+/// * `tables` - The generated tables for the language /// * `syntax_grammar` - The syntax grammar extracted from the language's grammar /// * `lexical_grammar` - The lexical grammar extracted from the language's grammar /// * `default_aliases` - A map describing the global rename rules that should apply. the keys are /// symbols that are *always* aliased in the same way, and the values are the aliases that are /// applied to those symbols. +/// * `symbol_ids` - HashMap mapping each Symbol to its (C identifier string, numeric ID) pair +/// * `alias_ids` - HashMap mapping each Alias to its C identifier string +/// * `unique_aliases` - Sorted vector of unique aliases, each paired with its numeric ID /// * `abi_version` - The language ABI version that should be generated. Usually you want /// Tree-sitter's current version, but right after making an ABI change, it may be useful to /// generate code with the previous ABI. +/// * `semantic_version` - Optional semantic version of the parser +/// * `supertype_symbol_map` - Map of supertype symbols #[allow(clippy::too_many_arguments)] pub fn render_c_code( name: &str, @@ -1939,6 +1774,9 @@ pub fn render_c_code( syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, + symbol_ids: HashMap, + alias_ids: HashMap, + unique_aliases: Vec<(Alias, u16)>, abi_version: usize, semantic_version: Option<(u8, u8, u8)>, supertype_symbol_map: BTreeMap>, @@ -1958,6 +1796,9 @@ pub fn render_c_code( syntax_grammar, lexical_grammar, default_aliases, + symbol_ids, + alias_ids, + unique_aliases, abi_version, metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata { major_version, diff --git a/docs/src/assets/schemas/node-types.schema.json b/docs/src/assets/schemas/node-types.schema.json index 7ea8a5af..a114bc81 100644 --- a/docs/src/assets/schemas/node-types.schema.json +++ b/docs/src/assets/schemas/node-types.schema.json @@ -41,6 +41,12 @@ "items": { "$ref": "#/definitions/NodeType" } + }, + "symbol_ids": { + "type": "array", + 
"items": { + "type": "integer" + } } }, "oneOf": [ @@ -105,4 +111,4 @@ } } } -} +} \ No newline at end of file diff --git a/flake.lock b/flake.lock index 2b1bddae..9050b514 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1756787288, - "narHash": "sha256-rw/PHa1cqiePdBxhF66V7R+WAP8WekQ0mCDG4CFqT8Y=", + "lastModified": 1762977756, + "narHash": "sha256-4PqRErxfe+2toFJFgcRKZ0UI9NSIOJa+7RXVtBhy4KE=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d0fc30899600b9b3466ddb260fd83deb486c32f1", + "rev": "c5ae371f1a6a7fd27823bc500d9390b38c05fa55", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 4cea3f87..2b7bdeed 100644 --- a/flake.nix +++ b/flake.nix @@ -326,7 +326,7 @@ clippy rust-analyzer rustfmt - cargo-llvm-cov + # cargo-llvm-cov cmake gnumake