This commit is contained in:
bglgwyng 2026-01-20 23:16:47 -05:00 committed by GitHub
commit 333d6a9a3c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 757 additions and 370 deletions

View file

@ -1,5 +1,7 @@
use std::{collections::HashMap, fs};
use tree_sitter::{InputEdit, Node, Parser, Point, Tree};
use tree_sitter_generate::load_grammar_file;
use tree_sitter_generate::{load_grammar_file, NodeInfoJSON};
use super::{
get_random_edit,
@ -1243,3 +1245,120 @@ fn parse_json_example() -> Tree {
parser.set_language(&get_language("json")).unwrap();
parser.parse(JSON_EXAMPLE, None).unwrap()
}
/// Verifies that the `symbol_ids` recorded in a fixture grammar's
/// `node-types.json` agree with the numeric node-kind IDs reported by the
/// compiled language.
///
/// For every `(kind, named)` pair, the list of symbol IDs stored in
/// `node-types.json` must equal (element-for-element) the sorted list of IDs
/// that the language assigns to node kinds with that name and named-ness.
///
/// # Panics
///
/// Panics if `node-types.json` cannot be read or parsed, or if any
/// `(kind, named)` entry disagrees between the two sources.
fn test_parser_and_node_types_compatibility(grammar_name: &str) {
    let language = get_language(grammar_name);

    // Load the grammar's generated node-types.json from the fixtures tree.
    let node_types: Vec<NodeInfoJSON> = {
        let node_types_path = fixtures_dir()
            .join("grammars")
            .join(grammar_name)
            .join("src")
            .join("node-types.json");
        let node_types_content = fs::read_to_string(&node_types_path)
            .unwrap_or_else(|_| panic!("Failed to read node-types.json at {:?}", node_types_path));
        serde_json::from_str(&node_types_content)
            .unwrap_or_else(|e| panic!("Failed to parse node-types.json: {}", e))
    };

    // (kind, named) -> symbol IDs as declared in node-types.json.
    // `None` means the node-type entry carries no `symbol_ids` field at all.
    let symbol_ids_by_kind_from_node_types: HashMap<(String, bool), Option<&Vec<u16>>> = node_types
        .iter()
        .map(|node_type| {
            (
                (node_type.kind.clone(), node_type.named),
                node_type.symbol_ids.as_ref(),
            )
        })
        .collect();

    // (kind, named) -> symbol IDs as reported by the language itself.
    // Symbol IDs are u16 by design, so the cast cannot lose real kinds.
    let kind_count = language.node_kind_count();
    let mut symbol_ids_by_kind_from_language: HashMap<(String, bool), Vec<u16>> = HashMap::new();
    for i in 0..kind_count as u16 {
        let kind = language.node_kind_for_id(i).unwrap();
        let named = language.node_kind_is_named(i);
        // TODO: find a better way
        // In Go grammar, there is a node kind with an empty string.
        // In node-types.json, it is "\u0000" but in parser.c, it is "\0" and not distinguishable from "".
        let kind = if kind.is_empty() {
            "\0".to_string()
        } else {
            kind.to_string()
        };
        symbol_ids_by_kind_from_language
            .entry((kind, named))
            .or_default()
            .push(i);
    }

    // Every entry in node-types.json must match what the language reports.
    for (key, symbol_ids) in symbol_ids_by_kind_from_node_types {
        assert_eq!(
            symbol_ids,
            symbol_ids_by_kind_from_language.get(&key),
            "{:?}",
            key
        );
    }
}
/// Declares one `#[test]` per fixture grammar, each checking that the
/// grammar's compiled parser agrees with its generated `node-types.json`.
macro_rules! node_types_compatibility_tests {
    ($($test_name:ident => $grammar:literal,)*) => {
        $(
            #[test]
            fn $test_name() {
                test_parser_and_node_types_compatibility($grammar);
            }
        )*
    };
}

node_types_compatibility_tests! {
    test_bash_parser_and_node_types_compatibility => "bash",
    test_c_parser_and_node_types_compatibility => "c",
    test_cpp_parser_and_node_types_compatibility => "cpp",
    test_go_parser_and_node_types_compatibility => "go",
    test_html_parser_and_node_types_compatibility => "html",
    test_java_parser_and_node_types_compatibility => "java",
    test_javascript_parser_and_node_types_compatibility => "javascript",
    test_jsdoc_parser_and_node_types_compatibility => "jsdoc",
    test_json_parser_and_node_types_compatibility => "json",
    test_python_parser_and_node_types_compatibility => "python",
    test_ruby_parser_and_node_types_compatibility => "ruby",
    test_rust_parser_and_node_types_compatibility => "rust",
}

View file

@ -1,4 +1,4 @@
use std::{collections::BTreeMap, sync::LazyLock};
use std::sync::LazyLock;
#[cfg(feature = "load")]
use std::{
env, fs,
@ -9,9 +9,7 @@ use std::{
use bitflags::bitflags;
use log::warn;
use node_types::VariableInfo;
use regex::{Regex, RegexBuilder};
use rules::{Alias, Symbol};
#[cfg(feature = "load")]
use semver::Version;
#[cfg(feature = "load")]
@ -22,6 +20,7 @@ use thiserror::Error;
mod build_tables;
mod dedup;
mod grammars;
mod introspect_grammar;
mod nfa;
mod node_types;
pub mod parse_grammar;
@ -32,13 +31,13 @@ mod render;
mod rules;
mod tables;
use build_tables::build_tables;
pub use build_tables::ParseTableBuilderError;
use grammars::{InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar};
use introspect_grammar::{introspect_grammar, GrammarIntrospection};
pub use node_types::{SuperTypeCycleError, VariableInfoError};
#[cfg(feature = "load")]
pub use node_types::{FieldInfoJSON, NodeInfoJSON, NodeTypeJSON};
use parse_grammar::parse_grammar;
pub use parse_grammar::ParseGrammarError;
use prepare_grammar::prepare_grammar;
pub use prepare_grammar::PrepareGrammarError;
use render::render_c_code;
pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
@ -50,22 +49,6 @@ static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
.unwrap()
});
struct JSONOutput {
#[cfg(feature = "load")]
node_types_json: String,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
inlines: InlinedProductionMap,
simple_aliases: BTreeMap<Symbol, Alias>,
variable_info: Vec<VariableInfo>,
}
struct GeneratedParser {
c_code: String,
#[cfg(feature = "load")]
node_types_json: String,
}
// NOTE: This constant must be kept in sync with the definition of
// `TREE_SITTER_LANGUAGE_VERSION` in `lib/include/tree_sitter/api.h`.
const LANGUAGE_VERSION: usize = 15;
@ -277,9 +260,34 @@ where
// If our job is only to generate `grammar.json` and not `parser.c`, stop here.
let input_grammar = parse_grammar(&grammar_json)?;
let GrammarIntrospection {
syntax_grammar,
lexical_grammar,
simple_aliases,
variable_info,
supertype_symbol_map,
tables,
symbol_ids,
alias_ids,
unique_aliases,
} = introspect_grammar(&input_grammar, report_symbol_name, optimizations)?;
#[cfg(feature = "load")]
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&unique_aliases,
&variable_info,
&symbol_ids,
)?;
write_file(
&src_path.join("node-types.json"),
&serde_json::to_string_pretty(&node_types_json).unwrap(),
)?;
if !generate_parser {
let node_types_json = generate_node_types_from_grammar(&input_grammar)?.node_types_json;
write_file(&src_path.join("node-types.json"), node_types_json)?;
return Ok(());
}
@ -300,19 +308,21 @@ where
}
// Generate the parser and related files.
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&input_grammar,
let c_code = render_c_code(
&input_grammar.name,
tables,
syntax_grammar,
lexical_grammar,
simple_aliases,
symbol_ids,
alias_ids,
unique_aliases,
abi_version,
semantic_version.map(|v| (v.major as u8, v.minor as u8, v.patch as u8)),
report_symbol_name,
optimizations,
)?;
supertype_symbol_map,
);
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
fs::create_dir_all(&header_path)
.map_err(|e| GenerateError::IO(IoError::new(&e, Some(header_path.as_path()))))?;
write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
@ -328,82 +338,33 @@ pub fn generate_parser_for_grammar(
) -> GenerateResult<(String, String)> {
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
let input_grammar = parse_grammar(&grammar_json)?;
let parser = generate_parser_for_grammar_with_opts(
&input_grammar,
LANGUAGE_VERSION,
semantic_version,
None,
OptLevel::empty(),
)?;
Ok((input_grammar.name, parser.c_code))
}
fn generate_node_types_from_grammar(input_grammar: &InputGrammar) -> GenerateResult<JSONOutput> {
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(input_grammar)?;
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
#[cfg(feature = "load")]
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
)?;
Ok(JSONOutput {
#[cfg(feature = "load")]
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
let GrammarIntrospection {
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
variable_info,
})
}
variable_info: _,
supertype_symbol_map,
tables,
symbol_ids,
alias_ids,
unique_aliases,
} = introspect_grammar(&input_grammar, None, OptLevel::empty())?;
fn generate_parser_for_grammar_with_opts(
input_grammar: &InputGrammar,
abi_version: usize,
semantic_version: Option<(u8, u8, u8)>,
report_symbol_name: Option<&str>,
optimizations: OptLevel,
) -> GenerateResult<GeneratedParser> {
let JSONOutput {
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
variable_info,
#[cfg(feature = "load")]
node_types_json,
} = generate_node_types_from_grammar(input_grammar)?;
let supertype_symbol_map =
node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info);
let tables = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
&inlines,
report_symbol_name,
optimizations,
)?;
let c_code = render_c_code(
&input_grammar.name,
tables,
syntax_grammar,
lexical_grammar,
simple_aliases,
abi_version,
symbol_ids,
alias_ids,
unique_aliases,
LANGUAGE_VERSION,
semantic_version,
supertype_symbol_map,
);
Ok(GeneratedParser {
c_code,
#[cfg(feature = "load")]
node_types_json,
})
Ok((input_grammar.name, c_code))
}
/// This will read the `tree-sitter.json` config file and attempt to extract the version.

View file

@ -0,0 +1,375 @@
use std::{
collections::{BTreeMap, HashMap, HashSet},
fmt::Write,
};
use crate::{
build_tables::{build_tables, Tables},
grammars::{InputGrammar, LexicalGrammar, SyntaxGrammar, VariableType},
node_types::{self, ChildType, VariableInfo},
prepare_grammar::prepare_grammar,
rules::{Alias, AliasMap, Symbol, SymbolType},
tables::ParseTable,
GenerateError, OptLevel,
};
/// Everything derived from an input grammar that downstream code generation
/// and node-types emission need: the prepared grammars, their metadata, the
/// built tables, and the C identifiers assigned to symbols and aliases.
pub struct GrammarIntrospection {
    /// The prepared syntax (context-free) grammar.
    pub syntax_grammar: SyntaxGrammar,
    /// The prepared lexical (token) grammar.
    pub lexical_grammar: LexicalGrammar,
    /// Grammar-wide default aliases, keyed by the symbol they rename.
    pub simple_aliases: BTreeMap<Symbol, Alias>,
    /// Per-variable child/field information computed by `node_types`.
    pub variable_info: Vec<VariableInfo>,
    /// Maps each supertype symbol to the child types it can expand to.
    pub supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
    /// The generated parse and lex tables.
    pub tables: Tables,
    /// Maps each symbol to its (C identifier string, numeric ID) pair.
    pub symbol_ids: HashMap<Symbol, (String, u16)>,
    /// Maps each alias to its C identifier string.
    pub alias_ids: HashMap<Alias, String>,
    /// Aliases that match no grammar symbol, paired with their own numeric IDs
    /// (assigned after all symbol IDs).
    pub unique_aliases: Vec<(Alias, u16)>,
}
/// Runs the full grammar pipeline — preparation, variable/supertype analysis,
/// table construction, and symbol/alias ID assignment — and bundles every
/// result into a single [`GrammarIntrospection`].
///
/// # Errors
///
/// Propagates any error from grammar preparation, variable-info extraction,
/// or table building.
pub fn introspect_grammar(
    input_grammar: &InputGrammar,
    report_symbol_name: Option<&str>,
    optimizations: OptLevel,
) -> Result<GrammarIntrospection, GenerateError> {
    // Split the input grammar into syntax/lexical halves, along with its
    // inlined productions and grammar-wide default aliases.
    let (syntax, lexical, inlined, aliases) = prepare_grammar(input_grammar)?;

    let variable_info = node_types::get_variable_info(&syntax, &lexical, &aliases)?;
    let supertype_symbol_map =
        node_types::get_supertype_symbol_map(&syntax, &aliases, &variable_info);

    let tables = build_tables(
        &syntax,
        &lexical,
        &aliases,
        &variable_info,
        &inlined,
        report_symbol_name,
        optimizations,
    )?;

    // Assign the string and numeric identifiers used by the generated C code.
    // This must happen after table construction, since IDs follow the parse
    // table's symbol order.
    let (symbol_ids, alias_ids, unique_aliases) =
        generate_symbol_ids(&tables.parse_table, &syntax, &lexical, &aliases);

    Ok(GrammarIntrospection {
        syntax_grammar: syntax,
        lexical_grammar: lexical,
        simple_aliases: aliases,
        variable_info,
        supertype_symbol_map,
        tables,
        symbol_ids,
        alias_ids,
        unique_aliases,
    })
}
/// Generates symbol IDs and alias IDs for the given parse table and grammars.
///
/// This function must be called before `render_c_code` to generate the symbol mappings
/// that will be used in the generated C code.
///
/// # Arguments
///
/// * `parse_table` - The generated parse table for the language
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
/// * `default_aliases` - A map describing the global rename rules that should apply
///
/// # Returns
///
/// A tuple containing:
/// * `symbol_ids` - HashMap mapping each Symbol to (C identifier string, numeric ID)
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
/// * `unique_aliases` - Sorted vector of unique aliases, each paired with a numeric ID
///   that continues after the last symbol's numeric ID
pub fn generate_symbol_ids(
    parse_table: &ParseTable,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    default_aliases: &AliasMap,
) -> (
    HashMap<Symbol, (String, u16)>,
    HashMap<Alias, String>,
    Vec<(Alias, u16)>,
) {
    let mut symbol_ids = HashMap::new();
    let mut alias_ids = HashMap::new();
    let mut unique_aliases = Vec::new();
    // Tracks identifier strings already handed out, so collisions can be
    // disambiguated inside `assign_symbol_id`.
    let mut symbol_identifiers = HashSet::new();
    // Generate symbol IDs with numeric IDs
    // Symbol::end() gets 0, then other symbols get 1, 2, 3...
    // (assumes `parse_table.symbols` lists end() first — TODO confirm)
    let mut numeric_id = 0u16;
    for &symbol in &parse_table.symbols {
        assign_symbol_id(
            symbol,
            syntax_grammar,
            lexical_grammar,
            &mut symbol_ids,
            &mut symbol_identifiers,
            numeric_id,
        );
        numeric_id += 1;
    }
    // The end-of-nonterminal-extra marker reuses end()'s string and numeric ID.
    symbol_ids.insert(
        Symbol::end_of_nonterminal_extra(),
        symbol_ids[&Symbol::end()].clone(),
    );
    // Build symbol map to find canonical symbols for aliases.
    // Each symbol maps to the lowest-numbered symbol that presents the same
    // public (name, kind) — either via a default alias or via its own metadata.
    let mut symbol_map = HashMap::new();
    for symbol in &parse_table.symbols {
        let mut mapping = symbol;
        if let Some(alias) = default_aliases.get(symbol) {
            let kind = alias.kind();
            for other_symbol in &parse_table.symbols {
                if let Some(other_alias) = default_aliases.get(other_symbol) {
                    // Another aliased symbol with the same alias and a smaller
                    // index becomes the canonical representative.
                    if other_symbol < mapping && other_alias == alias {
                        mapping = other_symbol;
                    }
                } else {
                    // An unaliased symbol whose real (name, kind) equals the
                    // alias wins outright.
                    let (other_name, other_kind) =
                        metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
                    if (other_name, other_kind) == (alias.value.as_str(), kind) {
                        mapping = other_symbol;
                        break;
                    }
                }
            }
        } else if symbol.is_terminal() {
            // Unaliased terminals: collapse onto the first symbol with
            // identical metadata, unless that symbol already maps back here.
            let metadata = metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
            for other_symbol in &parse_table.symbols {
                let other_metadata =
                    metadata_for_symbol(*other_symbol, syntax_grammar, lexical_grammar);
                if other_metadata == metadata {
                    if let Some(mapped) = symbol_map.get(other_symbol) {
                        if mapped == symbol {
                            break;
                        }
                    }
                    mapping = other_symbol;
                    break;
                }
            }
        }
        symbol_map.insert(*symbol, *mapping);
    }
    // Generate alias IDs
    for production_info in &parse_table.production_infos {
        for alias in &production_info.alias_sequence {
            if let Some(alias) = &alias {
                // Find symbols that match this alias
                let matching_symbols: Vec<Symbol> = parse_table
                    .symbols
                    .iter()
                    .copied()
                    .filter(|symbol| {
                        default_aliases.get(symbol).map_or_else(
                            || {
                                let (name, kind) =
                                    metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
                                name == alias.value && kind == alias.kind()
                            },
                            |default_alias| default_alias == alias,
                        )
                    })
                    .collect();
                // Some aliases match an existing symbol in the grammar.
                let alias_id = if let Some(existing_symbol) = matching_symbols.first() {
                    symbol_ids[&symbol_map[existing_symbol]].0.clone()
                }
                // Other aliases don't match any existing symbol, and need their own identifiers.
                else {
                    // Keep `unique_aliases` sorted and duplicate-free via
                    // binary search; Err(i) is the insertion point.
                    if let Err(i) = unique_aliases.binary_search(alias) {
                        unique_aliases.insert(i, alias.clone());
                    }
                    if alias.is_named {
                        format!("alias_sym_{}", sanitize_identifier(&alias.value))
                    } else {
                        format!("anon_alias_sym_{}", sanitize_identifier(&alias.value))
                    }
                };
                alias_ids.entry(alias.clone()).or_insert(alias_id);
            }
        }
    }
    (
        symbol_ids,
        alias_ids,
        // Unique aliases get numeric IDs continuing from where the symbols
        // stopped (`numeric_id` still holds symbol_count).
        unique_aliases
            .into_iter()
            .map(|alias| {
                let id = numeric_id;
                numeric_id += 1;
                (alias, id)
            })
            .collect(),
    )
}
/// Converts an arbitrary grammar rule name into a valid C identifier fragment.
///
/// ASCII alphanumerics and underscores pass through unchanged. Every other
/// character is replaced by a mnemonic (e.g. `+` -> `PLUS`), separated from
/// preceding text by a single `_` (never doubled). Characters above ASCII are
/// escaped as `uXXXX` (BMP) or `UXXXXXXXX` (supplementary planes) hex codes
/// with no separator. A space is dropped, except when the entire name is the
/// single character `" "`, which becomes `SPACE`.
pub fn sanitize_identifier(name: &str) -> String {
    let mut out = String::with_capacity(name.len());
    for ch in name.chars() {
        // Identifier-safe characters are copied through verbatim.
        if ch.is_ascii_alphanumeric() || ch == '_' {
            out.push(ch);
            continue;
        }
        // `Some(..)` is a mnemonic that gets a `_` separator; `None` means
        // the character already produced its own output (hex escapes) or
        // produces none at all (interior spaces).
        let mnemonic: Option<&str> = match ch {
            ' ' if name.len() == 1 => Some("SPACE"),
            ' ' => None, // spaces inside longer names are simply dropped
            '~' => Some("TILDE"),
            '`' => Some("BQUOTE"),
            '!' => Some("BANG"),
            '@' => Some("AT"),
            '#' => Some("POUND"),
            '$' => Some("DOLLAR"),
            '%' => Some("PERCENT"),
            '^' => Some("CARET"),
            '&' => Some("AMP"),
            '*' => Some("STAR"),
            '(' => Some("LPAREN"),
            ')' => Some("RPAREN"),
            '-' => Some("DASH"),
            '+' => Some("PLUS"),
            '=' => Some("EQ"),
            '{' => Some("LBRACE"),
            '}' => Some("RBRACE"),
            '[' => Some("LBRACK"),
            ']' => Some("RBRACK"),
            '\\' => Some("BSLASH"),
            '|' => Some("PIPE"),
            ':' => Some("COLON"),
            ';' => Some("SEMI"),
            '"' => Some("DQUOTE"),
            '\'' => Some("SQUOTE"),
            '<' => Some("LT"),
            '>' => Some("GT"),
            ',' => Some("COMMA"),
            '.' => Some("DOT"),
            '?' => Some("QMARK"),
            '/' => Some("SLASH"),
            '\n' => Some("LF"),
            '\r' => Some("CR"),
            '\t' => Some("TAB"),
            '\0' => Some("NULL"),
            '\u{0001}' => Some("SOH"),
            '\u{0002}' => Some("STX"),
            '\u{0003}' => Some("ETX"),
            '\u{0004}' => Some("EOT"),
            '\u{0005}' => Some("ENQ"),
            '\u{0006}' => Some("ACK"),
            '\u{0007}' => Some("BEL"),
            '\u{0008}' => Some("BS"),
            '\u{000b}' => Some("VTAB"),
            '\u{000c}' => Some("FF"),
            '\u{000e}' => Some("SO"),
            '\u{000f}' => Some("SI"),
            '\u{0010}' => Some("DLE"),
            '\u{0011}' => Some("DC1"),
            '\u{0012}' => Some("DC2"),
            '\u{0013}' => Some("DC3"),
            '\u{0014}' => Some("DC4"),
            '\u{0015}' => Some("NAK"),
            '\u{0016}' => Some("SYN"),
            '\u{0017}' => Some("ETB"),
            '\u{0018}' => Some("CAN"),
            '\u{0019}' => Some("EM"),
            '\u{001a}' => Some("SUB"),
            '\u{001b}' => Some("ESC"),
            '\u{001c}' => Some("FS"),
            '\u{001d}' => Some("GS"),
            '\u{001e}' => Some("RS"),
            '\u{001f}' => Some("US"),
            '\u{007F}' => Some("DEL"),
            // BOM must precede the BMP range arm so it keeps its mnemonic.
            '\u{FEFF}' => Some("BOM"),
            // Remaining BMP characters: 4-digit hex escape, no separator.
            '\u{0080}'..='\u{FFFF}' => {
                write!(out, "u{:04x}", ch as u32).unwrap();
                None
            }
            // Supplementary planes: 8-digit hex escape, no separator.
            '\u{10000}'..='\u{10FFFF}' => {
                write!(out, "U{:08x}", ch as u32).unwrap();
                None
            }
            // ASCII alphanumerics and '_' were handled before the match.
            _ => unreachable!("identifier-safe characters handled above"),
        };
        if let Some(replacement) = mnemonic {
            // Insert a single underscore between adjacent mnemonics/text.
            if !out.is_empty() && !out.ends_with('_') {
                out.push('_');
            }
            out.push_str(replacement);
        }
    }
    out
}
/// Returns the user-facing name and variable kind for `symbol`, looked up in
/// whichever grammar owns that symbol's type.
pub fn metadata_for_symbol<'a>(
    symbol: Symbol,
    syntax_grammar: &'a SyntaxGrammar,
    lexical_grammar: &'a LexicalGrammar,
) -> (&'a str, VariableType) {
    match symbol.kind {
        // Both end-of-input markers share the hidden "end" name.
        SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
        SymbolType::NonTerminal => {
            let v = &syntax_grammar.variables[symbol.index];
            (v.name.as_str(), v.kind)
        }
        SymbolType::Terminal => {
            let v = &lexical_grammar.variables[symbol.index];
            (v.name.as_str(), v.kind)
        }
        SymbolType::External => {
            let token = &syntax_grammar.external_tokens[symbol.index];
            (token.name.as_str(), token.kind)
        }
    }
}
/// Assigns the C identifier string and numeric ID for a single symbol,
/// recording both into `symbol_ids` and the string into `used_identifiers`.
///
/// The built-in end symbol always gets the fixed identifier
/// `ts_builtin_sym_end`; every other symbol gets a kind-based prefix plus its
/// sanitized name. When two symbols sanitize to the same identifier, an
/// increasing numeric suffix (`2`, `3`, ...) disambiguates them.
fn assign_symbol_id(
    symbol: Symbol,
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    symbol_ids: &mut HashMap<Symbol, (String, u16)>,
    used_identifiers: &mut HashSet<String>,
    numeric_id: u16,
) {
    let id = if symbol == Symbol::end() {
        "ts_builtin_sym_end".to_string()
    } else {
        let (name, kind) = metadata_for_symbol(symbol, syntax_grammar, lexical_grammar);
        let prefix = match kind {
            VariableType::Auxiliary => "aux_sym_",
            VariableType::Anonymous => "anon_sym_",
            VariableType::Hidden | VariableType::Named => "sym_",
        };
        let base = format!("{prefix}{}", sanitize_identifier(name));
        // On a collision, retry with suffixes: base2, base3, ...
        let mut candidate = base.clone();
        let mut suffix_number = 1;
        while used_identifiers.contains(&candidate) {
            suffix_number += 1;
            candidate = format!("{base}{suffix_number}");
        }
        candidate
    };
    used_identifiers.insert(id.clone());
    symbol_ids.insert(symbol, (id, numeric_id));
}

View file

@ -1,10 +1,11 @@
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use serde::Serialize;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use super::{
grammars::{LexicalGrammar, SyntaxGrammar, VariableType},
introspect_grammar::metadata_for_symbol,
rules::{Alias, AliasMap, Symbol, SymbolType},
};
@ -28,38 +29,40 @@ pub struct VariableInfo {
pub has_multi_step_production: bool,
}
#[derive(Debug, Serialize, PartialEq, Eq, Default, PartialOrd, Ord)]
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Default, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct NodeInfoJSON {
#[serde(rename = "type")]
kind: String,
named: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
root: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
extra: bool,
pub kind: String,
pub named: bool,
#[serde(skip_serializing_if = "std::ops::Not::not", default)]
pub root: bool,
#[serde(skip_serializing_if = "std::ops::Not::not", default)]
pub extra: bool,
#[serde(skip_serializing_if = "Option::is_none")]
fields: Option<BTreeMap<String, FieldInfoJSON>>,
pub fields: Option<BTreeMap<String, FieldInfoJSON>>,
#[serde(skip_serializing_if = "Option::is_none")]
children: Option<FieldInfoJSON>,
pub children: Option<FieldInfoJSON>,
#[serde(skip_serializing_if = "Option::is_none")]
subtypes: Option<Vec<NodeTypeJSON>>,
pub subtypes: Option<Vec<NodeTypeJSON>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub symbol_ids: Option<Vec<u16>>,
}
#[derive(Clone, Debug, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg(feature = "load")]
pub struct NodeTypeJSON {
#[serde(rename = "type")]
kind: String,
named: bool,
pub kind: String,
pub named: bool,
}
#[derive(Debug, Serialize, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[cfg(feature = "load")]
pub struct FieldInfoJSON {
multiple: bool,
required: bool,
types: Vec<NodeTypeJSON>,
pub multiple: bool,
pub required: bool,
pub types: Vec<NodeTypeJSON>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@ -471,10 +474,47 @@ pub fn generate_node_types_json(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
default_aliases: &AliasMap,
unique_aliases: &Vec<(Alias, u16)>,
variable_info: &[VariableInfo],
symbol_ids: &HashMap<Symbol, (String, u16)>,
) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> {
let mut node_types_json = BTreeMap::new();
// Build a map from (kind, is_named) to all symbol IDs that map to that kind
let mut kind_to_symbol_ids: HashMap<(String, bool), Vec<u16>> = HashMap::new();
for (symbol, (_name, numeric_id)) in symbol_ids {
// Get the actual kind name for this symbol (considering aliases)
let (kind, is_named) = if let Some(alias) = default_aliases.get(symbol) {
(alias.value.clone(), alias.is_named)
} else {
match symbol.kind {
// TODO: check if `SymbolType::EndOfNonTerminalExtra` is correct
SymbolType::End => continue,
_ => {
let (name, kind) =
metadata_for_symbol(*symbol, syntax_grammar, lexical_grammar);
(name.to_string(), kind == VariableType::Named)
}
}
};
kind_to_symbol_ids
.entry((kind, is_named))
.or_insert_with(Vec::new)
.push(*numeric_id);
}
for unique_alias in unique_aliases {
kind_to_symbol_ids.insert(
(unique_alias.0.value.clone(), unique_alias.0.is_named),
vec![unique_alias.1],
);
}
// Sort the symbol IDs for each kind to ensure consistent ordering
for ids in kind_to_symbol_ids.values_mut() {
ids.sort_unstable();
}
let child_type_to_node_type = |child_type: &ChildType| match child_type {
ChildType::Aliased(alias) => NodeTypeJSON {
kind: alias.value.clone(),
@ -571,6 +611,9 @@ pub fn generate_node_types_json(
fields: None,
children: None,
subtypes: None,
symbol_ids: kind_to_symbol_ids
.get(&(variable.name.clone(), true))
.cloned(),
});
let mut subtypes = info
.children
@ -615,6 +658,7 @@ pub fn generate_node_types_json(
fields: Some(BTreeMap::new()),
children: None,
subtypes: None,
symbol_ids: kind_to_symbol_ids.get(&(kind.clone(), is_named)).cloned(),
}
});
@ -705,15 +749,16 @@ pub fn generate_node_types_json(
.iter()
.enumerate()
.flat_map(|(i, variable)| {
let symbol = Symbol::terminal(i);
aliases_by_symbol
.get(&Symbol::terminal(i))
.get(&symbol)
.unwrap_or(&empty)
.iter()
.map(move |alias| {
alias
.as_ref()
.map_or((&variable.name, variable.kind), |alias| {
(&alias.value, alias.kind())
.map_or((&variable.name, variable.kind, symbol), |alias| {
(&alias.value, alias.kind(), symbol)
})
})
});
@ -723,18 +768,21 @@ pub fn generate_node_types_json(
.iter()
.enumerate()
.flat_map(|(i, token)| {
let symbol = Symbol::external(i);
aliases_by_symbol
.get(&Symbol::external(i))
.get(&symbol)
.unwrap_or(&empty)
.iter()
.map(move |alias| {
alias.as_ref().map_or((&token.name, token.kind), |alias| {
(&alias.value, alias.kind())
})
alias
.as_ref()
.map_or((&token.name, token.kind, symbol), |alias| {
(&alias.value, alias.kind(), symbol)
})
})
});
for (name, kind) in regular_tokens.chain(external_tokens) {
for (name, kind, _symbol) in regular_tokens.chain(external_tokens) {
match kind {
VariableType::Named => {
let node_type_json =
@ -748,6 +796,7 @@ pub fn generate_node_types_json(
fields: None,
children: None,
subtypes: None,
symbol_ids: kind_to_symbol_ids.get(&(name.clone(), true)).cloned(),
});
if let Some(children) = &mut node_type_json.children {
children.required = false;
@ -766,6 +815,7 @@ pub fn generate_node_types_json(
fields: None,
children: None,
subtypes: None,
symbol_ids: kind_to_symbol_ids.get(&(name.clone(), false)).cloned(),
}),
_ => {}
}
@ -845,8 +895,9 @@ mod tests {
grammars::{
InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable,
},
prepare_grammar::prepare_grammar,
introspect_grammar,
rules::Rule,
GrammarIntrospection, OptLevel,
};
#[test]
@ -916,7 +967,8 @@ mod tests {
]
.into_iter()
.collect()
)
),
symbol_ids: Some(vec![4]),
}
);
assert_eq!(
@ -928,7 +980,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![1]),
}
);
assert_eq!(
@ -940,7 +993,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![2]),
}
);
}
@ -1016,7 +1070,8 @@ mod tests {
]
.into_iter()
.collect()
)
),
symbol_ids: Some(vec![4]),
}
);
assert_eq!(
@ -1028,7 +1083,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![1]),
}
);
assert_eq!(
@ -1040,7 +1096,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![2]),
}
);
assert_eq!(
@ -1052,7 +1109,8 @@ mod tests {
extra: true,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![3]),
}
);
}
@ -1083,7 +1141,7 @@ mod tests {
Variable {
name: "v3".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![Rule::string("y"), Rule::repeat(Rule::string("z"))]),
rule: Rule::seq(vec![Rule::string("y"), Rule::string("z")]),
},
],
..Default::default()
@ -1128,7 +1186,8 @@ mod tests {
]
.into_iter()
.collect()
)
),
symbol_ids: Some(vec![5]),
}
);
assert_eq!(
@ -1140,7 +1199,8 @@ mod tests {
extra: true,
subtypes: None,
children: None,
fields: Some(BTreeMap::default())
fields: Some(BTreeMap::default()),
symbol_ids: Some(vec![6]),
}
);
assert_eq!(
@ -1152,7 +1212,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![1]),
}
);
assert_eq!(
@ -1164,7 +1225,8 @@ mod tests {
extra: false,
subtypes: None,
children: None,
fields: None
fields: None,
symbol_ids: Some(vec![2]),
}
);
}
@ -1226,6 +1288,7 @@ mod tests {
named: true,
},
]),
symbol_ids: None,
}
);
assert_eq!(
@ -1251,7 +1314,8 @@ mod tests {
),]
.into_iter()
.collect()
)
),
symbol_ids: Some(vec![4]),
}
);
}
@ -1329,7 +1393,8 @@ mod tests {
),]
.into_iter()
.collect()
)
),
symbol_ids: Some(vec![5]),
}
);
assert_eq!(
@ -1349,6 +1414,7 @@ mod tests {
},]
}),
fields: Some(BTreeMap::new()),
symbol_ids: Some(vec![6]),
}
);
}
@ -1402,6 +1468,7 @@ mod tests {
]
}),
fields: Some(BTreeMap::new()),
symbol_ids: Some(vec![3]),
}
);
}
@ -1450,6 +1517,7 @@ mod tests {
rule: Rule::pattern("[\\w-]+", ""),
},
],
expected_conflicts: vec![vec!["type".to_string(), "expression".to_string()]],
..Default::default()
})
.unwrap();
@ -1465,6 +1533,7 @@ mod tests {
subtypes: None,
children: None,
fields: None,
symbol_ids: Some(vec![2, 3]),
})
);
assert_eq!(
@ -1477,6 +1546,7 @@ mod tests {
subtypes: None,
children: None,
fields: None,
symbol_ids: Some(vec![7]),
})
);
}
@ -1542,6 +1612,7 @@ mod tests {
.into_iter()
.collect()
),
symbol_ids: Some(vec![3]),
}
);
}
@ -1570,7 +1641,8 @@ mod tests {
extra: false,
fields: Some(BTreeMap::new()),
children: None,
subtypes: None
subtypes: None,
symbol_ids: Some(vec![3]),
}]
);
}
@ -1678,6 +1750,7 @@ mod tests {
.into_iter()
.collect()
),
symbol_ids: Some(vec![7, 8]),
},
NodeInfoJSON {
kind: "script".to_string(),
@ -1695,6 +1768,7 @@ mod tests {
}]
}),
fields: Some(BTreeMap::new()),
symbol_ids: Some(vec![6]),
}
]
);
@ -1751,6 +1825,7 @@ mod tests {
}]
}),
fields: Some(BTreeMap::new()),
symbol_ids: Some(vec![3, 5]),
}
);
}
@ -2056,15 +2131,25 @@ mod tests {
}
fn get_node_types(grammar: &InputGrammar) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> {
let (syntax_grammar, lexical_grammar, _, default_aliases) =
prepare_grammar(grammar).unwrap();
let variable_info =
get_variable_info(&syntax_grammar, &lexical_grammar, &default_aliases).unwrap();
let GrammarIntrospection {
syntax_grammar,
lexical_grammar,
simple_aliases,
variable_info,
supertype_symbol_map: _,
tables: _,
symbol_ids,
alias_ids: _,
unique_aliases,
} = introspect_grammar(grammar, None, OptLevel::default()).unwrap();
generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&default_aliases,
&simple_aliases,
&unique_aliases,
&variable_info,
&symbol_ids,
)
}

View file

@ -1,11 +1,14 @@
use std::{
cmp,
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
collections::{BTreeMap, BTreeSet, HashMap},
fmt::Write,
mem::swap,
};
use crate::LANGUAGE_VERSION;
use crate::{
introspect_grammar::{metadata_for_symbol, sanitize_identifier},
LANGUAGE_VERSION,
};
use indoc::indoc;
use super::{
@ -78,10 +81,9 @@ struct Generator {
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
symbol_order: HashMap<Symbol, usize>,
symbol_ids: HashMap<Symbol, String>,
symbol_ids: HashMap<Symbol, (String, u16)>,
alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<Alias>,
unique_aliases: Vec<(Alias, u16)>,
symbol_map: HashMap<Symbol, Symbol>,
reserved_word_sets: Vec<TokenSet>,
reserved_word_set_ids_by_parse_state: Vec<usize>,
@ -175,15 +177,7 @@ impl Generator {
}
fn init(&mut self) {
let mut symbol_identifiers = HashSet::new();
for i in 0..self.parse_table.symbols.len() {
self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers);
}
self.symbol_ids.insert(
Symbol::end_of_nonterminal_extra(),
self.symbol_ids[&Symbol::end()].clone(),
);
// symbol_ids and alias_ids are now passed in from the constructor
self.symbol_map = HashMap::new();
for symbol in &self.parse_table.symbols {
@ -237,32 +231,6 @@ impl Generator {
self.field_names.insert(i, field_name.clone());
}
}
for alias in &production_info.alias_sequence {
// Generate a mapping from aliases to C identifiers.
if let Some(alias) = &alias {
// Some aliases match an existing symbol in the grammar.
let alias_id =
if let Some(existing_symbol) = self.symbols_for_alias(alias).first() {
self.symbol_ids[&self.symbol_map[existing_symbol]].clone()
}
// Other aliases don't match any existing symbol, and need their own
// identifiers.
else {
if let Err(i) = self.unique_aliases.binary_search(alias) {
self.unique_aliases.insert(i, alias.clone());
}
if alias.is_named {
format!("alias_sym_{}", self.sanitize_identifier(&alias.value))
} else {
format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value))
}
};
self.alias_ids.entry(alias.clone()).or_insert(alias_id);
}
}
}
for (ix, (symbol, _)) in self.large_character_sets.iter().enumerate() {
@ -272,7 +240,7 @@ impl Generator {
.count()
+ 1;
let constant_name = if let Some(symbol) = symbol {
format!("{}_character_set_{}", self.symbol_ids[symbol], count)
format!("{}_character_set_{}", self.symbol_ids[symbol].0, count)
} else {
format!("extras_character_set_{count}")
};
@ -302,7 +270,7 @@ impl Generator {
for (supertype, subtypes) in &self.supertype_symbol_map {
if let Some(supertype) = self.symbol_ids.get(supertype) {
self.supertype_map
.entry(supertype.clone())
.entry(supertype.0.clone())
.or_insert_with(|| subtypes.clone());
}
}
@ -424,18 +392,17 @@ impl Generator {
fn add_symbol_enum(&mut self) {
add_line!(self, "enum ts_symbol_identifiers {{");
indent!(self);
self.symbol_order.insert(Symbol::end(), 0);
let mut i = 1;
// symbol_ids already contains both string ID and numeric ID
for symbol in &self.parse_table.symbols {
if *symbol != Symbol::end() {
self.symbol_order.insert(*symbol, i);
add_line!(self, "{} = {i},", self.symbol_ids[symbol]);
i += 1;
if *symbol == Symbol::end() {
continue;
}
let (string_id, numeric_id) = &self.symbol_ids[symbol];
add_line!(self, "{} = {numeric_id},", string_id);
}
for alias in &self.unique_aliases {
add_line!(self, "{} = {i},", self.alias_ids[alias]);
i += 1;
// Add aliases after all symbols
for (alias, numeric_id) in self.unique_aliases.iter() {
add_line!(self, "{} = {},", self.alias_ids[alias], numeric_id);
}
dedent!(self);
add_line!(self, "}};");
@ -453,14 +420,14 @@ impl Generator {
alias.value.as_str()
}),
);
add_line!(self, "[{}] = \"{name}\",", self.symbol_ids[symbol]);
add_line!(self, "[{}] = \"{name}\",", self.symbol_ids[symbol].0);
}
for alias in &self.unique_aliases {
add_line!(
self,
"[{}] = \"{}\",",
self.alias_ids[alias],
self.sanitize_string(&alias.value)
self.alias_ids[&alias.0],
self.sanitize_string(&alias.0.value)
);
}
dedent!(self);
@ -475,8 +442,8 @@ impl Generator {
add_line!(
self,
"[{}] = {},",
self.symbol_ids[symbol],
self.symbol_ids[&self.symbol_map[symbol]],
self.symbol_ids[symbol].0,
self.symbol_ids[&self.symbol_map[symbol]].0,
);
}
@ -484,8 +451,8 @@ impl Generator {
add_line!(
self,
"[{}] = {},",
self.alias_ids[alias],
self.alias_ids[alias],
self.alias_ids[&alias.0],
self.alias_ids[&alias.0],
);
}
@ -524,7 +491,7 @@ impl Generator {
);
indent!(self);
for symbol in &self.parse_table.symbols {
add_line!(self, "[{}] = {{", self.symbol_ids[symbol]);
add_line!(self, "[{}] = {{", self.symbol_ids[symbol].0);
indent!(self);
if let Some(Alias { is_named, .. }) = self.default_aliases.get(symbol) {
add_line!(self, ".visible = true,");
@ -556,10 +523,10 @@ impl Generator {
add_line!(self, "}},");
}
for alias in &self.unique_aliases {
add_line!(self, "[{}] = {{", self.alias_ids[alias]);
add_line!(self, "[{}] = {{", self.alias_ids[&alias.0]);
indent!(self);
add_line!(self, ".visible = true,");
add_line!(self, ".named = {},", alias.is_named);
add_line!(self, ".named = {},", &alias.0.is_named);
dedent!(self);
add_line!(self, "}},");
}
@ -631,8 +598,8 @@ impl Generator {
);
indent!(self);
for (symbol, alias_ids) in alias_ids_by_symbol {
let symbol_id = &self.symbol_ids[symbol];
let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]];
let symbol_id = &self.symbol_ids[symbol].0;
let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]].0;
add_line!(self, "{symbol_id}, {},", 1 + alias_ids.len());
indent!(self);
add_line!(self, "{public_symbol_id},");
@ -769,13 +736,15 @@ impl Generator {
subtypes
.iter()
.flat_map(|s| match s {
ChildType::Normal(symbol) => vec![self.symbol_ids.get(symbol).cloned()],
ChildType::Normal(symbol) => {
vec![self.symbol_ids.get(symbol).map(|t| t.0.clone())]
}
ChildType::Aliased(alias) => {
self.alias_ids.get(alias).cloned().map_or_else(
|| {
self.symbols_for_alias(alias)
.into_iter()
.map(|s| self.symbol_ids.get(&s).cloned())
.map(|s| self.symbol_ids.get(&s).map(|t| t.0.clone()))
.collect()
},
|a| vec![Some(a)],
@ -854,7 +823,7 @@ impl Generator {
fn add_lex_state(&mut self, _state_ix: usize, state: LexState) {
if let Some(accept_action) = state.accept_action {
add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action].0);
}
if let Some(eof_action) = state.eof_action {
@ -1198,7 +1167,7 @@ impl Generator {
add_line!(self, "[{id}] = {{");
indent!(self);
for token in set.iter() {
add_line!(self, "{},", self.symbol_ids[&token]);
add_line!(self, "{},", self.symbol_ids[&token].0);
}
dedent!(self);
add_line!(self, "}},");
@ -1238,7 +1207,7 @@ impl Generator {
self,
"[{}] = {},",
self.external_token_id(token),
self.symbol_ids[&id_token],
self.symbol_ids[&id_token].0,
);
}
dedent!(self);
@ -1312,14 +1281,14 @@ impl Generator {
nonterminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
nonterminal_entries.extend(state.nonterminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1));
nonterminal_entries.sort_unstable_by_key(|k| k.0);
for (symbol, action) in &nonterminal_entries {
add_line!(
self,
"[{}] = STATE({}),",
self.symbol_ids[symbol],
self.symbol_ids[symbol].0,
match action {
GotoAction::Goto(state) => *state,
GotoAction::ShiftExtra => i,
@ -1333,7 +1302,11 @@ impl Generator {
&mut parse_table_entries,
&mut next_parse_action_list_index,
);
add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]);
add_line!(
self,
"[{}] = ACTIONS({entry_id}),",
self.symbol_ids[symbol].0
);
}
dedent!(self);
@ -1362,7 +1335,7 @@ impl Generator {
terminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
terminal_entries.sort_unstable_by_key(|e| self.symbol_ids.get(e.0).map(|t| &t.1));
// In a given parse state, many lookahead symbols have the same actions.
// So in the "small state" representation, group symbols by their action
@ -1415,7 +1388,7 @@ impl Generator {
symbols.sort_unstable();
indent!(self);
for symbol in symbols {
add_line!(self, "{},", self.symbol_ids[symbol]);
add_line!(self, "{},", self.symbol_ids[symbol].0);
}
dedent!(self);
}
@ -1491,7 +1464,7 @@ impl Generator {
add!(
self,
"REDUCE({}, {child_count}, {dynamic_precedence}, {production_id})",
self.symbol_ids[&symbol]
self.symbol_ids[&symbol].0
);
}
}
@ -1603,7 +1576,7 @@ impl Generator {
add_line!(
self,
".keyword_capture_token = {},",
self.symbol_ids[&keyword_capture_token]
self.symbol_ids[&keyword_capture_token].0
);
}
@ -1702,38 +1675,7 @@ impl Generator {
}
fn external_token_id(&self, token: &ExternalToken) -> String {
format!(
"ts_external_token_{}",
self.sanitize_identifier(&token.name)
)
}
fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet<String>) {
let mut id;
if symbol == Symbol::end() {
id = "ts_builtin_sym_end".to_string();
} else {
let (name, kind) = self.metadata_for_symbol(symbol);
id = match kind {
VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)),
VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)),
VariableType::Hidden | VariableType::Named => {
format!("sym_{}", self.sanitize_identifier(name))
}
};
let mut suffix_number = 1;
let mut suffix = String::new();
while used_identifiers.contains(&id) {
id.drain(id.len() - suffix.len()..);
suffix_number += 1;
suffix = suffix_number.to_string();
id += &suffix;
}
}
used_identifiers.insert(id.clone());
self.symbol_ids.insert(symbol, id);
format!("ts_external_token_{}", sanitize_identifier(&token.name))
}
fn field_id(&self, field_name: &str) -> String {
@ -1741,21 +1683,7 @@ impl Generator {
}
fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
match symbol.kind {
SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
SymbolType::NonTerminal => {
let variable = &self.syntax_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::External => {
let token = &self.syntax_grammar.external_tokens[symbol.index];
(&token.name, token.kind)
}
}
metadata_for_symbol(symbol, &self.syntax_grammar, &self.lexical_grammar)
}
fn symbols_for_alias(&self, alias: &Alias) -> Vec<Symbol> {
@ -1775,101 +1703,6 @@ impl Generator {
.collect()
}
fn sanitize_identifier(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
if c.is_ascii_alphanumeric() || c == '_' {
result.push(c);
} else {
'special_chars: {
let replacement = match c {
' ' if name.len() == 1 => "SPACE",
'~' => "TILDE",
'`' => "BQUOTE",
'!' => "BANG",
'@' => "AT",
'#' => "POUND",
'$' => "DOLLAR",
'%' => "PERCENT",
'^' => "CARET",
'&' => "AMP",
'*' => "STAR",
'(' => "LPAREN",
')' => "RPAREN",
'-' => "DASH",
'+' => "PLUS",
'=' => "EQ",
'{' => "LBRACE",
'}' => "RBRACE",
'[' => "LBRACK",
']' => "RBRACK",
'\\' => "BSLASH",
'|' => "PIPE",
':' => "COLON",
';' => "SEMI",
'"' => "DQUOTE",
'\'' => "SQUOTE",
'<' => "LT",
'>' => "GT",
',' => "COMMA",
'.' => "DOT",
'?' => "QMARK",
'/' => "SLASH",
'\n' => "LF",
'\r' => "CR",
'\t' => "TAB",
'\0' => "NULL",
'\u{0001}' => "SOH",
'\u{0002}' => "STX",
'\u{0003}' => "ETX",
'\u{0004}' => "EOT",
'\u{0005}' => "ENQ",
'\u{0006}' => "ACK",
'\u{0007}' => "BEL",
'\u{0008}' => "BS",
'\u{000b}' => "VTAB",
'\u{000c}' => "FF",
'\u{000e}' => "SO",
'\u{000f}' => "SI",
'\u{0010}' => "DLE",
'\u{0011}' => "DC1",
'\u{0012}' => "DC2",
'\u{0013}' => "DC3",
'\u{0014}' => "DC4",
'\u{0015}' => "NAK",
'\u{0016}' => "SYN",
'\u{0017}' => "ETB",
'\u{0018}' => "CAN",
'\u{0019}' => "EM",
'\u{001a}' => "SUB",
'\u{001b}' => "ESC",
'\u{001c}' => "FS",
'\u{001d}' => "GS",
'\u{001e}' => "RS",
'\u{001f}' => "US",
'\u{007F}' => "DEL",
'\u{FEFF}' => "BOM",
'\u{0080}'..='\u{FFFF}' => {
write!(result, "u{:04x}", c as u32).unwrap();
break 'special_chars;
}
'\u{10000}'..='\u{10FFFF}' => {
write!(result, "U{:08x}", c as u32).unwrap();
break 'special_chars;
}
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(),
' ' => break 'special_chars,
};
if !result.is_empty() && !result.ends_with('_') {
result.push('_');
}
result += replacement;
}
}
}
result
}
fn sanitize_string(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
@ -1920,18 +1753,20 @@ impl Generator {
/// # Arguments
///
/// * `name` - A string slice containing the name of the language
/// * `parse_table` - The generated parse table for the language
/// * `main_lex_table` - The generated lexing table for the language
/// * `keyword_lex_table` - The generated keyword lexing table for the language
/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any.
/// * `tables` - The generated tables for the language
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
/// * `default_aliases` - A map describing the global rename rules that should apply. the keys are
/// symbols that are *always* aliased in the same way, and the values are the aliases that are
/// applied to those symbols.
/// * `symbol_ids` - HashMap mapping each Symbol to its C identifier string
/// * `alias_ids` - HashMap mapping each Alias to its C identifier string
/// * `unique_aliases` - Sorted vector of unique aliases
/// * `abi_version` - The language ABI version that should be generated. Usually you want
/// Tree-sitter's current version, but right after making an ABI change, it may be useful to
/// generate code with the previous ABI.
/// * `semantic_version` - Optional semantic version of the parser
/// * `supertype_symbol_map` - Map of supertype symbols
#[allow(clippy::too_many_arguments)]
pub fn render_c_code(
name: &str,
@ -1939,6 +1774,9 @@ pub fn render_c_code(
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
symbol_ids: HashMap<Symbol, (String, u16)>,
alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<(Alias, u16)>,
abi_version: usize,
semantic_version: Option<(u8, u8, u8)>,
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
@ -1958,6 +1796,9 @@ pub fn render_c_code(
syntax_grammar,
lexical_grammar,
default_aliases,
symbol_ids,
alias_ids,
unique_aliases,
abi_version,
metadata: semantic_version.map(|(major_version, minor_version, patch_version)| Metadata {
major_version,

View file

@ -41,6 +41,12 @@
"items": {
"$ref": "#/definitions/NodeType"
}
},
"symbol_ids": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"oneOf": [
@ -105,4 +111,4 @@
}
}
}
}
}

6
flake.lock generated
View file

@ -2,11 +2,11 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1756787288,
"narHash": "sha256-rw/PHa1cqiePdBxhF66V7R+WAP8WekQ0mCDG4CFqT8Y=",
"lastModified": 1762977756,
"narHash": "sha256-4PqRErxfe+2toFJFgcRKZ0UI9NSIOJa+7RXVtBhy4KE=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d0fc30899600b9b3466ddb260fd83deb486c32f1",
"rev": "c5ae371f1a6a7fd27823bc500d9390b38c05fa55",
"type": "github"
},
"original": {

View file

@ -326,7 +326,7 @@
clippy
rust-analyzer
rustfmt
cargo-llvm-cov
# cargo-llvm-cov
cmake
gnumake