refactor: extract grammar introspection into separate module

This commit is contained in:
bglgwyng 2025-11-09 16:19:49 +09:00
parent d2a2b4005a
commit ab9b098aad
3 changed files with 89 additions and 67 deletions

View file

@ -13,7 +13,6 @@ use std::{
use anyhow::Result;
use bitflags::bitflags;
use log::warn;
use node_types::VariableInfo;
use regex::{Regex, RegexBuilder};
use rules::{Alias, Symbol};
#[cfg(feature = "load")]
@ -26,6 +25,7 @@ use thiserror::Error;
mod build_tables;
mod dedup;
mod grammars;
mod introspect_grammar;
mod nfa;
mod node_types;
pub mod parse_grammar;
@ -36,15 +36,13 @@ mod render;
mod rules;
mod tables;
use build_tables::build_tables;
pub use build_tables::ParseTableBuilderError;
use grammars::{InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar};
use introspect_grammar::{introspect_grammar, GrammarIntrospection};
pub use node_types::{SuperTypeCycleError, VariableInfoError};
use parse_grammar::parse_grammar;
pub use parse_grammar::ParseGrammarError;
use prepare_grammar::prepare_grammar;
pub use prepare_grammar::PrepareGrammarError;
use render::{generate_symbol_ids, render_c_code};
use render::render_c_code;
pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
use crate::{build_tables::Tables, node_types::ChildType};
@ -56,18 +54,6 @@ static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
.unwrap()
});
/// Bundle of every artifact that `introspect_grammar` computes for a single
/// input grammar, so callers can pick the pieces they need.
struct GrammarIntrospection {
/// Syntax grammar returned by `prepare_grammar`.
syntax_grammar: SyntaxGrammar,
/// Lexical grammar returned by `prepare_grammar`.
lexical_grammar: LexicalGrammar,
/// Symbol-to-alias map returned by `prepare_grammar`.
simple_aliases: BTreeMap<Symbol, Alias>,
/// Result of `node_types::get_variable_info`.
variable_info: Vec<VariableInfo>,
/// Result of `node_types::get_supertype_symbol_map`.
supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
/// Tables produced by `build_tables`.
tables: Tables,
/// String and numeric ID for each symbol, as returned by `generate_symbol_ids`.
symbol_ids: HashMap<Symbol, (String, u16)>,
/// Alias-to-string map returned by `generate_symbol_ids`.
alias_ids: HashMap<Alias, String>,
/// Alias list returned by `generate_symbol_ids`.
unique_aliases: Vec<Alias>,
}
// NOTE: This constant must be kept in sync with the definition of
// `TREE_SITTER_LANGUAGE_VERSION` in `lib/include/tree_sitter/api.h`.
const LANGUAGE_VERSION: usize = 15;
@ -368,49 +354,6 @@ pub fn generate_parser_for_grammar(
Ok((input_grammar.name, c_code))
}
/// Runs the grammar analysis pipeline and returns every intermediate artifact.
///
/// The stages run in a fixed order: `prepare_grammar`, variable-info and
/// supertype-map computation, `build_tables`, and finally
/// `generate_symbol_ids`.
///
/// `report_symbol_name` and `optimizations` are forwarded unchanged to
/// `build_tables`.
///
/// # Errors
///
/// Propagates any error raised by `prepare_grammar`,
/// `node_types::get_variable_info` or `build_tables`.
fn introspect_grammar(
    input_grammar: &InputGrammar,
    report_symbol_name: Option<&str>,
    optimizations: OptLevel,
) -> Result<GrammarIntrospection, GenerateError> {
    // Stage 1: split the input grammar into its prepared parts.
    let (syntax, lexical, inlines, aliases) = prepare_grammar(input_grammar)?;

    // Stage 2: node-type metadata derived from the prepared grammars.
    let var_info = node_types::get_variable_info(&syntax, &lexical, &aliases)?;
    let supertypes = node_types::get_supertype_symbol_map(&syntax, &aliases, &var_info);

    // Stage 3: build the tables.
    let tables = build_tables(
        &syntax,
        &lexical,
        &aliases,
        &var_info,
        &inlines,
        report_symbol_name,
        optimizations,
    )?;

    // Stage 4: symbol IDs (string and numeric) are generated here so that
    // they are already available before any C code is rendered.
    let (symbol_ids, alias_ids, unique_aliases) =
        generate_symbol_ids(&tables.parse_table, &syntax, &lexical, &aliases);

    Ok(GrammarIntrospection {
        syntax_grammar: syntax,
        lexical_grammar: lexical,
        simple_aliases: aliases,
        variable_info: var_info,
        supertype_symbol_map: supertypes,
        tables,
        symbol_ids,
        alias_ids,
        unique_aliases,
    })
}
/// This will read the `tree-sitter.json` config file and attempt to extract the version.
///
/// If the file is not found in the current directory or any of its parent directories, this will

View file

@ -0,0 +1,66 @@
use std::collections::{BTreeMap, HashMap};
use crate::{
build_tables::{build_tables, Tables},
grammars::{InputGrammar, LexicalGrammar, SyntaxGrammar},
node_types::{self, ChildType, VariableInfo},
prepare_grammar::prepare_grammar,
render::generate_symbol_ids,
rules::{Alias, Symbol},
GenerateError, OptLevel,
};
/// Bundle of every artifact that `introspect_grammar` computes for a single
/// input grammar, so callers can pick the pieces they need.
pub struct GrammarIntrospection {
/// Syntax grammar returned by `prepare_grammar`.
pub syntax_grammar: SyntaxGrammar,
/// Lexical grammar returned by `prepare_grammar`.
pub lexical_grammar: LexicalGrammar,
/// Symbol-to-alias map returned by `prepare_grammar`.
pub simple_aliases: BTreeMap<Symbol, Alias>,
/// Result of `node_types::get_variable_info`.
pub variable_info: Vec<VariableInfo>,
/// Result of `node_types::get_supertype_symbol_map`.
pub supertype_symbol_map: BTreeMap<Symbol, Vec<ChildType>>,
/// Tables produced by `build_tables`.
pub tables: Tables,
/// String and numeric ID for each symbol, as returned by `generate_symbol_ids`.
pub symbol_ids: HashMap<Symbol, (String, u16)>,
/// Alias-to-string map returned by `generate_symbol_ids`.
pub alias_ids: HashMap<Alias, String>,
/// Alias list returned by `generate_symbol_ids`.
pub unique_aliases: Vec<Alias>,
}
/// Runs the grammar analysis pipeline and returns every intermediate artifact.
///
/// The stages run in a fixed order: `prepare_grammar`, variable-info and
/// supertype-map computation, `build_tables`, and finally
/// `generate_symbol_ids`.
///
/// `report_symbol_name` and `optimizations` are forwarded unchanged to
/// `build_tables`.
///
/// # Errors
///
/// Propagates any error raised by `prepare_grammar`,
/// `node_types::get_variable_info` or `build_tables`.
pub fn introspect_grammar(
    input_grammar: &InputGrammar,
    report_symbol_name: Option<&str>,
    optimizations: OptLevel,
) -> Result<GrammarIntrospection, GenerateError> {
    // Stage 1: split the input grammar into its prepared parts.
    let (syntax, lexical, inlines, aliases) = prepare_grammar(input_grammar)?;

    // Stage 2: node-type metadata derived from the prepared grammars.
    let var_info = node_types::get_variable_info(&syntax, &lexical, &aliases)?;
    let supertypes = node_types::get_supertype_symbol_map(&syntax, &aliases, &var_info);

    // Stage 3: build the tables.
    let tables = build_tables(
        &syntax,
        &lexical,
        &aliases,
        &var_info,
        &inlines,
        report_symbol_name,
        optimizations,
    )?;

    // Stage 4: symbol IDs (string and numeric) are generated here so that
    // they are already available before any C code is rendered.
    let (symbol_ids, alias_ids, unique_aliases) =
        generate_symbol_ids(&tables.parse_table, &syntax, &lexical, &aliases);

    Ok(GrammarIntrospection {
        syntax_grammar: syntax,
        lexical_grammar: lexical,
        simple_aliases: aliases,
        variable_info: var_info,
        supertype_symbol_map: supertypes,
        tables,
        symbol_ids,
        alias_ids,
        unique_aliases,
    })
}

View file

@ -857,8 +857,10 @@ mod tests {
grammars::{
InputGrammar, LexicalVariable, Production, ProductionStep, SyntaxVariable, Variable,
},
introspect_grammar,
prepare_grammar::prepare_grammar,
rules::Rule,
GrammarIntrospection, OptLevel,
};
#[test]
@ -2091,17 +2093,28 @@ mod tests {
}
/// Test helper: produces the node-types JSON entries for `grammar`.
///
/// The grammar is run through `introspect_grammar` with no report symbol and
/// the default optimization level.
///
/// # Panics
///
/// Panics if `introspect_grammar` returns an error.
fn get_node_types(grammar: &InputGrammar) -> SuperTypeCycleResult<Vec<NodeInfoJSON>> {
    // Only four of the introspection results are needed here; the rest are
    // dropped via the `..` rest pattern.
    let GrammarIntrospection {
        syntax_grammar,
        lexical_grammar,
        simple_aliases,
        variable_info,
        ..
    } = introspect_grammar(grammar, None, OptLevel::default()).unwrap();

    generate_node_types_json(
        &syntax_grammar,
        &lexical_grammar,
        &simple_aliases,
        &variable_info,
        // TODO: use `symbol_ids`
        &HashMap::new(),
    )
}
fn build_syntax_grammar(