tree-sitter/cli/src/generate/render.rs

1711 lines
63 KiB
Rust
Raw Normal View History

2024-04-09 13:35:08 -04:00
use std::{
cmp,
collections::{HashMap, HashSet},
fmt::Write,
mem::swap,
};
use super::{
build_tables::Tables,
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
nfa::CharacterSet,
rules::{Alias, AliasMap, Symbol, SymbolType},
tables::{
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
ParseTableEntry,
},
};
// Cap on the entry-count threshold that `Generator::init` uses to decide which
// leading parse states keep the full array ("large") representation; the
// effective threshold is the smaller of this and half the symbol count.
const SMALL_STATE_THRESHOLD: usize = 64;
// NOTE(review): presumably the oldest ABI version the generator may target; it is
// not referenced in this part of the file — confirm against the caller.
const ABI_VERSION_MIN: usize = 13;
// NOTE(review): presumably the newest ABI version the generator may target, taken
// from the `tree_sitter` crate — confirm against the caller.
const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
// `Generator::generate` only emits the primary-state-id list when the target ABI
// version is at least this value.
const ABI_VERSION_WITH_PRIMARY_STATES: usize = 14;
2018-12-23 10:16:03 -08:00
/// Appends formatted text to `$gen.buffer`, with no indentation and no trailing
/// newline. The arguments after the first follow `format_args!` syntax.
macro_rules! add {
    ($gen: tt, $($fmt_args: tt)*) => {{
        // `write_fmt` is invoked as a method on the buffer field itself (rather than
        // through an explicit `&mut`) so that the format arguments may still read
        // from other parts of `$gen`.
        $gen.buffer.write_fmt(format_args!($($fmt_args)*)).unwrap();
    }}
}
2019-01-02 12:34:40 -08:00
/// Appends the current indentation to `$this.buffer`: one indentation unit for
/// each level in `$this.indent_level`. Appends nothing at level zero.
macro_rules! add_whitespace {
    ($this:tt) => {{
        for _ in 0..$this.indent_level {
            // Pushing onto a `String` cannot fail, so no `unwrap` is needed here.
            $this.buffer.push(' ');
        }
    }};
}
/// Appends one complete line to `$gen.buffer`: the current indentation, then the
/// formatted text, then a newline.
macro_rules! add_line {
    ($gen: tt, $($fmt_args: tt)*) => {
        add_whitespace!($gen);
        // Called as a method on the buffer field so that the format arguments may
        // still read from other parts of `$gen`.
        $gen.buffer.write_fmt(format_args!($($fmt_args)*)).unwrap();
        $gen.buffer.push('\n');
    }
}
2018-12-23 10:16:03 -08:00
/// Increases `$this.indent_level` by one, so that subsequent lines are indented
/// one level deeper.
macro_rules! indent {
    ($this:tt) => {
        $this.indent_level += 1;
    };
}
/// Decreases `$this.indent_level` by one.
///
/// Panics if the level is already zero, which means a `dedent!` was issued
/// without a matching `indent!`.
macro_rules! dedent {
    ($this:tt) => {
        assert_ne!(
            $this.indent_level, 0,
            "dedent! called without a matching indent!"
        );
        $this.indent_level -= 1;
    };
}
struct Generator {
buffer: String,
indent_level: usize,
language_name: String,
parse_table: ParseTable,
main_lex_table: LexTable,
keyword_lex_table: LexTable,
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
large_character_set_constant_names: Vec<String>,
large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
symbol_order: HashMap<Symbol, usize>,
symbol_ids: HashMap<Symbol, String>,
2018-12-23 10:16:03 -08:00
alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<Alias>,
symbol_map: HashMap<Symbol, Symbol>,
2019-02-07 12:29:20 -08:00
field_names: Vec<String>,
#[allow(unused)]
abi_version: usize,
}
impl Generator {
fn generate(mut self) -> String {
2019-02-07 12:29:20 -08:00
self.init();
self.add_includes();
self.add_pragmas();
self.add_stats();
self.add_symbol_enum();
self.add_symbol_names_list();
self.add_unique_symbol_map();
2018-12-23 10:16:03 -08:00
self.add_symbol_metadata_list();
if !self.field_names.is_empty() {
self.add_field_name_enum();
2019-02-07 12:29:20 -08:00
self.add_field_name_names_list();
self.add_field_sequences();
}
if !self.parse_table.production_infos.is_empty() {
self.add_alias_sequences();
}
2018-12-23 10:16:03 -08:00
self.add_non_terminal_alias_map();
if self.abi_version >= ABI_VERSION_WITH_PRIMARY_STATES {
self.add_primary_state_id_list();
}
// Generate a helper function for each large character set.
// let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
for ix in 0..self.large_character_sets.len() {
self.add_character_set(ix);
}
2018-12-23 10:16:03 -08:00
let mut main_lex_table = LexTable::default();
swap(&mut main_lex_table, &mut self.main_lex_table);
self.add_lex_function("ts_lex", main_lex_table);
2018-12-23 10:16:03 -08:00
if self.keyword_capture_token.is_some() {
let mut keyword_lex_table = LexTable::default();
swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
self.add_lex_function("ts_lex_keywords", keyword_lex_table);
2018-12-23 10:16:03 -08:00
}
self.add_lex_modes_list();
self.add_parse_table();
2018-12-23 10:16:03 -08:00
if !self.syntax_grammar.external_tokens.is_empty() {
self.add_external_token_enum();
self.add_external_scanner_symbol_map();
self.add_external_scanner_states_list();
}
self.add_parser_export();
self.buffer
}
2019-02-07 12:29:20 -08:00
fn init(&mut self) {
let mut symbol_identifiers = HashSet::new();
for i in 0..self.parse_table.symbols.len() {
self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers);
}
self.symbol_ids.insert(
Symbol::end_of_nonterminal_extra(),
self.symbol_ids[&Symbol::end()].clone(),
);
2019-02-07 12:29:20 -08:00
self.symbol_map = HashMap::new();
2024-02-04 01:30:33 -05:00
for symbol in &self.parse_table.symbols {
let mut mapping = symbol;
// There can be multiple symbols in the grammar that have the same name and kind,
// due to simple aliases. When that happens, ensure that they map to the same
// public-facing symbol. If one of the symbols is not aliased, choose that one
// to be the public-facing symbol. Otherwise, pick the symbol with the lowest
// numeric value.
if let Some(alias) = self.default_aliases.get(symbol) {
let kind = alias.kind();
for other_symbol in &self.parse_table.symbols {
if let Some(other_alias) = self.default_aliases.get(other_symbol) {
if other_symbol < mapping && other_alias == alias {
mapping = other_symbol;
}
} else if self.metadata_for_symbol(*other_symbol) == (&alias.value, kind) {
mapping = other_symbol;
break;
}
}
}
// Two anonymous tokens with different flags but the same string value
// should be represented with the same symbol in the public API. Examples:
2024-04-09 21:40:17 -04:00
// * "<" and token(prec(1, "<"))
// * "(" and token.immediate("(")
else if symbol.is_terminal() {
let metadata = self.metadata_for_symbol(*symbol);
for other_symbol in &self.parse_table.symbols {
let other_metadata = self.metadata_for_symbol(*other_symbol);
if other_metadata == metadata {
if let Some(mapped) = self.symbol_map.get(other_symbol) {
if mapped == symbol {
break;
}
}
mapping = other_symbol;
break;
}
}
}
self.symbol_map.insert(*symbol, *mapping);
}
for production_info in &self.parse_table.production_infos {
// Build a list of all field names
for field_name in production_info.field_map.keys() {
2024-02-04 01:30:33 -05:00
if let Err(i) = self.field_names.binary_search(field_name) {
self.field_names.insert(i, field_name.clone());
}
}
for alias in &production_info.alias_sequence {
// Generate a mapping from aliases to C identifiers.
if let Some(alias) = &alias {
2024-02-04 01:30:33 -05:00
let existing_symbol = self.parse_table.symbols.iter().copied().find(|symbol| {
self.default_aliases.get(symbol).map_or_else(
|| {
let (name, kind) = self.metadata_for_symbol(*symbol);
name == alias.value && kind == alias.kind()
},
|default_alias| default_alias == alias,
)
});
// Some aliases match an existing symbol in the grammar.
2024-04-09 21:42:59 -04:00
let alias_id = if let Some(existing_symbol) = existing_symbol {
self.symbol_ids[&self.symbol_map[&existing_symbol]].clone()
}
2024-04-09 21:40:17 -04:00
// Other aliases don't match any existing symbol, and need their own
// identifiers.
else {
if let Err(i) = self.unique_aliases.binary_search(alias) {
self.unique_aliases.insert(i, alias.clone());
}
2024-04-09 21:42:59 -04:00
if alias.is_named {
format!("alias_sym_{}", self.sanitize_identifier(&alias.value))
} else {
format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value))
2024-04-09 21:42:59 -04:00
}
};
self.alias_ids.entry(alias.clone()).or_insert(alias_id);
}
}
}
// Determine which states should use the "small state" representation, and which should
// use the normal array representation.
let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2);
self.large_state_count = self
.parse_table
.states
.iter()
.enumerate()
.take_while(|(i, s)| {
*i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold
})
.count();
2019-02-07 12:29:20 -08:00
}
/// Writes the `#include` of the parser header, followed by a blank line.
fn add_includes(&mut self) {
    let header_lines = ["#include \"tree_sitter/parser.h\"", ""];
    for text in header_lines {
        add_line!(self, "{}", text);
    }
}
/// Writes compiler pragmas for the generated file.
///
/// Always silences the missing-field-initializers diagnostic for GCC/Clang.
/// When the main lex table has more than 300 states, also turns optimizations
/// off for MSVC, Clang and GCC.
fn add_pragmas(&mut self) {
    add_line!(self, "#if defined(__GNUC__) || defined(__clang__)");
    add_line!(
        self,
        "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""
    );
    add_line!(self, "#endif");
    add_line!(self, "");

    // Compiling large lexer functions can be very slow. Disabling optimizations
    // is not ideal, but only a very small fraction of overall parse time is
    // spent lexing, so the performance impact of this is negligible.
    if self.main_lex_table.states.len() > 300 {
        add_line!(self, "#ifdef _MSC_VER");
        add_line!(self, "#pragma optimize(\"\", off)");
        add_line!(self, "#elif defined(__clang__)");
        add_line!(self, "#pragma clang optimize off");
        add_line!(self, "#elif defined(__GNUC__)");
        add_line!(self, "#pragma GCC optimize (\"O0\")");
        add_line!(self, "#endif");
        add_line!(self, "");
    }
}
fn add_stats(&mut self) {
2018-12-23 10:16:03 -08:00
let token_count = self
.parse_table
.symbols
.iter()
.filter(|symbol| {
2019-01-03 10:31:14 -08:00
if symbol.is_terminal() || symbol.is_eof() {
2018-12-23 10:16:03 -08:00
true
} else if symbol.is_external() {
self.syntax_grammar.external_tokens[symbol.index]
.corresponding_internal_token
.is_none()
} else {
false
}
2018-12-23 10:16:03 -08:00
})
.count();
add_line!(self, "#define LANGUAGE_VERSION {}", self.abi_version);
2018-12-23 10:16:03 -08:00
add_line!(
self,
"#define STATE_COUNT {}",
self.parse_table.states.len()
);
add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count);
2018-12-23 10:16:03 -08:00
add_line!(
self,
"#define SYMBOL_COUNT {}",
self.parse_table.symbols.len()
);
2024-02-04 01:30:33 -05:00
add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len());
add_line!(self, "#define TOKEN_COUNT {}", token_count);
2018-12-23 10:16:03 -08:00
add_line!(
self,
"#define EXTERNAL_TOKEN_COUNT {}",
self.syntax_grammar.external_tokens.len()
);
2019-02-07 12:29:20 -08:00
add_line!(self, "#define FIELD_COUNT {}", self.field_names.len());
2019-01-11 17:26:45 -08:00
add_line!(
self,
"#define MAX_ALIAS_SEQUENCE_LENGTH {}",
self.parse_table.max_aliased_production_length
2019-01-11 17:26:45 -08:00
);
add_line!(
self,
"#define PRODUCTION_ID_COUNT {}",
self.parse_table.production_infos.len()
);
add_line!(self, "");
}
fn add_symbol_enum(&mut self) {
add_line!(self, "enum ts_symbol_identifiers {{");
2018-12-23 10:16:03 -08:00
indent!(self);
self.symbol_order.insert(Symbol::end(), 0);
2018-12-23 10:16:03 -08:00
let mut i = 1;
2024-02-04 01:30:33 -05:00
for symbol in &self.parse_table.symbols {
2018-12-23 10:16:03 -08:00
if *symbol != Symbol::end() {
self.symbol_order.insert(*symbol, i);
2024-02-13 03:04:33 -05:00
add_line!(self, "{} = {},", self.symbol_ids[symbol], i);
2018-12-23 10:16:03 -08:00
i += 1;
}
}
for alias in &self.unique_aliases {
2024-02-13 03:04:33 -05:00
add_line!(self, "{} = {},", self.alias_ids[alias], i);
i += 1;
2018-12-23 10:16:03 -08:00
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_symbol_names_list(&mut self) {
add_line!(self, "static const char * const ts_symbol_names[] = {{");
2018-12-23 10:16:03 -08:00
indent!(self);
2024-02-04 01:30:33 -05:00
for symbol in &self.parse_table.symbols {
let name = self.sanitize_string(
self.default_aliases
.get(symbol)
2024-02-04 01:30:33 -05:00
.map_or(self.metadata_for_symbol(*symbol).0, |alias| {
alias.value.as_str()
}),
);
2024-02-13 03:04:33 -05:00
add_line!(self, "[{}] = \"{}\",", self.symbol_ids[symbol], name);
2018-12-23 10:16:03 -08:00
}
for alias in &self.unique_aliases {
add_line!(
self,
"[{}] = \"{}\",",
2024-02-13 03:04:33 -05:00
self.alias_ids[alias],
self.sanitize_string(&alias.value)
);
2018-12-23 10:16:03 -08:00
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
/// Writes the `ts_symbol_map` array, which maps every symbol identifier to the
/// identifier of its public-facing symbol (as computed in `symbol_map`).
/// Unique aliases map to themselves.
fn add_unique_symbol_map(&mut self) {
    add_line!(self, "static const TSSymbol ts_symbol_map[] = {{");
    indent!(self);
    for symbol in &self.parse_table.symbols {
        add_line!(
            self,
            "[{}] = {},",
            self.symbol_ids[symbol],
            self.symbol_ids[&self.symbol_map[symbol]]
        );
    }
    for alias in &self.unique_aliases {
        add_line!(
            self,
            "[{}] = {},",
            self.alias_ids[alias],
            self.alias_ids[alias]
        );
    }
    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
2019-02-07 12:29:20 -08:00
fn add_field_name_enum(&mut self) {
add_line!(self, "enum ts_field_identifiers {{");
2019-02-07 12:29:20 -08:00
indent!(self);
for (i, field_name) in self.field_names.iter().enumerate() {
add_line!(self, "{} = {},", self.field_id(field_name), i + 1);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_field_name_names_list(&mut self) {
add_line!(self, "static const char * const ts_field_names[] = {{");
2019-02-07 12:29:20 -08:00
indent!(self);
add_line!(self, "[0] = NULL,");
for field_name in &self.field_names {
add_line!(
self,
"[{}] = \"{}\",",
self.field_id(field_name),
field_name
);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
2018-12-23 10:16:03 -08:00
fn add_symbol_metadata_list(&mut self) {
add_line!(
self,
"static const TSSymbolMetadata ts_symbol_metadata[] = {{"
);
indent!(self);
for symbol in &self.parse_table.symbols {
2024-02-13 03:04:33 -05:00
add_line!(self, "[{}] = {{", self.symbol_ids[symbol]);
2018-12-23 10:16:03 -08:00
indent!(self);
if let Some(Alias { is_named, .. }) = self.default_aliases.get(symbol) {
2019-01-03 14:08:24 -08:00
add_line!(self, ".visible = true,");
add_line!(self, ".named = {},", is_named);
} else {
match self.metadata_for_symbol(*symbol).1 {
VariableType::Named => {
add_line!(self, ".visible = true,");
add_line!(self, ".named = true,");
}
VariableType::Anonymous => {
add_line!(self, ".visible = true,");
add_line!(self, ".named = false,");
}
VariableType::Hidden => {
add_line!(self, ".visible = false,");
add_line!(self, ".named = true,");
if self.syntax_grammar.supertype_symbols.contains(symbol) {
add_line!(self, ".supertype = true,");
}
2019-01-03 14:08:24 -08:00
}
VariableType::Auxiliary => {
add_line!(self, ".visible = false,");
add_line!(self, ".named = false,");
}
2018-12-23 10:16:03 -08:00
}
}
dedent!(self);
add_line!(self, "}},");
}
for alias in &self.unique_aliases {
2024-02-13 03:04:33 -05:00
add_line!(self, "[{}] = {{", self.alias_ids[alias]);
indent!(self);
add_line!(self, ".visible = true,");
add_line!(self, ".named = {},", alias.is_named);
dedent!(self);
add_line!(self, "}},");
2018-12-23 10:16:03 -08:00
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_alias_sequences(&mut self) {
2018-12-23 10:16:03 -08:00
add_line!(
self,
"static const TSSymbol ts_alias_sequences[PRODUCTION_ID_COUNT][MAX_ALIAS_SEQUENCE_LENGTH] = {{",
2018-12-23 10:16:03 -08:00
);
indent!(self);
2019-02-12 11:06:18 -08:00
for (i, production_info) in self.parse_table.production_infos.iter().enumerate() {
if production_info.alias_sequence.is_empty() {
// Work around MSVC's intolerance of empty array initializers by
// explicitly zero-initializing the first element.
if i == 0 {
add_line!(self, "[0] = {{0}},");
}
2019-02-07 12:29:20 -08:00
continue;
}
2018-12-23 10:16:03 -08:00
add_line!(self, "[{}] = {{", i);
indent!(self);
2019-02-12 11:06:18 -08:00
for (j, alias) in production_info.alias_sequence.iter().enumerate() {
if let Some(alias) = alias {
2024-02-13 03:04:33 -05:00
add_line!(self, "[{}] = {},", j, self.alias_ids[alias]);
2018-12-23 10:16:03 -08:00
}
}
dedent!(self);
add_line!(self, "}},");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
2019-02-07 12:29:20 -08:00
/// Writes `ts_non_terminal_alias_map`, a flat `uint16_t` array listing, for each
/// non-terminal that is aliased somewhere in the grammar, the identifiers it can
/// appear under.
///
/// Each record is: the symbol's identifier, the number of entries that follow,
/// the symbol's public identifier, and then its alias identifiers in sorted
/// order. The array is terminated by a single `0`.
fn add_non_terminal_alias_map(&mut self) {
    // Collect, per non-terminal, the sorted and de-duplicated alias identifiers used
    // for it in any production step. Aliases equal to the symbol's default alias are
    // excluded, as are symbols without an identifier.
    let mut alias_ids_by_symbol = HashMap::new();
    for variable in &self.syntax_grammar.variables {
        for production in &variable.productions {
            for step in &production.steps {
                if let Some(alias) = &step.alias {
                    if step.symbol.is_non_terminal()
                        && Some(alias) != self.default_aliases.get(&step.symbol)
                        && self.symbol_ids.contains_key(&step.symbol)
                    {
                        if let Some(alias_id) = self.alias_ids.get(alias) {
                            let alias_ids =
                                alias_ids_by_symbol.entry(step.symbol).or_insert(Vec::new());
                            if let Err(i) = alias_ids.binary_search(&alias_id) {
                                alias_ids.insert(i, alias_id);
                            }
                        }
                    }
                }
            }
        }
    }

    // Sort by symbol so that the output does not depend on hash-map iteration order.
    let mut alias_ids_by_symbol = alias_ids_by_symbol.iter().collect::<Vec<_>>();
    alias_ids_by_symbol.sort_unstable_by_key(|e| e.0);

    add_line!(
        self,
        "static const uint16_t ts_non_terminal_alias_map[] = {{"
    );
    indent!(self);
    for (symbol, alias_ids) in alias_ids_by_symbol {
        let symbol_id = &self.symbol_ids[symbol];
        let public_symbol_id = &self.symbol_ids[&self.symbol_map[symbol]];
        // The count includes the public symbol identifier written first.
        add_line!(self, "{symbol_id}, {},", 1 + alias_ids.len());
        indent!(self);
        add_line!(self, "{public_symbol_id},");
        for alias_id in alias_ids {
            add_line!(self, "{alias_id},");
        }
        dedent!(self);
    }
    add_line!(self, "0,");
    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
/// Produces a list of the "primary state" for every state in the grammar.
///
/// The "primary state" for a given state is the first encountered state that behaves
/// identically with respect to query analysis. We derive this by keeping track of the `core_id`
/// for each state and treating the first state with a given `core_id` as primary.
///
/// The result is written as the `ts_primary_state_ids` array, indexed by state.
fn add_primary_state_id_list(&mut self) {
    add_line!(
        self,
        "static const TSStateId ts_primary_state_ids[STATE_COUNT] = {{"
    );
    indent!(self);
    let mut first_state_for_each_core_id = HashMap::new();
    for (idx, state) in self.parse_table.states.iter().enumerate() {
        // `or_insert` only stores `idx` the first time a `core_id` is seen, so later
        // states with the same `core_id` resolve to that first state.
        let primary_state = first_state_for_each_core_id
            .entry(state.core_id)
            .or_insert(idx);
        add_line!(self, "[{idx}] = {primary_state},");
    }
    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
fn add_field_sequences(&mut self) {
let mut flat_field_maps = vec![];
let mut next_flat_field_map_index = 0;
self.get_field_map_id(
Vec::new(),
&mut flat_field_maps,
&mut next_flat_field_map_index,
2019-02-07 12:29:20 -08:00
);
let mut field_map_ids = Vec::new();
2019-02-12 11:06:18 -08:00
for production_info in &self.parse_table.production_infos {
2024-02-04 01:30:33 -05:00
if production_info.field_map.is_empty() {
field_map_ids.push((0, 0));
} else {
let mut flat_field_map = Vec::new();
2019-02-12 11:06:18 -08:00
for (field_name, locations) in &production_info.field_map {
for location in locations {
flat_field_map.push((field_name.clone(), *location));
}
}
field_map_ids.push((
self.get_field_map_id(
flat_field_map.clone(),
&mut flat_field_maps,
&mut next_flat_field_map_index,
),
flat_field_map.len(),
));
}
}
add_line!(
self,
"static const TSFieldMapSlice ts_field_map_slices[PRODUCTION_ID_COUNT] = {{",
);
2019-02-07 12:29:20 -08:00
indent!(self);
2019-02-12 11:06:18 -08:00
for (production_id, (row_id, length)) in field_map_ids.into_iter().enumerate() {
if length > 0 {
add_line!(
self,
2024-02-04 01:30:33 -05:00
"[{production_id}] = {{.index = {row_id}, .length = {length}}},",
);
2019-02-07 12:29:20 -08:00
}
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
2019-02-07 12:29:20 -08:00
add_line!(
self,
"static const TSFieldMapEntry ts_field_map_entries[] = {{",
);
indent!(self);
for (row_index, field_pairs) in flat_field_maps.into_iter().skip(1) {
2024-02-04 01:30:33 -05:00
add_line!(self, "[{row_index}] =");
2019-02-07 12:29:20 -08:00
indent!(self);
for (field_name, location) in field_pairs {
add_whitespace!(self);
add!(self, "{{{}, {}", self.field_id(&field_name), location.index);
if location.inherited {
add!(self, ", .inherited = true");
}
add!(self, "}},\n");
2019-02-07 12:29:20 -08:00
}
dedent!(self);
}
2019-02-07 12:29:20 -08:00
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
2018-12-23 10:16:03 -08:00
}
fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
2018-12-23 10:16:03 -08:00
add_line!(
self,
2024-02-04 01:30:33 -05:00
"static bool {name}(TSLexer *lexer, TSStateId state) {{",
2018-12-23 10:16:03 -08:00
);
indent!(self);
add_line!(self, "START_LEXER();");
2023-11-29 11:20:05 +02:00
add_line!(self, "eof = lexer->eof(lexer);");
2018-12-23 10:16:03 -08:00
add_line!(self, "switch (state) {{");
indent!(self);
2018-12-23 10:16:03 -08:00
for (i, state) in lex_table.states.into_iter().enumerate() {
2024-02-04 01:30:33 -05:00
add_line!(self, "case {i}:");
2018-12-23 10:16:03 -08:00
indent!(self);
self.add_lex_state(i, state);
2018-12-23 10:16:03 -08:00
dedent!(self);
}
add_line!(self, "default:");
indent!(self);
add_line!(self, "return false;");
dedent!(self);
dedent!(self);
add_line!(self, "}}");
2018-12-23 10:16:03 -08:00
dedent!(self);
add_line!(self, "}}");
add_line!(self, "");
}
fn add_lex_state(&mut self, _state_ix: usize, state: LexState) {
2018-12-23 10:16:03 -08:00
if let Some(accept_action) = state.accept_action {
2019-01-03 14:08:24 -08:00
add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
2018-12-23 10:16:03 -08:00
}
if let Some(eof_action) = state.eof_action {
add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
}
let mut chars_copy = CharacterSet::empty();
let mut large_set = CharacterSet::empty();
let mut ruled_out_chars = CharacterSet::empty();
// The transitions in a lex state are sorted with the single-character
// transitions first. If there are many single-character transitions,
// then implement them using an array of (lookahead character, state)
// pairs, instead of individual if statements, in order to reduce compile
// time.
let mut leading_simple_transition_count = 0;
let mut leading_simple_transition_range_count = 0;
for (chars, action) in &state.advance_actions {
if action.in_main_token
&& chars.ranges().all(|r| {
let start = *r.start() as u32;
let end = *r.end() as u32;
end <= start + 1 && end <= u16::MAX as u32
})
{
leading_simple_transition_count += 1;
leading_simple_transition_range_count += chars.range_count();
} else {
break;
}
}
if leading_simple_transition_range_count >= 8 {
add_line!(self, "ADVANCE_MAP(");
indent!(self);
for (chars, action) in &state.advance_actions[0..leading_simple_transition_count] {
for range in chars.ranges() {
add_whitespace!(self);
self.add_character(*range.start());
add!(self, ", {},\n", action.state);
if range.end() > range.start() {
add_whitespace!(self);
self.add_character(*range.end());
add!(self, ", {},\n", action.state);
}
}
ruled_out_chars = ruled_out_chars.add(chars);
}
dedent!(self);
add_line!(self, ");");
} else {
leading_simple_transition_count = 0;
}
for (chars, action) in &state.advance_actions[leading_simple_transition_count..] {
2019-01-02 12:34:40 -08:00
add_whitespace!(self);
// The lex state's advance actions are represented with disjoint
// sets of characters. When translating these disjoint sets into a
// sequence of checks, we don't need to re-check conditions that
// have already been checked due to previous transitions.
//
// Note that this simplification may result in an empty character set.
// That means that the transition is guaranteed (nothing further needs to
// be checked), not that this transition is impossible.
let simplified_chars = chars.simplify_ignoring(&ruled_out_chars);
// For large character sets, find the best matching character set from
// a pre-selected list of large character sets, which are based on the
// state transitions for invidual tokens. This transition may not exactly
// match one of the pre-selected character sets. In that case, determine
// the additional checks that need to be performed to match this transition.
let mut best_large_char_set: Option<(usize, CharacterSet, CharacterSet)> = None;
if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT {
for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
chars_copy.assign(&simplified_chars);
2024-04-11 16:29:59 -07:00
large_set.assign(set);
let intersection = chars_copy.remove_intersection(&mut large_set);
if !intersection.is_empty() {
let additions = chars_copy.simplify_ignoring(&ruled_out_chars);
let removals = large_set.simplify_ignoring(&ruled_out_chars);
let total_range_count = additions.range_count() + removals.range_count();
if total_range_count >= simplified_chars.range_count() {
continue;
}
if let Some((_, best_additions, best_removals)) = &best_large_char_set {
let best_range_count =
best_additions.range_count() + best_removals.range_count();
if best_range_count < total_range_count {
continue;
}
}
best_large_char_set = Some((ix, additions, removals));
}
}
}
// Add this transition's character set to the set of ruled out characters,
// which don't need to be checked for subsequent transitions in this state.
ruled_out_chars = ruled_out_chars.add(chars);
let mut large_char_set_ix = None;
let mut asserted_chars = simplified_chars;
let mut negated_chars = CharacterSet::empty();
if let Some((char_set_ix, additions, removals)) = best_large_char_set {
asserted_chars = additions;
negated_chars = removals;
large_char_set_ix = Some(char_set_ix);
}
let mut line_break = "\n".to_string();
for _ in 0..self.indent_level + 2 {
line_break.push_str(" ");
}
let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty();
let has_negative_condition = !negated_chars.is_empty();
let has_condition = has_positive_condition || has_negative_condition;
if has_condition {
add!(self, "if (");
if has_positive_condition && has_negative_condition {
add!(self, "(");
}
}
if let Some(large_char_set_ix) = large_char_set_ix {
let large_set = &self.large_character_sets[large_char_set_ix].1;
// If the character set contains the null character, check that we
// are not at the end of the file.
let check_eof = large_set.contains('\0');
if check_eof {
add!(self, "(!eof && ")
}
add!(
self,
"set_contains({}, {}, lookahead)",
&self.large_character_set_constant_names[large_char_set_ix],
large_set.range_count(),
);
if check_eof {
add!(self, ")");
}
2018-12-23 10:16:03 -08:00
}
if !asserted_chars.is_empty() {
if large_char_set_ix.is_some() {
add!(self, " ||{line_break}");
}
// If the character set contains the max character, than it probably
// corresponds to a negated character class in a regex, so it will be more
// concise and readable to express it in terms of negated ranges.
let is_included = !asserted_chars.contains(char::MAX);
if !is_included {
asserted_chars = asserted_chars.negate().add_char('\0');
}
self.add_character_range_conditions(&asserted_chars, is_included, &line_break);
}
if has_negative_condition {
if has_positive_condition {
add!(self, ") &&{line_break}");
}
self.add_character_range_conditions(&negated_chars, false, &line_break);
}
if has_condition {
add!(self, ") ");
}
self.add_advance_action(action);
add!(self, "\n");
2018-12-23 10:16:03 -08:00
}
add_line!(self, "END_STATE();");
}
fn add_character_range_conditions(
2018-12-23 10:16:03 -08:00
&mut self,
characters: &CharacterSet,
is_included: bool,
line_break: &str,
) {
for (i, range) in characters.ranges().enumerate() {
let start = *range.start();
let end = *range.end();
if is_included {
if i > 0 {
2024-02-04 01:30:33 -05:00
add!(self, " ||{line_break}");
}
if start == '\0' {
add!(self, "(!eof && ");
if end == '\0' {
add!(self, "lookahead == 0");
} else {
add!(self, "lookahead <= ");
}
self.add_character(end);
add!(self, ")");
continue;
} else if end == start {
add!(self, "lookahead == ");
self.add_character(start);
} else if end as u32 == start as u32 + 1 {
add!(self, "lookahead == ");
self.add_character(start);
2024-02-04 01:30:33 -05:00
add!(self, " ||{line_break}lookahead == ");
self.add_character(end);
2019-01-02 12:34:40 -08:00
} else {
add!(self, "(");
self.add_character(start);
add!(self, " <= lookahead && lookahead <= ");
self.add_character(end);
add!(self, ")");
2019-01-02 12:34:40 -08:00
}
} else {
if i > 0 {
2024-02-04 01:30:33 -05:00
add!(self, " &&{line_break}");
2019-01-02 12:34:40 -08:00
}
if end == start {
add!(self, "lookahead != ");
self.add_character(start);
} else if end as u32 == start as u32 + 1 {
add!(self, "lookahead != ");
self.add_character(start);
2024-02-04 01:30:33 -05:00
add!(self, " &&{line_break}lookahead != ");
self.add_character(end);
} else if start != '\0' {
2024-02-04 01:30:33 -05:00
add!(self, "(lookahead < ");
self.add_character(start);
2024-02-04 01:30:33 -05:00
add!(self, " || ");
self.add_character(end);
2024-02-04 01:30:33 -05:00
add!(self, " < lookahead)");
2019-01-02 12:34:40 -08:00
} else {
2024-02-04 01:30:33 -05:00
add!(self, "lookahead > ");
self.add_character(end);
2019-01-02 12:34:40 -08:00
}
}
}
}
/// Writes the large character set at index `ix` as a static `TSCharacterRange`
/// array, eight ranges per line, and records the array's name in
/// `large_character_set_constant_names`.
///
/// The name is `<symbol id>_character_set_<n>` when the set belongs to a symbol,
/// or `extras_character_set_<n>` otherwise, where `<n>` counts from 1 among the
/// sets with the same symbol up to and including this one.
fn add_character_set(&mut self, ix: usize) {
    // Cloned so that `self` can be mutably borrowed while the set is written out.
    let (symbol, characters) = self.large_character_sets[ix].clone();
    let count = self.large_character_sets[0..ix]
        .iter()
        .filter(|(sym, _)| *sym == symbol)
        .count()
        + 1;

    let constant_name = if let Some(symbol) = symbol {
        format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
    } else {
        format!("extras_character_set_{}", count)
    };
    add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
    self.large_character_set_constant_names.push(constant_name);

    indent!(self);
    for (range_ix, range) in characters.ranges().enumerate() {
        let column = range_ix % 8;
        if column == 0 {
            // Start a new output line, except before the very first range.
            if range_ix > 0 {
                add!(self, "\n");
            }
            add_whitespace!(self);
        } else {
            add!(self, " ");
        }
        add!(self, "{{");
        self.add_character(*range.start());
        add!(self, ", ");
        self.add_character(*range.end());
        add!(self, "}},");
    }
    add!(self, "\n");
    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
/// Writes the transition for `action`: `ADVANCE(<state>);` when the action is
/// part of the main token, `SKIP(<state>);` otherwise. No newline is written.
fn add_advance_action(&mut self, action: &AdvanceAction) {
    let macro_name = if action.in_main_token {
        "ADVANCE"
    } else {
        "SKIP"
    };
    add!(self, "{}({});", macro_name, action.state);
}
2018-12-23 10:16:03 -08:00
fn add_lex_modes_list(&mut self) {
add_line!(
self,
"static const TSLexMode ts_lex_modes[STATE_COUNT] = {{"
);
2018-12-23 10:16:03 -08:00
indent!(self);
for (i, state) in self.parse_table.states.iter().enumerate() {
if state.is_end_of_non_terminal_extra() {
2024-02-04 01:30:33 -05:00
add_line!(self, "[{i}] = {{(TSStateId)(-1)}},");
2019-10-21 13:31:49 -07:00
} else if state.external_lex_state_id > 0 {
2018-12-23 10:16:03 -08:00
add_line!(
self,
2024-02-04 01:30:33 -05:00
"[{i}] = {{.lex_state = {}, .external_lex_state = {}}},",
2018-12-23 10:16:03 -08:00
state.lex_state_id,
state.external_lex_state_id
2018-12-23 10:16:03 -08:00
);
} else {
2024-02-04 01:30:33 -05:00
add_line!(self, "[{i}] = {{.lex_state = {}}},", state.lex_state_id);
2018-12-23 10:16:03 -08:00
}
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_external_token_enum(&mut self) {
add_line!(self, "enum ts_external_scanner_symbol_identifiers {{");
2018-12-23 10:16:03 -08:00
indent!(self);
for i in 0..self.syntax_grammar.external_tokens.len() {
add_line!(
self,
"{} = {},",
self.external_token_id(&self.syntax_grammar.external_tokens[i]),
i
);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_external_scanner_symbol_map(&mut self) {
add_line!(
self,
"static const TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {{"
2018-12-23 10:16:03 -08:00
);
indent!(self);
for i in 0..self.syntax_grammar.external_tokens.len() {
let token = &self.syntax_grammar.external_tokens[i];
2019-01-17 17:16:04 -08:00
let id_token = token
.corresponding_internal_token
2024-02-04 01:30:33 -05:00
.unwrap_or_else(|| Symbol::external(i));
2018-12-23 10:16:03 -08:00
add_line!(
self,
"[{}] = {},",
2024-02-04 01:30:33 -05:00
self.external_token_id(token),
self.symbol_ids[&id_token],
2018-12-23 10:16:03 -08:00
);
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
/// Writes `ts_external_scanner_states`, a boolean table with one row per external
/// lex state, marking the external tokens that belong to that state.
///
/// Empty external lex states are left out of the initializer.
fn add_external_scanner_states_list(&mut self) {
    add_line!(
        self,
        "static const bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{",
        self.parse_table.external_lex_states.len(),
    );
    indent!(self);
    for (i, lex_state) in self.parse_table.external_lex_states.iter().enumerate() {
        if !lex_state.is_empty() {
            add_line!(self, "[{}] = {{", i);
            indent!(self);
            for token in lex_state.iter() {
                add_line!(
                    self,
                    "[{}] = true,",
                    self.external_token_id(&self.syntax_grammar.external_tokens[token.index])
                );
            }
            dedent!(self);
            add_line!(self, "}},");
        }
    }
    dedent!(self);
    add_line!(self, "}};");
    add_line!(self, "");
}
fn add_parse_table(&mut self) {
let mut parse_table_entries = HashMap::new();
2018-12-23 10:16:03 -08:00
let mut next_parse_action_list_index = 0;
self.get_parse_action_list_id(
&ParseTableEntry {
actions: Vec::new(),
reusable: false,
},
&mut parse_table_entries,
&mut next_parse_action_list_index,
);
add_line!(
self,
"static const uint16_t ts_parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = {{",
2018-12-23 10:16:03 -08:00
);
indent!(self);
let mut terminal_entries = Vec::new();
let mut nonterminal_entries = Vec::new();
for (i, state) in self
.parse_table
.states
.iter()
.enumerate()
.take(self.large_state_count)
{
add_line!(self, "[{}] = {{", i);
indent!(self);
// Ensure the entries are in a deterministic order, since they are
// internally represented as a hash map.
terminal_entries.clear();
nonterminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
nonterminal_entries.extend(state.nonterminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
nonterminal_entries.sort_unstable_by_key(|k| k.0);
2019-10-21 13:31:49 -07:00
for (symbol, action) in &nonterminal_entries {
add_line!(
self,
"[{}] = STATE({}),",
self.symbol_ids[symbol],
2019-10-21 13:31:49 -07:00
match action {
GotoAction::Goto(state) => *state,
GotoAction::ShiftExtra => i,
}
);
2018-12-23 10:16:03 -08:00
}
for (symbol, entry) in &terminal_entries {
2018-12-23 10:16:03 -08:00
let entry_id = self.get_parse_action_list_id(
entry,
&mut parse_table_entries,
&mut next_parse_action_list_index,
);
2024-02-04 01:30:33 -05:00
add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]);
2018-12-23 10:16:03 -08:00
}
dedent!(self);
add_line!(self, "}},");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
2019-05-16 17:19:44 -07:00
if self.large_state_count < self.parse_table.states.len() {
add_line!(self, "static const uint16_t ts_small_parse_table[] = {{");
indent!(self);
let mut index = 0;
let mut small_state_indices = Vec::new();
let mut symbols_by_value = HashMap::<(usize, SymbolType), Vec<Symbol>>::new();
2019-05-16 17:19:44 -07:00
for state in self.parse_table.states.iter().skip(self.large_state_count) {
small_state_indices.push(index);
symbols_by_value.clear();
2019-05-16 17:19:44 -07:00
terminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
// In a given parse state, many lookahead symbols have the same actions.
// So in the "small state" representation, group symbols by their action
// in order to avoid repeating the action.
2019-05-16 17:19:44 -07:00
for (symbol, entry) in &terminal_entries {
let entry_id = self.get_parse_action_list_id(
entry,
&mut parse_table_entries,
&mut next_parse_action_list_index,
);
symbols_by_value
.entry((entry_id, SymbolType::Terminal))
.or_default()
.push(**symbol);
}
2019-10-21 13:31:49 -07:00
for (symbol, action) in &state.nonterminal_entries {
let state_id = match action {
GotoAction::Goto(i) => *i,
GotoAction::ShiftExtra => {
self.large_state_count + small_state_indices.len() - 1
}
};
symbols_by_value
2019-10-21 13:31:49 -07:00
.entry((state_id, SymbolType::NonTerminal))
.or_default()
.push(*symbol);
2019-05-16 17:19:44 -07:00
}
let mut values_with_symbols = symbols_by_value.drain().collect::<Vec<_>>();
values_with_symbols.sort_unstable_by_key(|((value, kind), symbols)| {
(symbols.len(), *kind, *value, symbols[0])
});
2024-02-04 01:30:33 -05:00
add_line!(self, "[{index}] = {},", values_with_symbols.len());
indent!(self);
2024-02-04 01:30:33 -05:00
for ((value, kind), symbols) in &mut values_with_symbols {
if *kind == SymbolType::NonTerminal {
2024-02-04 01:30:33 -05:00
add_line!(self, "STATE({value}), {},", symbols.len());
} else {
2024-02-04 01:30:33 -05:00
add_line!(self, "ACTIONS({value}), {},", symbols.len());
}
symbols.sort_unstable();
indent!(self);
for symbol in symbols {
add_line!(self, "{},", self.symbol_ids[symbol]);
}
dedent!(self);
2019-05-16 17:19:44 -07:00
}
2019-05-16 17:19:44 -07:00
dedent!(self);
index += 1 + values_with_symbols
.iter()
.map(|(_, symbols)| 2 + symbols.len())
.sum::<usize>();
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
add_line!(
self,
"static const uint32_t ts_small_parse_table_map[] = {{"
);
indent!(self);
for i in self.large_state_count..self.parse_table.states.len() {
add_line!(
self,
2024-02-04 01:30:33 -05:00
"[SMALL_STATE({i})] = {},",
small_state_indices[i - self.large_state_count]
);
}
dedent!(self);
2019-05-16 17:19:44 -07:00
add_line!(self, "}};");
add_line!(self, "");
}
2024-02-04 01:30:33 -05:00
let mut parse_table_entries = parse_table_entries
.into_iter()
.map(|(entry, i)| (i, entry))
2024-02-04 01:30:33 -05:00
.collect::<Vec<_>>();
parse_table_entries.sort_by_key(|(index, _)| *index);
2018-12-23 10:16:03 -08:00
self.add_parse_action_list(parse_table_entries);
}
fn add_parse_action_list(&mut self, parse_table_entries: Vec<(usize, ParseTableEntry)>) {
add_line!(
self,
"static const TSParseActionEntry ts_parse_actions[] = {{"
);
2018-12-23 10:16:03 -08:00
indent!(self);
for (i, entry) in parse_table_entries {
add!(
self,
2024-02-04 01:30:33 -05:00
" [{i}] = {{.entry = {{.count = {}, .reusable = {}}}}},",
2018-12-23 10:16:03 -08:00
entry.actions.len(),
entry.reusable
);
for action in entry.actions {
add!(self, " ");
match action {
ParseAction::Accept => add!(self, " ACCEPT_INPUT()"),
ParseAction::Recover => add!(self, "RECOVER()"),
ParseAction::ShiftExtra => add!(self, "SHIFT_EXTRA()"),
ParseAction::Shift {
state,
is_repetition,
} => {
if is_repetition {
2024-02-04 01:30:33 -05:00
add!(self, "SHIFT_REPEAT({state})");
2018-12-23 10:16:03 -08:00
} else {
2024-02-04 01:30:33 -05:00
add!(self, "SHIFT({state})");
2018-12-23 10:16:03 -08:00
}
}
ParseAction::Reduce {
symbol,
child_count,
dynamic_precedence,
2019-02-12 11:06:18 -08:00
production_id,
2018-12-23 10:16:03 -08:00
..
} => {
2024-03-28 14:27:44 +02:00
add!(
self,
"REDUCE({}, {child_count}, {dynamic_precedence}, {production_id})",
2024-03-28 14:27:44 +02:00
self.symbol_ids[&symbol]
);
2018-12-23 10:16:03 -08:00
}
}
2024-02-04 01:30:33 -05:00
add!(self, ",");
2018-12-23 10:16:03 -08:00
}
add!(self, "\n");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_parser_export(&mut self) {
let language_function_name = format!("tree_sitter_{}", self.language_name);
2024-02-04 01:30:33 -05:00
let external_scanner_name = format!("{language_function_name}_external_scanner");
2018-12-23 10:16:03 -08:00
Add a language linkage declaration to parsers Previously, in order to compile a `tree-sitter` grammar that contained c++ source in the parser (ie the `scanner.cc` file), you would have to compile the `parser.c` file separately from the c++ files. For example, in rust this would result in a `build.rs` close to the following: ``` extern crate cc; fn main() { let dir: PathBuf = ["tree-sitter-ruby", "src"].iter().collect(); cc::Build::new() .include(&dir) .cpp(true) .file(dir.join("scanner.cc")) // NOTE: must have a name that differs from the c static lib .compile("tree-sitter-ruby-scanner"); cc::Build::new() .include(&dir) .file(dir.join("parser.c")) // NOTE: must have a name that differs from the c++ static lib .compile("tree-sitter-ruby-parser"); } ``` This was necessary at the time for the following grammars: `ruby`, `php`, `python`, `embedded-template`, `html`, `cpp`, `ocaml`, `bash`, `agda`, and `haskell`. To solve this, we specify an `extern "C"` language linkage declaration to the functions that must be linked against to compile a parser with the scanner, making parsers linkable against c++ source. On all major compilers (gcc, clang, and msvc) this should be the only change needed due to the combination of clang and gcc both supporting designated initialization for years and msvc 2019 adopting designated initializers as a part of the C++20 conformance push. Subsequently, for rust projects, the necessary `build.rs` would become (which also brings these parsers into sync with the current docs): ``` extern crate cc; fn main() { let dir: PathBuf = ["tree-sitter-ruby", "src"].iter().collect(); cc::Build::new() .include(&dir) .cpp(true) .file(dir.join("scanner.cc")) .file(dir.join("parser.c")) .compile("tree-sitter-ruby"); } ```
2020-02-18 19:18:19 -08:00
add_line!(self, "#ifdef __cplusplus");
add_line!(self, r#"extern "C" {{"#);
add_line!(self, "#endif");
2018-12-23 10:16:03 -08:00
if !self.syntax_grammar.external_tokens.is_empty() {
2024-02-04 01:30:33 -05:00
add_line!(self, "void *{external_scanner_name}_create(void);");
add_line!(self, "void {external_scanner_name}_destroy(void *);");
2018-12-23 10:16:03 -08:00
add_line!(
self,
2024-02-04 01:30:33 -05:00
"bool {external_scanner_name}_scan(void *, TSLexer *, const bool *);",
2018-12-23 10:16:03 -08:00
);
add_line!(
self,
2024-02-04 01:30:33 -05:00
"unsigned {external_scanner_name}_serialize(void *, char *);",
2018-12-23 10:16:03 -08:00
);
add_line!(
self,
2024-02-04 01:30:33 -05:00
"void {external_scanner_name}_deserialize(void *, const char *, unsigned);",
2018-12-23 10:16:03 -08:00
);
add_line!(self, "");
}
2024-03-14 11:01:11 +02:00
add_line!(self, "#ifdef TREE_SITTER_HIDE_SYMBOLS");
add_line!(self, "#define TS_PUBLIC");
add_line!(self, "#elif defined(_WIN32)");
add_line!(self, "#define TS_PUBLIC __declspec(dllexport)");
add_line!(self, "#else");
add_line!(
self,
"#define TS_PUBLIC __attribute__((visibility(\"default\")))"
);
2018-12-23 10:16:03 -08:00
add_line!(self, "#endif");
add_line!(self, "");
add_line!(
self,
2024-03-18 21:11:39 +02:00
"TS_PUBLIC const TSLanguage *{language_function_name}(void) {{",
2018-12-23 10:16:03 -08:00
);
indent!(self);
add_line!(self, "static const TSLanguage language = {{");
2018-12-23 10:16:03 -08:00
indent!(self);
add_line!(self, ".version = LANGUAGE_VERSION,");
// Quantities
2018-12-23 10:16:03 -08:00
add_line!(self, ".symbol_count = SYMBOL_COUNT,");
add_line!(self, ".alias_count = ALIAS_COUNT,");
add_line!(self, ".token_count = TOKEN_COUNT,");
add_line!(self, ".external_token_count = EXTERNAL_TOKEN_COUNT,");
add_line!(self, ".state_count = STATE_COUNT,");
add_line!(self, ".large_state_count = LARGE_STATE_COUNT,");
add_line!(self, ".production_id_count = PRODUCTION_ID_COUNT,");
add_line!(self, ".field_count = FIELD_COUNT,");
add_line!(
self,
".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH,"
);
// Parse table
add_line!(self, ".parse_table = &ts_parse_table[0][0],");
if self.large_state_count < self.parse_table.states.len() {
add_line!(self, ".small_parse_table = ts_small_parse_table,");
add_line!(self, ".small_parse_table_map = ts_small_parse_table_map,");
}
2018-12-23 10:16:03 -08:00
add_line!(self, ".parse_actions = ts_parse_actions,");
// Metadata
add_line!(self, ".symbol_names = ts_symbol_names,");
if !self.field_names.is_empty() {
add_line!(self, ".field_names = ts_field_names,");
add_line!(self, ".field_map_slices = ts_field_map_slices,");
add_line!(self, ".field_map_entries = ts_field_map_entries,");
}
add_line!(self, ".symbol_metadata = ts_symbol_metadata,");
add_line!(self, ".public_symbol_map = ts_symbol_map,");
add_line!(self, ".alias_map = ts_non_terminal_alias_map,");
if !self.parse_table.production_infos.is_empty() {
add_line!(self, ".alias_sequences = &ts_alias_sequences[0][0],");
}
// Lexing
add_line!(self, ".lex_modes = ts_lex_modes,");
add_line!(self, ".lex_fn = ts_lex,");
2018-12-23 10:16:03 -08:00
if let Some(keyword_capture_token) = self.keyword_capture_token {
add_line!(self, ".keyword_lex_fn = ts_lex_keywords,");
add_line!(
self,
".keyword_capture_token = {},",
self.symbol_ids[&keyword_capture_token]
);
}
if !self.syntax_grammar.external_tokens.is_empty() {
add_line!(self, ".external_scanner = {{");
indent!(self);
add_line!(self, "&ts_external_scanner_states[0][0],");
2018-12-23 10:16:03 -08:00
add_line!(self, "ts_external_scanner_symbol_map,");
2024-02-04 01:30:33 -05:00
add_line!(self, "{external_scanner_name}_create,");
add_line!(self, "{external_scanner_name}_destroy,");
add_line!(self, "{external_scanner_name}_scan,");
add_line!(self, "{external_scanner_name}_serialize,");
add_line!(self, "{external_scanner_name}_deserialize,");
2018-12-23 10:16:03 -08:00
dedent!(self);
add_line!(self, "}},");
}
if self.abi_version >= ABI_VERSION_WITH_PRIMARY_STATES {
add_line!(self, ".primary_state_ids = ts_primary_state_ids,");
}
dedent!(self);
2018-12-23 10:16:03 -08:00
add_line!(self, "}};");
add_line!(self, "return &language;");
dedent!(self);
add_line!(self, "}}");
Add a language linkage declaration to parsers Previously, in order to compile a `tree-sitter` grammar that contained c++ source in the parser (ie the `scanner.cc` file), you would have to compile the `parser.c` file separately from the c++ files. For example, in rust this would result in a `build.rs` close to the following: ``` extern crate cc; fn main() { let dir: PathBuf = ["tree-sitter-ruby", "src"].iter().collect(); cc::Build::new() .include(&dir) .cpp(true) .file(dir.join("scanner.cc")) // NOTE: must have a name that differs from the c static lib .compile("tree-sitter-ruby-scanner"); cc::Build::new() .include(&dir) .file(dir.join("parser.c")) // NOTE: must have a name that differs from the c++ static lib .compile("tree-sitter-ruby-parser"); } ``` This was necessary at the time for the following grammars: `ruby`, `php`, `python`, `embedded-template`, `html`, `cpp`, `ocaml`, `bash`, `agda`, and `haskell`. To solve this, we specify an `extern "C"` language linkage declaration to the functions that must be linked against to compile a parser with the scanner, making parsers linkable against c++ source. On all major compilers (gcc, clang, and msvc) this should be the only change needed due to the combination of clang and gcc both supporting designated initialization for years and msvc 2019 adopting designated initializers as a part of the C++20 conformance push. Subsequently, for rust projects, the necessary `build.rs` would become (which also brings these parsers into sync with the current docs): ``` extern crate cc; fn main() { let dir: PathBuf = ["tree-sitter-ruby", "src"].iter().collect(); cc::Build::new() .include(&dir) .cpp(true) .file(dir.join("scanner.cc")) .file(dir.join("parser.c")) .compile("tree-sitter-ruby"); } ```
2020-02-18 19:18:19 -08:00
add_line!(self, "#ifdef __cplusplus");
add_line!(self, "}}");
add_line!(self, "#endif");
2018-12-23 10:16:03 -08:00
}
fn get_parse_action_list_id(
&self,
entry: &ParseTableEntry,
parse_table_entries: &mut HashMap<ParseTableEntry, usize>,
2018-12-23 10:16:03 -08:00
next_parse_action_list_index: &mut usize,
) -> usize {
if let Some(&index) = parse_table_entries.get(entry) {
index
} else {
let result = *next_parse_action_list_index;
parse_table_entries.insert(entry.clone(), result);
*next_parse_action_list_index += 1 + entry.actions.len();
result
2018-12-23 10:16:03 -08:00
}
}
/// Returns the starting index of `flat_field_map` within the flattened field map
/// list, reusing the index of an identical map that was registered earlier.
///
/// A map that is not yet present is appended to `flat_field_maps` at
/// `next_flat_field_map_index`, which is then advanced by the map's length.
fn get_field_map_id(
    &self,
    flat_field_map: Vec<(String, FieldLocation)>,
    flat_field_maps: &mut Vec<(usize, Vec<(String, FieldLocation)>)>,
    next_flat_field_map_index: &mut usize,
) -> usize {
    for (known_index, known_map) in flat_field_maps.iter() {
        if *known_map == flat_field_map {
            return *known_index;
        }
    }

    let start_index = *next_flat_field_map_index;
    *next_flat_field_map_index = start_index + flat_field_map.len();
    flat_field_maps.push((start_index, flat_field_map));
    start_index
}
2018-12-23 10:16:03 -08:00
/// Builds the C identifier for an external token: `ts_external_token_` followed by
/// the sanitized token name.
fn external_token_id(&self, token: &ExternalToken) -> String {
    let sanitized_name = self.sanitize_identifier(&token.name);
    format!("ts_external_token_{}", sanitized_name)
}
fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet<String>) {
let mut id;
if symbol == Symbol::end() {
id = "ts_builtin_sym_end".to_string();
} else {
let (name, kind) = self.metadata_for_symbol(symbol);
id = match kind {
2018-12-23 10:16:03 -08:00
VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)),
VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)),
VariableType::Hidden | VariableType::Named => {
2018-12-23 10:16:03 -08:00
format!("sym_{}", self.sanitize_identifier(name))
}
};
let mut suffix_number = 1;
let mut suffix = String::new();
2018-12-23 10:16:03 -08:00
while used_identifiers.contains(&id) {
id.drain(id.len() - suffix.len()..);
suffix_number += 1;
suffix = suffix_number.to_string();
id += &suffix;
}
}
2018-12-23 10:16:03 -08:00
used_identifiers.insert(id.clone());
self.symbol_ids.insert(symbol, id);
}
2024-02-04 01:30:33 -05:00
fn field_id(&self, field_name: &str) -> String {
format!("field_{field_name}")
2019-02-07 12:29:20 -08:00
}
/// Looks up the name and variable type of `symbol` in the grammar it belongs to.
///
/// Both end symbol kinds report the name `"end"` with a hidden variable type.
fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
    let index = symbol.index;
    match symbol.kind {
        SymbolType::External => {
            let token = &self.syntax_grammar.external_tokens[index];
            (token.name.as_str(), token.kind)
        }
        SymbolType::Terminal => {
            let variable = &self.lexical_grammar.variables[index];
            (variable.name.as_str(), variable.kind)
        }
        SymbolType::NonTerminal => {
            let variable = &self.syntax_grammar.variables[index];
            (variable.name.as_str(), variable.kind)
        }
        SymbolType::End | SymbolType::EndOfNonTerminalExtra => ("end", VariableType::Hidden),
    }
}
2018-12-23 10:16:03 -08:00
fn sanitize_identifier(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
if c.is_ascii_alphanumeric() || c == '_' {
2018-12-23 10:16:03 -08:00
result.push(c);
} else {
'special_chars: {
let replacement = match c {
' ' if name.len() == 1 => "SPACE",
'~' => "TILDE",
'`' => "BQUOTE",
'!' => "BANG",
'@' => "AT",
'#' => "POUND",
'$' => "DOLLAR",
'%' => "PERCENT",
'^' => "CARET",
'&' => "AMP",
'*' => "STAR",
'(' => "LPAREN",
')' => "RPAREN",
'-' => "DASH",
'+' => "PLUS",
'=' => "EQ",
'{' => "LBRACE",
'}' => "RBRACE",
'[' => "LBRACK",
']' => "RBRACK",
'\\' => "BSLASH",
'|' => "PIPE",
':' => "COLON",
';' => "SEMI",
'"' => "DQUOTE",
'\'' => "SQUOTE",
'<' => "LT",
'>' => "GT",
',' => "COMMA",
'.' => "DOT",
'?' => "QMARK",
'/' => "SLASH",
'\n' => "LF",
'\r' => "CR",
'\t' => "TAB",
'\0' => "NULL",
'\u{0001}' => "SOH",
'\u{0002}' => "STX",
'\u{0003}' => "ETX",
'\u{0004}' => "EOT",
'\u{0005}' => "ENQ",
'\u{0006}' => "ACK",
'\u{0007}' => "BEL",
'\u{0008}' => "BS",
'\u{000b}' => "VTAB",
'\u{000c}' => "FF",
'\u{000e}' => "SO",
'\u{000f}' => "SI",
'\u{0010}' => "DLE",
'\u{0011}' => "DC1",
'\u{0012}' => "DC2",
'\u{0013}' => "DC3",
'\u{0014}' => "DC4",
'\u{0015}' => "NAK",
'\u{0016}' => "SYN",
'\u{0017}' => "ETB",
'\u{0018}' => "CAN",
'\u{0019}' => "EM",
'\u{001a}' => "SUB",
'\u{001b}' => "ESC",
'\u{001c}' => "FS",
'\u{001d}' => "GS",
'\u{001e}' => "RS",
'\u{001f}' => "US",
'\u{007F}' => "DEL",
'\u{FEFF}' => "BOM",
'\u{0080}'..='\u{FFFF}' => {
result.push_str(&format!("u{:04x}", c as u32));
break 'special_chars;
}
'\u{10000}'..='\u{10FFFF}' => {
result.push_str(&format!("U{:08x}", c as u32));
break 'special_chars;
}
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => unreachable!(),
' ' => break 'special_chars,
};
2024-02-04 01:30:33 -05:00
if !result.is_empty() && !result.ends_with('_') {
result.push('_');
}
result += replacement;
2018-12-23 10:16:03 -08:00
}
}
}
result
}
2018-12-23 10:16:03 -08:00
fn sanitize_string(&self, name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
match c {
'\"' => result += "\\\"",
'?' => result += "\\?",
'\\' => result += "\\\\",
'\u{0007}' => result += "\\a",
'\u{0008}' => result += "\\b",
'\u{000b}' => result += "\\v",
'\u{000c}' => result += "\\f",
'\n' => result += "\\n",
'\r' => result += "\\r",
'\t' => result += "\\t",
'\0' => result += "\\0",
'\u{0001}'..='\u{001f}' => result += &format!("\\x{:02x}", c as u32),
'\u{007F}'..='\u{FFFF}' => result += &format!("\\u{:04x}", c as u32),
'\u{10000}'..='\u{10FFFF}' => {
result.push_str(&format!("\\U{:08x}", c as u32));
}
_ => result.push(c),
2018-12-23 10:16:03 -08:00
}
}
result
}
2019-01-02 12:34:40 -08:00
/// Appends `c` to the output buffer as a C expression.
///
/// Quote, backslash, form feed, newline, tab, and carriage return use their escaped
/// character literals. NUL is written as `0`. A space or any graphic ASCII character
/// is written as a plain character literal. Everything else is written as a hex
/// number.
fn add_character(&mut self, c: char) {
    match c {
        '\'' => add!(self, "'\\''"),
        '\\' => add!(self, "'\\\\'"),
        '\u{000c}' => add!(self, "'\\f'"),
        '\n' => add!(self, "'\\n'"),
        '\t' => add!(self, "'\\t'"),
        '\r' => add!(self, "'\\r'"),
        '\0' => add!(self, "0"),
        _ if c == ' ' || c.is_ascii_graphic() => add!(self, "'{c}'"),
        _ => add!(self, "0x{:02x}", c as u32),
    }
}
}
2018-12-05 12:50:12 -08:00
/// Returns a String of C code for the given components of a parser.
///
/// # Arguments
///
/// * `name` - A string slice containing the name of the language
/// * `parse_table` - The generated parse table for the language
/// * `main_lex_table` - The generated lexing table for the language
/// * `keyword_lex_table` - The generated keyword lexing table for the language
2024-04-09 21:40:17 -04:00
/// * `keyword_capture_token` - A symbol indicating which token is used for keyword capture, if any.
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
2024-04-09 21:40:17 -04:00
/// * `default_aliases` - A map describing the global rename rules that should apply. the keys are
/// symbols that are *always* aliased in the same way, and the values are the aliases that are
/// applied to those symbols.
/// * `abi_version` - The language ABI version that should be generated. Usually you want
/// Tree-sitter's current version, but right after making an ABI change, it may be useful to
/// generate code with the previous ABI.
2024-02-04 01:30:33 -05:00
#[allow(clippy::too_many_arguments)]
pub fn render_c_code(
2018-12-05 12:50:12 -08:00
name: &str,
tables: Tables,
2018-12-05 12:50:12 -08:00
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
abi_version: usize,
2018-12-05 12:50:12 -08:00
) -> String {
2024-02-04 01:30:33 -05:00
assert!(
(ABI_VERSION_MIN..=ABI_VERSION_MAX).contains(&abi_version),
"This version of Tree-sitter can only generate parsers with ABI version {ABI_VERSION_MIN} - {ABI_VERSION_MAX}, not {abi_version}",
);
Generator {
buffer: String::new(),
indent_level: 0,
language_name: name.to_string(),
large_state_count: 0,
parse_table: tables.parse_table,
main_lex_table: tables.main_lex_table,
keyword_lex_table: tables.keyword_lex_table,
keyword_capture_token: tables.word_token,
large_character_sets: tables.large_character_sets,
large_character_set_constant_names: Vec::new(),
syntax_grammar,
lexical_grammar,
default_aliases,
symbol_ids: HashMap::new(),
symbol_order: HashMap::new(),
2018-12-23 10:16:03 -08:00
alias_ids: HashMap::new(),
symbol_map: HashMap::new(),
unique_aliases: Vec::new(),
2019-02-07 12:29:20 -08:00
field_names: Vec::new(),
abi_version,
}
.generate()
2018-12-05 12:50:12 -08:00
}