feat: add 'reserved word' construct

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
Max Brunsfeld 2024-12-23 00:06:32 -08:00 committed by GitHub
parent 2a63077cac
commit 201b41cf11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 2367 additions and 1628 deletions

View file

@ -9,7 +9,7 @@ use super::{
build_tables::Tables,
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
nfa::CharacterSet,
rules::{Alias, AliasMap, Symbol, SymbolType},
rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet},
tables::{
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
ParseTableEntry,
@ -19,7 +19,7 @@ use super::{
const SMALL_STATE_THRESHOLD: usize = 64;
const ABI_VERSION_MIN: usize = 14;
const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
const ABI_VERSION_WITH_METADATA: usize = 15;
const ABI_VERSION_WITH_RESERVED_WORDS: usize = 15;
const BUILD_VERSION: &str = env!("CARGO_PKG_VERSION");
const BUILD_SHA: Option<&'static str> = option_env!("BUILD_SHA");
@ -58,6 +58,7 @@ macro_rules! dedent {
};
}
#[derive(Default)]
struct Generator {
buffer: String,
indent_level: usize,
@ -68,7 +69,6 @@ struct Generator {
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
large_character_set_info: Vec<LargeCharacterSetInfo>,
large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
@ -77,6 +77,8 @@ struct Generator {
alias_ids: HashMap<Alias, String>,
unique_aliases: Vec<Alias>,
symbol_map: HashMap<Symbol, Symbol>,
reserved_word_sets: Vec<TokenSet>,
reserved_word_set_ids_by_parse_state: Vec<usize>,
field_names: Vec<String>,
#[allow(unused)]
@ -119,7 +121,7 @@ impl Generator {
swap(&mut main_lex_table, &mut self.main_lex_table);
self.add_lex_function("ts_lex", main_lex_table);
if self.keyword_capture_token.is_some() {
if self.syntax_grammar.word_token.is_some() {
let mut keyword_lex_table = LexTable::default();
swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
self.add_lex_function("ts_lex_keywords", keyword_lex_table);
@ -135,7 +137,13 @@ impl Generator {
}
self.buffer.push_str(&lex_functions);
self.add_lex_modes_list();
self.add_lex_modes();
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS && self.reserved_word_sets.len() > 1
{
self.add_reserved_word_sets();
}
self.add_parse_table();
if !self.syntax_grammar.external_tokens.is_empty() {
@ -266,6 +274,22 @@ impl Generator {
});
}
// Assign an id to each unique reserved word set
self.reserved_word_sets.push(TokenSet::new());
for state in &self.parse_table.states {
let id = if let Some(ix) = self
.reserved_word_sets
.iter()
.position(|set| *set == state.reserved_words)
{
ix
} else {
self.reserved_word_sets.push(state.reserved_words.clone());
self.reserved_word_sets.len() - 1
};
self.reserved_word_set_ids_by_parse_state.push(id);
}
// Determine which states should use the "small state" representation, and which should
// use the normal array representation.
let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2);
@ -365,6 +389,16 @@ impl Generator {
"#define MAX_ALIAS_SEQUENCE_LENGTH {}",
self.parse_table.max_aliased_production_length
);
add_line!(
self,
"#define MAX_RESERVED_WORD_SET_SIZE {}",
self.reserved_word_sets
.iter()
.map(TokenSet::len)
.max()
.unwrap()
);
add_line!(
self,
"#define PRODUCTION_ID_COUNT {}",
@ -1016,25 +1050,66 @@ impl Generator {
}
}
fn add_lex_modes_list(&mut self) {
fn add_lex_modes(&mut self) {
add_line!(
self,
"static const TSLexMode ts_lex_modes[STATE_COUNT] = {{"
"static const {} ts_lex_modes[STATE_COUNT] = {{",
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
"TSLexerMode"
} else {
"TSLexMode"
}
);
indent!(self);
for (i, state) in self.parse_table.states.iter().enumerate() {
add_whitespace!(self);
add!(self, "[{}] = {{", i);
if state.is_end_of_non_terminal_extra() {
add_line!(self, "[{i}] = {{(TSStateId)(-1)}},");
} else if state.external_lex_state_id > 0 {
add_line!(
self,
"[{i}] = {{.lex_state = {}, .external_lex_state = {}}},",
state.lex_state_id,
state.external_lex_state_id
);
add!(self, "(TSStateId)(-1),");
} else {
add_line!(self, "[{i}] = {{.lex_state = {}}},", state.lex_state_id);
add!(self, ".lex_state = {}", state.lex_state_id);
if state.external_lex_state_id > 0 {
add!(
self,
", .external_lex_state = {}",
state.external_lex_state_id
);
}
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
let reserved_word_set_id = self.reserved_word_set_ids_by_parse_state[i];
if reserved_word_set_id != 0 {
add!(self, ", .reserved_word_set_id = {reserved_word_set_id}");
}
}
}
add!(self, "}},\n");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_reserved_word_sets(&mut self) {
add_line!(
self,
"static const TSSymbol ts_reserved_words[{}][MAX_RESERVED_WORD_SET_SIZE] = {{",
self.reserved_word_sets.len(),
);
indent!(self);
for (id, set) in self.reserved_word_sets.iter().enumerate() {
if id == 0 {
continue;
}
add_line!(self, "[{}] = {{", id);
indent!(self);
for token in set.iter() {
add_line!(self, "{},", self.symbol_ids[&token]);
}
dedent!(self);
add_line!(self, "}},");
}
dedent!(self);
add_line!(self, "}};");
@ -1110,6 +1185,7 @@ impl Generator {
let mut parse_table_entries = HashMap::new();
let mut next_parse_action_list_index = 0;
// Parse action list zero is the default value, used when a symbol is not valid.
self.get_parse_action_list_id(
&ParseTableEntry {
actions: Vec::new(),
@ -1135,7 +1211,7 @@ impl Generator {
.enumerate()
.take(self.large_state_count)
{
add_line!(self, "[{i}] = {{");
add_line!(self, "[STATE({i})] = {{");
indent!(self);
// Ensure the entries are in a deterministic order, since they are
@ -1167,9 +1243,11 @@ impl Generator {
);
add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]);
}
dedent!(self);
add_line!(self, "}},");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
@ -1178,11 +1256,11 @@ impl Generator {
add_line!(self, "static const uint16_t ts_small_parse_table[] = {{");
indent!(self);
let mut index = 0;
let mut next_table_index = 0;
let mut small_state_indices = Vec::new();
let mut symbols_by_value = HashMap::<(usize, SymbolType), Vec<Symbol>>::new();
for state in self.parse_table.states.iter().skip(self.large_state_count) {
small_state_indices.push(index);
small_state_indices.push(next_table_index);
symbols_by_value.clear();
terminal_entries.clear();
@ -1221,10 +1299,16 @@ impl Generator {
(symbols.len(), *kind, *value, symbols[0])
});
add_line!(self, "[{index}] = {},", values_with_symbols.len());
add_line!(
self,
"[{next_table_index}] = {},",
values_with_symbols.len()
);
indent!(self);
next_table_index += 1;
for ((value, kind), symbols) in &mut values_with_symbols {
next_table_index += 2 + symbols.len();
if *kind == SymbolType::NonTerminal {
add_line!(self, "STATE({value}), {},", symbols.len());
} else {
@ -1240,11 +1324,6 @@ impl Generator {
}
dedent!(self);
index += 1 + values_with_symbols
.iter()
.map(|(_, symbols)| 2 + symbols.len())
.sum::<usize>();
}
dedent!(self);
@ -1412,9 +1491,9 @@ impl Generator {
}
// Lexing
add_line!(self, ".lex_modes = ts_lex_modes,");
add_line!(self, ".lex_modes = (const void*)ts_lex_modes,");
add_line!(self, ".lex_fn = ts_lex,");
if let Some(keyword_capture_token) = self.keyword_capture_token {
if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
add_line!(self, ".keyword_lex_fn = ts_lex_keywords,");
add_line!(
self,
@ -1439,8 +1518,22 @@ impl Generator {
add_line!(self, ".primary_state_ids = ts_primary_state_ids,");
if self.abi_version >= ABI_VERSION_WITH_METADATA {
if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
add_line!(self, ".name = \"{}\",", self.language_name);
if self.reserved_word_sets.len() > 1 {
add_line!(self, ".reserved_words = &ts_reserved_words[0][0],");
}
add_line!(
self,
".max_reserved_word_set_size = {},",
self.reserved_word_sets
.iter()
.map(TokenSet::len)
.max()
.unwrap()
);
}
dedent!(self);
@ -1716,26 +1809,17 @@ pub fn render_c_code(
);
Generator {
buffer: String::new(),
indent_level: 0,
language_name: name.to_string(),
large_state_count: 0,
parse_table: tables.parse_table,
main_lex_table: tables.main_lex_table,
keyword_lex_table: tables.keyword_lex_table,
keyword_capture_token: tables.word_token,
large_character_sets: tables.large_character_sets,
large_character_set_info: Vec::new(),
syntax_grammar,
lexical_grammar,
default_aliases,
symbol_ids: HashMap::new(),
symbol_order: HashMap::new(),
alias_ids: HashMap::new(),
symbol_map: HashMap::new(),
unique_aliases: Vec::new(),
field_names: Vec::new(),
abi_version,
..Default::default()
}
.generate()
}