Merge pull request #334 from tree-sitter/small-parse-states
Reduce parsers' static memory footprint by storing "small" parse states more compactly
This commit is contained in:
commit
94ca4dc8e0
13 changed files with 485 additions and 96 deletions
|
|
@ -25,10 +25,11 @@ struct AuxiliarySymbolInfo {
|
|||
type SymbolSequence = Vec<Symbol>;
|
||||
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
|
||||
|
||||
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
|
||||
|
||||
struct ParseStateQueueEntry {
|
||||
preceding_symbols: SymbolSequence,
|
||||
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
|
||||
state_id: ParseStateId,
|
||||
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
|
||||
}
|
||||
|
||||
struct ParseTableBuilder<'a> {
|
||||
|
|
@ -38,13 +39,13 @@ struct ParseTableBuilder<'a> {
|
|||
variable_info: &'a Vec<VariableInfo>,
|
||||
core_ids_by_core: HashMap<ParseItemSetCore<'a>, usize>,
|
||||
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
|
||||
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
|
||||
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
|
||||
parse_state_queue: VecDeque<ParseStateQueueEntry>,
|
||||
parse_table: ParseTable,
|
||||
}
|
||||
|
||||
impl<'a> ParseTableBuilder<'a> {
|
||||
fn build(mut self) -> Result<ParseTable> {
|
||||
fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
|
||||
// Ensure that the empty alias sequence has index 0.
|
||||
self.parse_table
|
||||
.production_infos
|
||||
|
|
@ -70,9 +71,10 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
while let Some(entry) = self.parse_state_queue.pop_front() {
|
||||
let item_set = self
|
||||
.item_set_builder
|
||||
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
|
||||
.transitive_closure(&self.parse_state_info_by_id[entry.state_id].1);
|
||||
|
||||
self.add_actions(
|
||||
entry.preceding_symbols,
|
||||
self.parse_state_info_by_id[entry.state_id].0.clone(),
|
||||
entry.preceding_auxiliary_symbols,
|
||||
entry.state_id,
|
||||
item_set,
|
||||
|
|
@ -81,7 +83,7 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
|
||||
self.remove_precedences();
|
||||
|
||||
Ok(self.parse_table)
|
||||
Ok((self.parse_table, self.parse_state_info_by_id))
|
||||
}
|
||||
|
||||
fn add_parse_state(
|
||||
|
|
@ -104,17 +106,19 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
};
|
||||
|
||||
let state_id = self.parse_table.states.len();
|
||||
self.item_sets_by_state_id.push(v.key().clone());
|
||||
self.parse_state_info_by_id
|
||||
.push((preceding_symbols.clone(), v.key().clone()));
|
||||
|
||||
self.parse_table.states.push(ParseState {
|
||||
id: state_id,
|
||||
lex_state_id: 0,
|
||||
external_lex_state_id: 0,
|
||||
terminal_entries: HashMap::new(),
|
||||
nonterminal_entries: HashMap::new(),
|
||||
core_id,
|
||||
});
|
||||
self.parse_state_queue.push_back(ParseStateQueueEntry {
|
||||
state_id,
|
||||
preceding_symbols: preceding_symbols.clone(),
|
||||
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
|
||||
});
|
||||
v.insert(state_id);
|
||||
|
|
@ -750,12 +754,12 @@ fn populate_following_tokens(
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build_parse_table(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
inlines: &InlinedProductionMap,
|
||||
variable_info: &Vec<VariableInfo>,
|
||||
) -> Result<(ParseTable, Vec<TokenSet>)> {
|
||||
pub(crate) fn build_parse_table<'a>(
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
variable_info: &'a Vec<VariableInfo>,
|
||||
) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
|
||||
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
|
||||
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
|
||||
populate_following_tokens(
|
||||
|
|
@ -765,23 +769,24 @@ pub(crate) fn build_parse_table(
|
|||
&item_set_builder,
|
||||
);
|
||||
|
||||
let table = ParseTableBuilder {
|
||||
let (table, item_sets) = ParseTableBuilder {
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
item_set_builder,
|
||||
variable_info,
|
||||
state_ids_by_item_set: HashMap::new(),
|
||||
core_ids_by_core: HashMap::new(),
|
||||
item_sets_by_state_id: Vec::new(),
|
||||
parse_state_info_by_id: Vec::new(),
|
||||
parse_state_queue: VecDeque::new(),
|
||||
parse_table: ParseTable {
|
||||
states: Vec::new(),
|
||||
symbols: Vec::new(),
|
||||
external_lex_states: Vec::new(),
|
||||
production_infos: Vec::new(),
|
||||
max_aliased_production_length: 1,
|
||||
},
|
||||
}
|
||||
.build()?;
|
||||
|
||||
Ok((table, following_tokens))
|
||||
Ok((table, following_tokens, item_sets))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
|
||||
use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
|
||||
use crate::generate::grammars::{
|
||||
LexicalGrammar, Production, ProductionStep, SyntaxGrammar,
|
||||
};
|
||||
use crate::generate::rules::Associativity;
|
||||
use crate::generate::rules::{Symbol, SymbolType, TokenSet};
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
|
@ -161,12 +164,14 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> {
|
|||
for (i, step) in self.0.production.steps.iter().enumerate() {
|
||||
if i == self.0.step_index as usize {
|
||||
write!(f, " •")?;
|
||||
if step.precedence != 0 || step.associativity.is_some() {
|
||||
write!(
|
||||
f,
|
||||
" (prec {:?} assoc {:?})",
|
||||
step.precedence, step.associativity
|
||||
)?;
|
||||
if let Some(associativity) = step.associativity {
|
||||
if step.precedence != 0 {
|
||||
write!(f, " ({} {:?})", step.precedence, associativity)?;
|
||||
} else {
|
||||
write!(f, " ({:?})", associativity)?;
|
||||
}
|
||||
} else if step.precedence != 0 {
|
||||
write!(f, " ({})", step.precedence)?;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -184,19 +189,21 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> {
|
|||
}
|
||||
|
||||
if let Some(alias) = &step.alias {
|
||||
write!(f, " (alias {})", alias.value)?;
|
||||
write!(f, "@{}", alias.value)?;
|
||||
}
|
||||
}
|
||||
|
||||
if self.0.is_done() {
|
||||
write!(f, " •")?;
|
||||
if let Some(step) = self.0.production.steps.last() {
|
||||
if step.precedence != 0 || step.associativity.is_some() {
|
||||
write!(
|
||||
f,
|
||||
" (prec {:?} assoc {:?})",
|
||||
step.precedence, step.associativity
|
||||
)?;
|
||||
if let Some(associativity) = step.associativity {
|
||||
if step.precedence != 0 {
|
||||
write!(f, " ({} {:?})", step.precedence, associativity)?;
|
||||
} else {
|
||||
write!(f, " ({:?})", associativity)?;
|
||||
}
|
||||
} else if step.precedence != 0 {
|
||||
write!(f, " ({})", step.precedence)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ pub(crate) fn minimize_parse_table(
|
|||
minimizer.merge_compatible_states();
|
||||
minimizer.remove_unit_reductions();
|
||||
minimizer.remove_unused_states();
|
||||
minimizer.reorder_states_by_descending_size();
|
||||
}
|
||||
|
||||
struct Minimizer<'a> {
|
||||
|
|
@ -454,4 +455,37 @@ impl<'a> Minimizer<'a> {
|
|||
original_state_id += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn reorder_states_by_descending_size(&mut self) {
|
||||
// Get a mapping of old state index -> new_state_index
|
||||
let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::<Vec<_>>();
|
||||
&old_ids_by_new_id.sort_unstable_by_key(|i| {
|
||||
// Don't changes states 0 (the error state) or 1 (the start state).
|
||||
if *i <= 1 {
|
||||
return *i as i64 - 1_000_000;
|
||||
}
|
||||
|
||||
// Reorder all the other states by descending symbol count.
|
||||
let state = &self.parse_table.states[*i];
|
||||
-((state.terminal_entries.len() + state.nonterminal_entries.len()) as i64)
|
||||
});
|
||||
|
||||
// Get the inverse mapping
|
||||
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
|
||||
for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
|
||||
new_ids_by_old_id[*old_id] = id;
|
||||
}
|
||||
|
||||
// Reorder the parse states and update their references to reflect
|
||||
// the new ordering.
|
||||
self.parse_table.states = old_ids_by_new_id
|
||||
.iter()
|
||||
.map(|old_id| {
|
||||
let mut state = ParseState::default();
|
||||
mem::swap(&mut state, &mut self.parse_table.states[*old_id]);
|
||||
state.update_referenced_states(|id, _| new_ids_by_old_id[id]);
|
||||
state
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ mod minimize_parse_table;
|
|||
mod token_conflicts;
|
||||
|
||||
use self::build_lex_table::build_lex_table;
|
||||
use self::build_parse_table::build_parse_table;
|
||||
use self::build_parse_table::{build_parse_table, ParseStateInfo};
|
||||
use self::coincident_tokens::CoincidentTokenIndex;
|
||||
use self::minimize_parse_table::minimize_parse_table;
|
||||
use self::token_conflicts::TokenConflictMap;
|
||||
|
|
@ -18,6 +18,7 @@ use crate::generate::node_types::VariableInfo;
|
|||
use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
|
||||
use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
|
||||
use log::info;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
|
||||
pub(crate) fn build_tables(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
|
|
@ -25,8 +26,9 @@ pub(crate) fn build_tables(
|
|||
simple_aliases: &AliasMap,
|
||||
variable_info: &Vec<VariableInfo>,
|
||||
inlines: &InlinedProductionMap,
|
||||
report_symbol_name: Option<&str>,
|
||||
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
|
||||
let (mut parse_table, following_tokens) =
|
||||
let (mut parse_table, following_tokens, parse_state_info) =
|
||||
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
|
||||
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
|
||||
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
|
||||
|
|
@ -62,7 +64,18 @@ pub(crate) fn build_tables(
|
|||
&coincident_token_index,
|
||||
&token_conflict_map,
|
||||
);
|
||||
populate_external_lex_states(&mut parse_table, syntax_grammar);
|
||||
mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map);
|
||||
|
||||
if let Some(report_symbol_name) = report_symbol_name {
|
||||
report_state_info(
|
||||
&syntax_grammar,
|
||||
&lexical_grammar,
|
||||
&parse_table,
|
||||
&parse_state_info,
|
||||
report_symbol_name,
|
||||
);
|
||||
}
|
||||
Ok((
|
||||
parse_table,
|
||||
main_lex_table,
|
||||
|
|
@ -197,6 +210,43 @@ fn populate_used_symbols(
|
|||
}
|
||||
}
|
||||
|
||||
fn populate_external_lex_states(parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar) {
|
||||
let mut external_tokens_by_corresponding_internal_token = HashMap::new();
|
||||
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
|
||||
if let Some(symbol) = external_token.corresponding_internal_token {
|
||||
external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that external lex state 0 represents the absence of any
|
||||
// external tokens.
|
||||
parse_table.external_lex_states.push(TokenSet::new());
|
||||
|
||||
for i in 0..parse_table.states.len() {
|
||||
let mut external_tokens = TokenSet::new();
|
||||
for token in parse_table.states[i].terminal_entries.keys() {
|
||||
if token.is_external() {
|
||||
external_tokens.insert(*token);
|
||||
} else if token.is_terminal() {
|
||||
if let Some(index) =
|
||||
external_tokens_by_corresponding_internal_token.get(&token.index)
|
||||
{
|
||||
external_tokens.insert(Symbol::external(*index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parse_table.states[i].external_lex_state_id = parse_table
|
||||
.external_lex_states
|
||||
.iter()
|
||||
.position(|tokens| *tokens == external_tokens)
|
||||
.unwrap_or_else(|| {
|
||||
parse_table.external_lex_states.push(external_tokens);
|
||||
parse_table.external_lex_states.len() - 1
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn identify_keywords(
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
parse_table: &ParseTable,
|
||||
|
|
@ -333,6 +383,90 @@ fn mark_fragile_tokens(
|
|||
}
|
||||
}
|
||||
|
||||
fn report_state_info<'a>(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
parse_table: &ParseTable,
|
||||
parse_state_info: &Vec<ParseStateInfo<'a>>,
|
||||
report_symbol_name: &'a str,
|
||||
) {
|
||||
let mut all_state_indices = BTreeSet::new();
|
||||
let mut symbols_with_state_indices = (0..syntax_grammar.variables.len())
|
||||
.map(|i| (Symbol::non_terminal(i), BTreeSet::new()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for (i, state) in parse_table.states.iter().enumerate() {
|
||||
all_state_indices.insert(i);
|
||||
let item_set = &parse_state_info[state.id];
|
||||
for (item, _) in item_set.1.entries.iter() {
|
||||
if !item.is_augmented() {
|
||||
symbols_with_state_indices[item.variable_index as usize]
|
||||
.1
|
||||
.insert(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
symbols_with_state_indices.sort_unstable_by_key(|(_, states)| -(states.len() as i32));
|
||||
|
||||
let max_symbol_name_length = syntax_grammar
|
||||
.variables
|
||||
.iter()
|
||||
.map(|v| v.name.len())
|
||||
.max()
|
||||
.unwrap();
|
||||
for (symbol, states) in &symbols_with_state_indices {
|
||||
eprintln!(
|
||||
"{:width$}\t{}",
|
||||
syntax_grammar.variables[symbol.index].name,
|
||||
states.len(),
|
||||
width = max_symbol_name_length
|
||||
);
|
||||
}
|
||||
eprintln!("");
|
||||
|
||||
let state_indices = if report_symbol_name == "*" {
|
||||
Some(&all_state_indices)
|
||||
} else {
|
||||
symbols_with_state_indices
|
||||
.iter()
|
||||
.find_map(|(symbol, state_indices)| {
|
||||
if syntax_grammar.variables[symbol.index].name == report_symbol_name {
|
||||
Some(state_indices)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
if let Some(state_indices) = state_indices {
|
||||
let mut state_indices = state_indices.into_iter().cloned().collect::<Vec<_>>();
|
||||
state_indices.sort_unstable_by_key(|i| (parse_table.states[*i].core_id, *i));
|
||||
|
||||
for state_index in state_indices {
|
||||
let id = parse_table.states[state_index].id;
|
||||
let (preceding_symbols, item_set) = &parse_state_info[id];
|
||||
eprintln!("state index: {}", state_index);
|
||||
eprintln!("state id: {}", id);
|
||||
eprint!("symbol sequence:");
|
||||
for symbol in preceding_symbols {
|
||||
let name = if symbol.is_terminal() {
|
||||
&lexical_grammar.variables[symbol.index].name
|
||||
} else if symbol.is_external() {
|
||||
&syntax_grammar.external_tokens[symbol.index].name
|
||||
} else {
|
||||
&syntax_grammar.variables[symbol.index].name
|
||||
};
|
||||
eprint!(" {}", name);
|
||||
}
|
||||
eprintln!(
|
||||
"\nitems:\n{}",
|
||||
self::item::ParseItemSetDisplay(&item_set, syntax_grammar, lexical_grammar,),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
|
||||
cursor.transition_chars().all(|(chars, is_sep)| {
|
||||
if is_sep {
|
||||
|
|
|
|||
|
|
@ -33,6 +33,16 @@ lazy_static! {
|
|||
.unwrap();
|
||||
}
|
||||
|
||||
const NEW_HEADER_PARTS: [&'static str; 2] = [
|
||||
"
|
||||
uint32_t large_state_count;
|
||||
const uint16_t *small_parse_table;
|
||||
const uint32_t *small_parse_table_map;",
|
||||
"
|
||||
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
|
||||
",
|
||||
];
|
||||
|
||||
struct GeneratedParser {
|
||||
c_code: String,
|
||||
node_types_json: String,
|
||||
|
|
@ -42,6 +52,8 @@ pub fn generate_parser_in_directory(
|
|||
repo_path: &PathBuf,
|
||||
grammar_path: Option<&str>,
|
||||
properties_only: bool,
|
||||
next_abi: bool,
|
||||
report_symbol_name: Option<&str>,
|
||||
) -> Result<()> {
|
||||
let src_path = repo_path.join("src");
|
||||
let header_path = src_path.join("tree_sitter");
|
||||
|
|
@ -102,11 +114,28 @@ pub fn generate_parser_in_directory(
|
|||
lexical_grammar,
|
||||
inlines,
|
||||
simple_aliases,
|
||||
next_abi,
|
||||
report_symbol_name,
|
||||
)?;
|
||||
|
||||
write_file(&src_path.join("parser.c"), c_code)?;
|
||||
write_file(&src_path.join("node-types.json"), node_types_json)?;
|
||||
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
|
||||
|
||||
if next_abi {
|
||||
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
|
||||
} else {
|
||||
let mut header = tree_sitter::PARSER_HEADER.to_string();
|
||||
|
||||
for part in &NEW_HEADER_PARTS {
|
||||
let pos = header
|
||||
.find(part)
|
||||
.expect("Missing expected part of parser.h header");
|
||||
header.replace_range(pos..(pos + part.len()), "");
|
||||
}
|
||||
|
||||
write_file(&header_path.join("parser.h"), header)?;
|
||||
}
|
||||
|
||||
ensure_file(&repo_path.join("index.js"), || {
|
||||
npm_files::index_js(&language_name)
|
||||
})?;
|
||||
|
|
@ -132,6 +161,8 @@ pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String
|
|||
lexical_grammar,
|
||||
inlines,
|
||||
simple_aliases,
|
||||
true,
|
||||
None,
|
||||
)?;
|
||||
Ok((input_grammar.name, parser.c_code))
|
||||
}
|
||||
|
|
@ -142,6 +173,8 @@ fn generate_parser_for_grammar_with_opts(
|
|||
lexical_grammar: LexicalGrammar,
|
||||
inlines: InlinedProductionMap,
|
||||
simple_aliases: AliasMap,
|
||||
next_abi: bool,
|
||||
report_symbol_name: Option<&str>,
|
||||
) -> Result<GeneratedParser> {
|
||||
let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?;
|
||||
let node_types_json = node_types::generate_node_types_json(
|
||||
|
|
@ -156,6 +189,7 @@ fn generate_parser_for_grammar_with_opts(
|
|||
&simple_aliases,
|
||||
&variable_info,
|
||||
&inlines,
|
||||
report_symbol_name,
|
||||
)?;
|
||||
let c_code = render_c_code(
|
||||
name,
|
||||
|
|
@ -166,6 +200,7 @@ fn generate_parser_for_grammar_with_opts(
|
|||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
simple_aliases,
|
||||
next_abi,
|
||||
);
|
||||
Ok(GeneratedParser {
|
||||
c_code,
|
||||
|
|
|
|||
|
|
@ -1,14 +1,18 @@
|
|||
use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType};
|
||||
use super::nfa::CharacterSet;
|
||||
use super::rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet};
|
||||
use super::rules::{Alias, AliasMap, Symbol, SymbolType};
|
||||
use super::tables::{
|
||||
AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry,
|
||||
};
|
||||
use core::ops::Range;
|
||||
use std::cmp;
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::fmt::Write;
|
||||
use std::mem::swap;
|
||||
use tree_sitter::LANGUAGE_VERSION;
|
||||
|
||||
// Currently, the library supports a new ABI version that has not yet been
|
||||
// stabilized, and the parser generation does not use it by default.
|
||||
const STABLE_LANGUAGE_VERSION: usize = tree_sitter::LANGUAGE_VERSION - 1;
|
||||
|
||||
macro_rules! add {
|
||||
($this: tt, $($arg: tt)*) => {{
|
||||
|
|
@ -45,6 +49,8 @@ macro_rules! dedent {
|
|||
};
|
||||
}
|
||||
|
||||
const SMALL_STATE_THRESHOLD: usize = 48;
|
||||
|
||||
struct Generator {
|
||||
buffer: String,
|
||||
indent_level: usize,
|
||||
|
|
@ -52,15 +58,17 @@ struct Generator {
|
|||
parse_table: ParseTable,
|
||||
main_lex_table: LexTable,
|
||||
keyword_lex_table: LexTable,
|
||||
large_state_count: usize,
|
||||
keyword_capture_token: Option<Symbol>,
|
||||
syntax_grammar: SyntaxGrammar,
|
||||
lexical_grammar: LexicalGrammar,
|
||||
simple_aliases: AliasMap,
|
||||
symbol_order: HashMap<Symbol, usize>,
|
||||
symbol_ids: HashMap<Symbol, String>,
|
||||
alias_ids: HashMap<Alias, String>,
|
||||
external_scanner_states: Vec<TokenSet>,
|
||||
alias_map: BTreeMap<Alias, Option<Symbol>>,
|
||||
field_names: Vec<String>,
|
||||
next_abi: bool,
|
||||
}
|
||||
|
||||
impl Generator {
|
||||
|
|
@ -148,6 +156,27 @@ impl Generator {
|
|||
field_names.sort_unstable();
|
||||
field_names.dedup();
|
||||
self.field_names = field_names.into_iter().cloned().collect();
|
||||
|
||||
// If we are opting in to the new unstable language ABI, then use the concept of
|
||||
// "small parse states". Otherwise, use the same representation for all parse
|
||||
// states.
|
||||
if self.next_abi {
|
||||
let threshold = cmp::min(
|
||||
SMALL_STATE_THRESHOLD,
|
||||
self.parse_table.symbols.len() / 2 - 1,
|
||||
);
|
||||
self.large_state_count = self
|
||||
.parse_table
|
||||
.states
|
||||
.iter()
|
||||
.enumerate()
|
||||
.take_while(|(i, s)| {
|
||||
*i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold
|
||||
})
|
||||
.count();
|
||||
} else {
|
||||
self.large_state_count = self.parse_table.states.len();
|
||||
}
|
||||
}
|
||||
|
||||
fn add_includes(&mut self) {
|
||||
|
|
@ -198,12 +227,26 @@ impl Generator {
|
|||
})
|
||||
.count();
|
||||
|
||||
add_line!(self, "#define LANGUAGE_VERSION {}", LANGUAGE_VERSION);
|
||||
if self.next_abi {
|
||||
add_line!(
|
||||
self,
|
||||
"#define LANGUAGE_VERSION {}",
|
||||
tree_sitter::LANGUAGE_VERSION
|
||||
);
|
||||
} else {
|
||||
add_line!(self, "#define LANGUAGE_VERSION {}", STABLE_LANGUAGE_VERSION);
|
||||
}
|
||||
|
||||
add_line!(
|
||||
self,
|
||||
"#define STATE_COUNT {}",
|
||||
self.parse_table.states.len()
|
||||
);
|
||||
|
||||
if self.next_abi {
|
||||
add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count);
|
||||
}
|
||||
|
||||
add_line!(
|
||||
self,
|
||||
"#define SYMBOL_COUNT {}",
|
||||
|
|
@ -232,9 +275,11 @@ impl Generator {
|
|||
fn add_symbol_enum(&mut self) {
|
||||
add_line!(self, "enum {{");
|
||||
indent!(self);
|
||||
self.symbol_order.insert(Symbol::end(), 0);
|
||||
let mut i = 1;
|
||||
for symbol in self.parse_table.symbols.iter() {
|
||||
if *symbol != Symbol::end() {
|
||||
self.symbol_order.insert(*symbol, i);
|
||||
add_line!(self, "{} = {},", self.symbol_ids[&symbol], i);
|
||||
i += 1;
|
||||
}
|
||||
|
|
@ -633,40 +678,16 @@ impl Generator {
|
|||
}
|
||||
|
||||
fn add_lex_modes_list(&mut self) {
|
||||
self.get_external_scanner_state_id(TokenSet::new());
|
||||
|
||||
let mut external_tokens_by_corresponding_internal_token = HashMap::new();
|
||||
for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() {
|
||||
if let Some(symbol) = external_token.corresponding_internal_token {
|
||||
external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
|
||||
}
|
||||
}
|
||||
|
||||
add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
|
||||
indent!(self);
|
||||
for i in 0..self.parse_table.states.len() {
|
||||
let mut external_tokens = TokenSet::new();
|
||||
for token in self.parse_table.states[i].terminal_entries.keys() {
|
||||
if token.is_external() {
|
||||
external_tokens.insert(*token);
|
||||
} else if token.is_terminal() {
|
||||
if let Some(external_index) =
|
||||
external_tokens_by_corresponding_internal_token.get(&token.index)
|
||||
{
|
||||
external_tokens.insert(Symbol::external(*external_index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let external_state_id = self.get_external_scanner_state_id(external_tokens);
|
||||
let state = &self.parse_table.states[i];
|
||||
if external_state_id > 0 {
|
||||
for (i, state) in self.parse_table.states.iter().enumerate() {
|
||||
if state.external_lex_state_id > 0 {
|
||||
add_line!(
|
||||
self,
|
||||
"[{}] = {{.lex_state = {}, .external_lex_state = {}}},",
|
||||
i,
|
||||
state.lex_state_id,
|
||||
external_state_id
|
||||
state.external_lex_state_id
|
||||
);
|
||||
} else {
|
||||
add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id);
|
||||
|
|
@ -720,14 +741,14 @@ impl Generator {
|
|||
add_line!(
|
||||
self,
|
||||
"static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{",
|
||||
self.external_scanner_states.len(),
|
||||
self.parse_table.external_lex_states.len(),
|
||||
);
|
||||
indent!(self);
|
||||
for i in 0..self.external_scanner_states.len() {
|
||||
if !self.external_scanner_states[i].is_empty() {
|
||||
for i in 0..self.parse_table.external_lex_states.len() {
|
||||
if !self.parse_table.external_lex_states[i].is_empty() {
|
||||
add_line!(self, "[{}] = {{", i);
|
||||
indent!(self);
|
||||
for token in self.external_scanner_states[i].iter() {
|
||||
for token in self.parse_table.external_lex_states[i].iter() {
|
||||
add_line!(
|
||||
self,
|
||||
"[{}] = true,",
|
||||
|
|
@ -758,25 +779,42 @@ impl Generator {
|
|||
|
||||
add_line!(
|
||||
self,
|
||||
"static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{"
|
||||
"static uint16_t ts_parse_table[{}][SYMBOL_COUNT] = {{",
|
||||
if self.next_abi {
|
||||
"LARGE_STATE_COUNT"
|
||||
} else {
|
||||
"STATE_COUNT"
|
||||
}
|
||||
);
|
||||
indent!(self);
|
||||
|
||||
let mut terminal_entries = Vec::new();
|
||||
let mut nonterminal_entries = Vec::new();
|
||||
|
||||
for (i, state) in self.parse_table.states.iter().enumerate() {
|
||||
for (i, state) in self
|
||||
.parse_table
|
||||
.states
|
||||
.iter()
|
||||
.enumerate()
|
||||
.take(self.large_state_count)
|
||||
{
|
||||
add_line!(self, "[{}] = {{", i);
|
||||
indent!(self);
|
||||
|
||||
terminal_entries.clear();
|
||||
nonterminal_entries.clear();
|
||||
terminal_entries.extend(state.terminal_entries.iter());
|
||||
nonterminal_entries.extend(state.nonterminal_entries.iter());
|
||||
terminal_entries.sort_unstable_by_key(|e| e.0);
|
||||
nonterminal_entries.sort_unstable_by_key(|e| e.0);
|
||||
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
|
||||
nonterminal_entries.sort_unstable_by_key(|k| k.0);
|
||||
|
||||
add_line!(self, "[{}] = {{", i);
|
||||
indent!(self);
|
||||
for (symbol, state_id) in &nonterminal_entries {
|
||||
add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id);
|
||||
add_line!(
|
||||
self,
|
||||
"[{}] = STATE({}),",
|
||||
self.symbol_ids[symbol],
|
||||
*state_id
|
||||
);
|
||||
}
|
||||
|
||||
for (symbol, entry) in &terminal_entries {
|
||||
|
|
@ -799,6 +837,59 @@ impl Generator {
|
|||
add_line!(self, "}};");
|
||||
add_line!(self, "");
|
||||
|
||||
if self.large_state_count < self.parse_table.states.len() {
|
||||
add_line!(self, "static uint32_t ts_small_parse_table_map[] = {{");
|
||||
indent!(self);
|
||||
let mut index = 0;
|
||||
for (i, state) in self
|
||||
.parse_table
|
||||
.states
|
||||
.iter()
|
||||
.enumerate()
|
||||
.skip(self.large_state_count)
|
||||
{
|
||||
add_line!(self, "[SMALL_STATE({})] = {},", i, index);
|
||||
index += 1 + 2 * state.symbol_count();
|
||||
}
|
||||
dedent!(self);
|
||||
add_line!(self, "}};");
|
||||
add_line!(self, "");
|
||||
|
||||
index = 0;
|
||||
add_line!(self, "static uint16_t ts_small_parse_table[] = {{");
|
||||
indent!(self);
|
||||
for state in self.parse_table.states.iter().skip(self.large_state_count) {
|
||||
add_line!(self, "[{}] = {},", index, state.symbol_count());
|
||||
indent!(self);
|
||||
|
||||
terminal_entries.clear();
|
||||
nonterminal_entries.clear();
|
||||
terminal_entries.extend(state.terminal_entries.iter());
|
||||
nonterminal_entries.extend(state.nonterminal_entries.iter());
|
||||
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
|
||||
nonterminal_entries.sort_unstable_by_key(|k| k.0);
|
||||
|
||||
for (symbol, entry) in &terminal_entries {
|
||||
let entry_id = self.get_parse_action_list_id(
|
||||
entry,
|
||||
&mut parse_table_entries,
|
||||
&mut next_parse_action_list_index,
|
||||
);
|
||||
add_line!(self, "{}, ACTIONS({}),", self.symbol_ids[symbol], entry_id);
|
||||
}
|
||||
|
||||
for (symbol, state_id) in &nonterminal_entries {
|
||||
add_line!(self, "{}, STATE({}),", self.symbol_ids[symbol], *state_id);
|
||||
}
|
||||
dedent!(self);
|
||||
|
||||
index += 1 + 2 * state.symbol_count();
|
||||
}
|
||||
dedent!(self);
|
||||
add_line!(self, "}};");
|
||||
add_line!(self, "");
|
||||
}
|
||||
|
||||
self.add_parse_action_list(parse_table_entries);
|
||||
}
|
||||
|
||||
|
|
@ -897,11 +988,28 @@ impl Generator {
|
|||
add_line!(self, ".symbol_count = SYMBOL_COUNT,");
|
||||
add_line!(self, ".alias_count = ALIAS_COUNT,");
|
||||
add_line!(self, ".token_count = TOKEN_COUNT,");
|
||||
|
||||
if self.next_abi {
|
||||
add_line!(self, ".large_state_count = LARGE_STATE_COUNT,");
|
||||
}
|
||||
|
||||
add_line!(self, ".symbol_metadata = ts_symbol_metadata,");
|
||||
add_line!(
|
||||
self,
|
||||
".parse_table = (const unsigned short *)ts_parse_table,"
|
||||
);
|
||||
|
||||
if self.large_state_count < self.parse_table.states.len() {
|
||||
add_line!(
|
||||
self,
|
||||
".small_parse_table = (const uint16_t *)ts_small_parse_table,"
|
||||
);
|
||||
add_line!(
|
||||
self,
|
||||
".small_parse_table_map = (const uint32_t *)ts_small_parse_table_map,"
|
||||
);
|
||||
}
|
||||
|
||||
add_line!(self, ".parse_actions = ts_parse_actions,");
|
||||
add_line!(self, ".lex_modes = ts_lex_modes,");
|
||||
add_line!(self, ".symbol_names = ts_symbol_names,");
|
||||
|
|
@ -997,16 +1105,6 @@ impl Generator {
|
|||
result
|
||||
}
|
||||
|
||||
/// Returns the index of `external_tokens` within
/// `self.external_scanner_states`, appending the set first if no equal set
/// is already stored.
fn get_external_scanner_state_id(&mut self, external_tokens: TokenSet) -> usize {
    let existing_id = self
        .external_scanner_states
        .iter()
        .position(|tokens| *tokens == external_tokens);
    match existing_id {
        Some(id) => id,
        None => {
            self.external_scanner_states.push(external_tokens);
            self.external_scanner_states.len() - 1
        }
    }
}
|
||||
|
||||
fn external_token_id(&self, token: &ExternalToken) -> String {
|
||||
format!(
|
||||
"ts_external_token_{}",
|
||||
|
|
@ -1152,6 +1250,23 @@ impl Generator {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns a String of C code for the given components of a parser.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `name` - A string slice containing the name of the language
|
||||
/// * `parse_table` - The generated parse table for the language
|
||||
/// * `main_lex_table` - The generated lexing table for the language
|
||||
/// * `keyword_lex_table` - The generated keyword lexing table for the language
|
||||
/// * `keyword_capture_token` - A symbol indicating which token is used
|
||||
/// for keyword capture, if any.
|
||||
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
|
||||
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
|
||||
/// * `simple_aliases` - A map describing the global rename rules that should apply.
|
||||
/// The keys are symbols that are *always* aliased in the same way, and the values
|
||||
/// are the aliases that are applied to those symbols.
|
||||
/// * `next_abi` - A boolean indicating whether to opt into the new, unstable parse
|
||||
/// table format. This is mainly used for testing, when developing Tree-sitter itself.
|
||||
pub(crate) fn render_c_code(
|
||||
name: &str,
|
||||
parse_table: ParseTable,
|
||||
|
|
@ -1161,11 +1276,13 @@ pub(crate) fn render_c_code(
|
|||
syntax_grammar: SyntaxGrammar,
|
||||
lexical_grammar: LexicalGrammar,
|
||||
simple_aliases: AliasMap,
|
||||
next_abi: bool,
|
||||
) -> String {
|
||||
Generator {
|
||||
buffer: String::new(),
|
||||
indent_level: 0,
|
||||
language_name: name.to_string(),
|
||||
large_state_count: 0,
|
||||
parse_table,
|
||||
main_lex_table,
|
||||
keyword_lex_table,
|
||||
|
|
@ -1174,10 +1291,11 @@ pub(crate) fn render_c_code(
|
|||
lexical_grammar,
|
||||
simple_aliases,
|
||||
symbol_ids: HashMap::new(),
|
||||
symbol_order: HashMap::new(),
|
||||
alias_ids: HashMap::new(),
|
||||
external_scanner_states: Vec::new(),
|
||||
alias_map: BTreeMap::new(),
|
||||
field_names: Vec::new(),
|
||||
next_abi,
|
||||
}
|
||||
.generate()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
use super::nfa::CharacterSet;
|
||||
use super::rules::{Alias, Associativity, Symbol};
|
||||
use super::rules::{Alias, Associativity, Symbol, TokenSet};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
|
||||
pub(crate) type ProductionInfoId = usize;
|
||||
pub(crate) type ParseStateId = usize;
|
||||
pub(crate) type LexStateId = usize;
|
||||
|
|
@ -37,6 +36,7 @@ pub(crate) struct ParseState {
|
|||
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
|
||||
pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
|
||||
pub lex_state_id: usize,
|
||||
pub external_lex_state_id: usize,
|
||||
pub core_id: usize,
|
||||
}
|
||||
|
||||
|
|
@ -58,6 +58,7 @@ pub(crate) struct ParseTable {
|
|||
pub symbols: Vec<Symbol>,
|
||||
pub production_infos: Vec<ProductionInfo>,
|
||||
pub max_aliased_production_length: usize,
|
||||
pub external_lex_states: Vec<TokenSet>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
|
|
@ -93,6 +94,10 @@ impl Default for LexTable {
|
|||
}
|
||||
|
||||
impl ParseState {
|
||||
pub fn symbol_count(&self) -> usize {
|
||||
self.terminal_entries.len() + self.nonterminal_entries.len()
|
||||
}
|
||||
|
||||
pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
|
||||
self.terminal_entries
|
||||
.iter()
|
||||
|
|
|
|||
|
|
@ -38,7 +38,14 @@ fn run() -> error::Result<()> {
|
|||
.about("Generate a parser")
|
||||
.arg(Arg::with_name("grammar-path").index(1))
|
||||
.arg(Arg::with_name("log").long("log"))
|
||||
.arg(Arg::with_name("next-abi").long("next-abi"))
|
||||
.arg(Arg::with_name("properties-only").long("properties"))
|
||||
.arg(
|
||||
Arg::with_name("report-states-for-rule")
|
||||
.long("report-states-for-rule")
|
||||
.value_name("rule-name")
|
||||
.takes_value(true),
|
||||
)
|
||||
.arg(Arg::with_name("no-minimize").long("no-minimize")),
|
||||
)
|
||||
.subcommand(
|
||||
|
|
@ -121,10 +128,24 @@ fn run() -> error::Result<()> {
|
|||
} else if let Some(matches) = matches.subcommand_matches("generate") {
|
||||
let grammar_path = matches.value_of("grammar-path");
|
||||
let properties_only = matches.is_present("properties-only");
|
||||
let report_symbol_name = matches.value_of("report-states-for-rule").or_else(|| {
|
||||
if matches.is_present("report-states") {
|
||||
Some("")
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
if matches.is_present("log") {
|
||||
logger::init();
|
||||
}
|
||||
generate::generate_parser_in_directory(&current_dir, grammar_path, properties_only)?;
|
||||
let next_abi = matches.is_present("next-abi");
|
||||
generate::generate_parser_in_directory(
|
||||
&current_dir,
|
||||
grammar_path,
|
||||
properties_only,
|
||||
next_abi,
|
||||
report_symbol_name,
|
||||
)?;
|
||||
} else if let Some(matches) = matches.subcommand_matches("test") {
|
||||
let debug = matches.is_present("debug");
|
||||
let debug_graph = matches.is_present("debug-graph");
|
||||
|
|
|
|||
|
|
@ -591,5 +591,5 @@ extern "C" {
|
|||
pub fn ts_language_version(arg1: *const TSLanguage) -> u32;
|
||||
}
|
||||
|
||||
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 10;
|
||||
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 11;
|
||||
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 9;
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ extern "C" {
|
|||
/* Section - ABI Versioning */
|
||||
/****************************/
|
||||
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 10
|
||||
#define TREE_SITTER_LANGUAGE_VERSION 11
|
||||
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
|
||||
|
||||
/*******************/
|
||||
|
|
|
|||
|
|
@ -114,6 +114,9 @@ struct TSLanguage {
|
|||
const TSFieldMapSlice *field_map_slices;
|
||||
const TSFieldMapEntry *field_map_entries;
|
||||
const char **field_names;
|
||||
uint32_t large_state_count;
|
||||
const uint16_t *small_parse_table;
|
||||
const uint32_t *small_parse_table_map;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -155,6 +158,8 @@ struct TSLanguage {
|
|||
* Parse Table Macros
|
||||
*/
|
||||
|
||||
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
|
||||
|
||||
#define STATE(id) id
|
||||
|
||||
#define ACTIONS(id) id
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state,
|
|||
result->actions = NULL;
|
||||
} else {
|
||||
assert(symbol < self->token_count);
|
||||
uint32_t action_index = self->parse_table[state * self->symbol_count + symbol];
|
||||
uint32_t action_index = ts_language_lookup(self, state, symbol);
|
||||
const TSParseActionEntry *entry = &self->parse_actions[action_index];
|
||||
result->action_count = entry->count;
|
||||
result->is_reusable = entry->reusable;
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ extern "C" {
|
|||
|
||||
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
|
||||
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
|
||||
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
|
||||
|
||||
typedef struct {
|
||||
const TSParseAction *actions;
|
||||
|
|
@ -51,6 +52,30 @@ static inline bool ts_language_has_reduce_action(const TSLanguage *self,
|
|||
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
|
||||
}
|
||||
|
||||
static inline uint16_t ts_language_lookup(
|
||||
const TSLanguage *self,
|
||||
TSStateId state,
|
||||
TSSymbol symbol
|
||||
) {
|
||||
if (
|
||||
self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
|
||||
state >= self->large_state_count
|
||||
) {
|
||||
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
|
||||
const uint16_t *state_data = &self->small_parse_table[index];
|
||||
uint16_t symbol_count = *state_data;
|
||||
state_data++;
|
||||
for (unsigned i = 0; i < symbol_count; i++) {
|
||||
if (state_data[0] == symbol) return state_data[1];
|
||||
if (state_data[0] > symbol) break;
|
||||
state_data += 2;
|
||||
}
|
||||
return 0;
|
||||
} else {
|
||||
return self->parse_table[state * self->symbol_count + symbol];
|
||||
}
|
||||
}
|
||||
|
||||
static inline TSStateId ts_language_next_state(const TSLanguage *self,
|
||||
TSStateId state,
|
||||
TSSymbol symbol) {
|
||||
|
|
@ -67,7 +92,7 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self,
|
|||
}
|
||||
return 0;
|
||||
} else {
|
||||
return self->parse_table[state * self->symbol_count + symbol];
|
||||
return ts_language_lookup(self, state, symbol);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue