Merge pull request #334 from tree-sitter/small-parse-states

Reduce parsers' static memory footprint by storing "small" parse states more compactly
This commit is contained in:
Max Brunsfeld 2019-08-29 20:30:51 -07:00 committed by GitHub
commit 94ca4dc8e0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 485 additions and 96 deletions

View file

@ -25,10 +25,11 @@ struct AuxiliarySymbolInfo {
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
struct ParseStateQueueEntry {
preceding_symbols: SymbolSequence,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
state_id: ParseStateId,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
}
struct ParseTableBuilder<'a> {
@ -38,13 +39,13 @@ struct ParseTableBuilder<'a> {
variable_info: &'a Vec<VariableInfo>,
core_ids_by_core: HashMap<ParseItemSetCore<'a>, usize>,
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
parse_table: ParseTable,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<ParseTable> {
fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
// Ensure that the empty alias sequence has index 0.
self.parse_table
.production_infos
@ -70,9 +71,10 @@ impl<'a> ParseTableBuilder<'a> {
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
.transitive_closure(&self.parse_state_info_by_id[entry.state_id].1);
self.add_actions(
entry.preceding_symbols,
self.parse_state_info_by_id[entry.state_id].0.clone(),
entry.preceding_auxiliary_symbols,
entry.state_id,
item_set,
@ -81,7 +83,7 @@ impl<'a> ParseTableBuilder<'a> {
self.remove_precedences();
Ok(self.parse_table)
Ok((self.parse_table, self.parse_state_info_by_id))
}
fn add_parse_state(
@ -104,17 +106,19 @@ impl<'a> ParseTableBuilder<'a> {
};
let state_id = self.parse_table.states.len();
self.item_sets_by_state_id.push(v.key().clone());
self.parse_state_info_by_id
.push((preceding_symbols.clone(), v.key().clone()));
self.parse_table.states.push(ParseState {
id: state_id,
lex_state_id: 0,
external_lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
core_id,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_symbols: preceding_symbols.clone(),
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
@ -750,12 +754,12 @@ fn populate_following_tokens(
}
}
pub(crate) fn build_parse_table(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
variable_info: &Vec<VariableInfo>,
) -> Result<(ParseTable, Vec<TokenSet>)> {
pub(crate) fn build_parse_table<'a>(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
variable_info: &'a Vec<VariableInfo>,
) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
@ -765,23 +769,24 @@ pub(crate) fn build_parse_table(
&item_set_builder,
);
let table = ParseTableBuilder {
let (table, item_sets) = ParseTableBuilder {
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
state_ids_by_item_set: HashMap::new(),
core_ids_by_core: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
.build()?;
Ok((table, following_tokens))
Ok((table, following_tokens, item_sets))
}

View file

@ -1,5 +1,8 @@
use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
use crate::generate::grammars::{
LexicalGrammar, Production, ProductionStep, SyntaxGrammar,
};
use crate::generate::rules::Associativity;
use crate::generate::rules::{Symbol, SymbolType, TokenSet};
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::fmt;
@ -161,12 +164,14 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> {
for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize {
write!(f, "")?;
if step.precedence != 0 || step.associativity.is_some() {
write!(
f,
" (prec {:?} assoc {:?})",
step.precedence, step.associativity
)?;
if let Some(associativity) = step.associativity {
if step.precedence != 0 {
write!(f, " ({} {:?})", step.precedence, associativity)?;
} else {
write!(f, " ({:?})", associativity)?;
}
} else if step.precedence != 0 {
write!(f, " ({})", step.precedence)?;
}
}
@ -184,19 +189,21 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> {
}
if let Some(alias) = &step.alias {
write!(f, " (alias {})", alias.value)?;
write!(f, "@{}", alias.value)?;
}
}
if self.0.is_done() {
write!(f, "")?;
if let Some(step) = self.0.production.steps.last() {
if step.precedence != 0 || step.associativity.is_some() {
write!(
f,
" (prec {:?} assoc {:?})",
step.precedence, step.associativity
)?;
if let Some(associativity) = step.associativity {
if step.precedence != 0 {
write!(f, " ({} {:?})", step.precedence, associativity)?;
} else {
write!(f, " ({:?})", associativity)?;
}
} else if step.precedence != 0 {
write!(f, " ({})", step.precedence)?;
}
}
}

View file

@ -26,6 +26,7 @@ pub(crate) fn minimize_parse_table(
minimizer.merge_compatible_states();
minimizer.remove_unit_reductions();
minimizer.remove_unused_states();
minimizer.reorder_states_by_descending_size();
}
struct Minimizer<'a> {
@ -454,4 +455,37 @@ impl<'a> Minimizer<'a> {
original_state_id += 1;
}
}
/// Renumber the parse states so that, apart from the two fixed states,
/// states with more symbol entries come first. This lets the code
/// generator store the trailing "small" states in a compact format.
///
/// Fix: the original prefixed the `sort_unstable_by_key` statement with
/// `&`, taking a useless borrow of its `()` return value (a no-op that
/// the compiler warns about). The borrow is removed.
fn reorder_states_by_descending_size(&mut self) {
    // Get a mapping of new state index -> old state index.
    let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::<Vec<_>>();
    old_ids_by_new_id.sort_unstable_by_key(|i| {
        // Don't change states 0 (the error state) or 1 (the start state):
        // the large negative key pins them at the front in their original order.
        if *i <= 1 {
            return *i as i64 - 1_000_000;
        }

        // Reorder all the other states by descending symbol count.
        let state = &self.parse_table.states[*i];
        -((state.terminal_entries.len() + state.nonterminal_entries.len()) as i64)
    });

    // Get the inverse mapping: old state index -> new state index.
    let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
    for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
        new_ids_by_old_id[*old_id] = id;
    }

    // Reorder the parse states and update their references to reflect
    // the new ordering. `mem::swap` with a default state moves each state
    // out of the old vector without cloning it.
    self.parse_table.states = old_ids_by_new_id
        .iter()
        .map(|old_id| {
            let mut state = ParseState::default();
            mem::swap(&mut state, &mut self.parse_table.states[*old_id]);
            state.update_referenced_states(|id, _| new_ids_by_old_id[id]);
            state
        })
        .collect();
}
}

View file

@ -7,7 +7,7 @@ mod minimize_parse_table;
mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::build_parse_table;
use self::build_parse_table::{build_parse_table, ParseStateInfo};
use self::coincident_tokens::CoincidentTokenIndex;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
@ -18,6 +18,7 @@ use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
use log::info;
use std::collections::{BTreeSet, HashMap};
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
@ -25,8 +26,9 @@ pub(crate) fn build_tables(
simple_aliases: &AliasMap,
variable_info: &Vec<VariableInfo>,
inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
let (mut parse_table, following_tokens) =
let (mut parse_table, following_tokens, parse_state_info) =
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
@ -62,7 +64,18 @@ pub(crate) fn build_tables(
&coincident_token_index,
&token_conflict_map,
);
populate_external_lex_states(&mut parse_table, syntax_grammar);
mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map);
if let Some(report_symbol_name) = report_symbol_name {
report_state_info(
&syntax_grammar,
&lexical_grammar,
&parse_table,
&parse_state_info,
report_symbol_name,
);
}
Ok((
parse_table,
main_lex_table,
@ -197,6 +210,43 @@ fn populate_used_symbols(
}
}
/// Assign each parse state an `external_lex_state_id` identifying the set of
/// external tokens that are valid in that state. The distinct token sets are
/// deduplicated into `parse_table.external_lex_states`, and each state stores
/// an index into that list.
fn populate_external_lex_states(parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar) {
    // Map each internal token's index to the index of the external token
    // that corresponds to it, so states that expect the internal token also
    // enable the external one.
    let mut external_tokens_by_corresponding_internal_token = HashMap::new();
    for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
        if let Some(symbol) = external_token.corresponding_internal_token {
            external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
        }
    }
    // Ensure that external lex state 0 represents the absence of any
    // external tokens.
    parse_table.external_lex_states.push(TokenSet::new());
    for i in 0..parse_table.states.len() {
        // Collect every external token that is valid in this state, either
        // directly or via a corresponding internal terminal.
        let mut external_tokens = TokenSet::new();
        for token in parse_table.states[i].terminal_entries.keys() {
            if token.is_external() {
                external_tokens.insert(*token);
            } else if token.is_terminal() {
                if let Some(index) =
                    external_tokens_by_corresponding_internal_token.get(&token.index)
                {
                    external_tokens.insert(Symbol::external(*index));
                }
            }
        }
        // Reuse an existing external lex state with the same token set, or
        // append a new one (the closure pushes and returns its index).
        parse_table.states[i].external_lex_state_id = parse_table
            .external_lex_states
            .iter()
            .position(|tokens| *tokens == external_tokens)
            .unwrap_or_else(|| {
                parse_table.external_lex_states.push(external_tokens);
                parse_table.external_lex_states.len() - 1
            });
    }
}
fn identify_keywords(
lexical_grammar: &LexicalGrammar,
parse_table: &ParseTable,
@ -333,6 +383,90 @@ fn mark_fragile_tokens(
}
}
/// Print (to stderr) diagnostic information about the parse states:
/// first a table of how many states each grammar symbol participates in,
/// then — for the rule named by `report_symbol_name` (or all states when
/// it is `"*"`) — each state's index, id, preceding symbol sequence, and
/// item set.
fn report_state_info<'a>(
    syntax_grammar: &SyntaxGrammar,
    lexical_grammar: &LexicalGrammar,
    parse_table: &ParseTable,
    parse_state_info: &Vec<ParseStateInfo<'a>>,
    report_symbol_name: &'a str,
) {
    let mut all_state_indices = BTreeSet::new();
    let mut symbols_with_state_indices = (0..syntax_grammar.variables.len())
        .map(|i| (Symbol::non_terminal(i), BTreeSet::new()))
        .collect::<Vec<_>>();
    // For every state, record which non-terminal variables appear in its
    // (non-augmented) parse items.
    for (i, state) in parse_table.states.iter().enumerate() {
        all_state_indices.insert(i);
        let item_set = &parse_state_info[state.id];
        for (item, _) in item_set.1.entries.iter() {
            if !item.is_augmented() {
                symbols_with_state_indices[item.variable_index as usize]
                    .1
                    .insert(i);
            }
        }
    }
    // Sort symbols by descending state count (negated key).
    symbols_with_state_indices.sort_unstable_by_key(|(_, states)| -(states.len() as i32));
    // Column width for the name column of the summary table.
    let max_symbol_name_length = syntax_grammar
        .variables
        .iter()
        .map(|v| v.name.len())
        .max()
        .unwrap();
    for (symbol, states) in &symbols_with_state_indices {
        eprintln!(
            "{:width$}\t{}",
            syntax_grammar.variables[symbol.index].name,
            states.len(),
            width = max_symbol_name_length
        );
    }
    eprintln!("");
    // "*" selects every state; otherwise find the states for the named rule.
    let state_indices = if report_symbol_name == "*" {
        Some(&all_state_indices)
    } else {
        symbols_with_state_indices
            .iter()
            .find_map(|(symbol, state_indices)| {
                if syntax_grammar.variables[symbol.index].name == report_symbol_name {
                    Some(state_indices)
                } else {
                    None
                }
            })
    };
    if let Some(state_indices) = state_indices {
        // Group states with the same core together by sorting on core_id.
        let mut state_indices = state_indices.into_iter().cloned().collect::<Vec<_>>();
        state_indices.sort_unstable_by_key(|i| (parse_table.states[*i].core_id, *i));
        for state_index in state_indices {
            let id = parse_table.states[state_index].id;
            let (preceding_symbols, item_set) = &parse_state_info[id];
            eprintln!("state index: {}", state_index);
            eprintln!("state id: {}", id);
            eprint!("symbol sequence:");
            // Resolve each symbol's display name from the grammar that owns it.
            for symbol in preceding_symbols {
                let name = if symbol.is_terminal() {
                    &lexical_grammar.variables[symbol.index].name
                } else if symbol.is_external() {
                    &syntax_grammar.external_tokens[symbol.index].name
                } else {
                    &syntax_grammar.variables[symbol.index].name
                };
                eprint!(" {}", name);
            }
            eprintln!(
                "\nitems:\n{}",
                self::item::ParseItemSetDisplay(&item_set, syntax_grammar, lexical_grammar,),
            );
        }
    }
}
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
cursor.transition_chars().all(|(chars, is_sep)| {
if is_sep {

View file

@ -33,6 +33,16 @@ lazy_static! {
.unwrap();
}
const NEW_HEADER_PARTS: [&'static str; 2] = [
"
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;",
"
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
",
];
struct GeneratedParser {
c_code: String,
node_types_json: String,
@ -42,6 +52,8 @@ pub fn generate_parser_in_directory(
repo_path: &PathBuf,
grammar_path: Option<&str>,
properties_only: bool,
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<()> {
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
@ -102,11 +114,28 @@ pub fn generate_parser_in_directory(
lexical_grammar,
inlines,
simple_aliases,
next_abi,
report_symbol_name,
)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
if next_abi {
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
} else {
let mut header = tree_sitter::PARSER_HEADER.to_string();
for part in &NEW_HEADER_PARTS {
let pos = header
.find(part)
.expect("Missing expected part of parser.h header");
header.replace_range(pos..(pos + part.len()), "");
}
write_file(&header_path.join("parser.h"), header)?;
}
ensure_file(&repo_path.join("index.js"), || {
npm_files::index_js(&language_name)
})?;
@ -132,6 +161,8 @@ pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String
lexical_grammar,
inlines,
simple_aliases,
true,
None,
)?;
Ok((input_grammar.name, parser.c_code))
}
@ -142,6 +173,8 @@ fn generate_parser_for_grammar_with_opts(
lexical_grammar: LexicalGrammar,
inlines: InlinedProductionMap,
simple_aliases: AliasMap,
next_abi: bool,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?;
let node_types_json = node_types::generate_node_types_json(
@ -156,6 +189,7 @@ fn generate_parser_for_grammar_with_opts(
&simple_aliases,
&variable_info,
&inlines,
report_symbol_name,
)?;
let c_code = render_c_code(
name,
@ -166,6 +200,7 @@ fn generate_parser_for_grammar_with_opts(
syntax_grammar,
lexical_grammar,
simple_aliases,
next_abi,
);
Ok(GeneratedParser {
c_code,

View file

@ -1,14 +1,18 @@
use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType};
use super::nfa::CharacterSet;
use super::rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet};
use super::rules::{Alias, AliasMap, Symbol, SymbolType};
use super::tables::{
AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::cmp;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Write;
use std::mem::swap;
use tree_sitter::LANGUAGE_VERSION;
// Currently, the library supports a new ABI version that has not yet been
// stabilized, and the parser generation does not use it by default.
const STABLE_LANGUAGE_VERSION: usize = tree_sitter::LANGUAGE_VERSION - 1;
macro_rules! add {
($this: tt, $($arg: tt)*) => {{
@ -45,6 +49,8 @@ macro_rules! dedent {
};
}
const SMALL_STATE_THRESHOLD: usize = 48;
struct Generator {
buffer: String,
indent_level: usize,
@ -52,15 +58,17 @@ struct Generator {
parse_table: ParseTable,
main_lex_table: LexTable,
keyword_lex_table: LexTable,
large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
simple_aliases: AliasMap,
symbol_order: HashMap<Symbol, usize>,
symbol_ids: HashMap<Symbol, String>,
alias_ids: HashMap<Alias, String>,
external_scanner_states: Vec<TokenSet>,
alias_map: BTreeMap<Alias, Option<Symbol>>,
field_names: Vec<String>,
next_abi: bool,
}
impl Generator {
@ -148,6 +156,27 @@ impl Generator {
field_names.sort_unstable();
field_names.dedup();
self.field_names = field_names.into_iter().cloned().collect();
// If we are opting in to the new unstable language ABI, then use the concept of
// "small parse states". Otherwise, use the same representation for all parse
// states.
if self.next_abi {
let threshold = cmp::min(
SMALL_STATE_THRESHOLD,
self.parse_table.symbols.len() / 2 - 1,
);
self.large_state_count = self
.parse_table
.states
.iter()
.enumerate()
.take_while(|(i, s)| {
*i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold
})
.count();
} else {
self.large_state_count = self.parse_table.states.len();
}
}
fn add_includes(&mut self) {
@ -198,12 +227,26 @@ impl Generator {
})
.count();
add_line!(self, "#define LANGUAGE_VERSION {}", LANGUAGE_VERSION);
if self.next_abi {
add_line!(
self,
"#define LANGUAGE_VERSION {}",
tree_sitter::LANGUAGE_VERSION
);
} else {
add_line!(self, "#define LANGUAGE_VERSION {}", STABLE_LANGUAGE_VERSION);
}
add_line!(
self,
"#define STATE_COUNT {}",
self.parse_table.states.len()
);
if self.next_abi {
add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count);
}
add_line!(
self,
"#define SYMBOL_COUNT {}",
@ -232,9 +275,11 @@ impl Generator {
fn add_symbol_enum(&mut self) {
add_line!(self, "enum {{");
indent!(self);
self.symbol_order.insert(Symbol::end(), 0);
let mut i = 1;
for symbol in self.parse_table.symbols.iter() {
if *symbol != Symbol::end() {
self.symbol_order.insert(*symbol, i);
add_line!(self, "{} = {},", self.symbol_ids[&symbol], i);
i += 1;
}
@ -633,40 +678,16 @@ impl Generator {
}
fn add_lex_modes_list(&mut self) {
self.get_external_scanner_state_id(TokenSet::new());
let mut external_tokens_by_corresponding_internal_token = HashMap::new();
for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() {
if let Some(symbol) = external_token.corresponding_internal_token {
external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
}
}
add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
indent!(self);
for i in 0..self.parse_table.states.len() {
let mut external_tokens = TokenSet::new();
for token in self.parse_table.states[i].terminal_entries.keys() {
if token.is_external() {
external_tokens.insert(*token);
} else if token.is_terminal() {
if let Some(external_index) =
external_tokens_by_corresponding_internal_token.get(&token.index)
{
external_tokens.insert(Symbol::external(*external_index));
}
}
}
let external_state_id = self.get_external_scanner_state_id(external_tokens);
let state = &self.parse_table.states[i];
if external_state_id > 0 {
for (i, state) in self.parse_table.states.iter().enumerate() {
if state.external_lex_state_id > 0 {
add_line!(
self,
"[{}] = {{.lex_state = {}, .external_lex_state = {}}},",
i,
state.lex_state_id,
external_state_id
state.external_lex_state_id
);
} else {
add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id);
@ -720,14 +741,14 @@ impl Generator {
add_line!(
self,
"static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{",
self.external_scanner_states.len(),
self.parse_table.external_lex_states.len(),
);
indent!(self);
for i in 0..self.external_scanner_states.len() {
if !self.external_scanner_states[i].is_empty() {
for i in 0..self.parse_table.external_lex_states.len() {
if !self.parse_table.external_lex_states[i].is_empty() {
add_line!(self, "[{}] = {{", i);
indent!(self);
for token in self.external_scanner_states[i].iter() {
for token in self.parse_table.external_lex_states[i].iter() {
add_line!(
self,
"[{}] = true,",
@ -758,25 +779,42 @@ impl Generator {
add_line!(
self,
"static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{"
"static uint16_t ts_parse_table[{}][SYMBOL_COUNT] = {{",
if self.next_abi {
"LARGE_STATE_COUNT"
} else {
"STATE_COUNT"
}
);
indent!(self);
let mut terminal_entries = Vec::new();
let mut nonterminal_entries = Vec::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
for (i, state) in self
.parse_table
.states
.iter()
.enumerate()
.take(self.large_state_count)
{
add_line!(self, "[{}] = {{", i);
indent!(self);
terminal_entries.clear();
nonterminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
nonterminal_entries.extend(state.nonterminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| e.0);
nonterminal_entries.sort_unstable_by_key(|e| e.0);
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
nonterminal_entries.sort_unstable_by_key(|k| k.0);
add_line!(self, "[{}] = {{", i);
indent!(self);
for (symbol, state_id) in &nonterminal_entries {
add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id);
add_line!(
self,
"[{}] = STATE({}),",
self.symbol_ids[symbol],
*state_id
);
}
for (symbol, entry) in &terminal_entries {
@ -799,6 +837,59 @@ impl Generator {
add_line!(self, "}};");
add_line!(self, "");
if self.large_state_count < self.parse_table.states.len() {
add_line!(self, "static uint32_t ts_small_parse_table_map[] = {{");
indent!(self);
let mut index = 0;
for (i, state) in self
.parse_table
.states
.iter()
.enumerate()
.skip(self.large_state_count)
{
add_line!(self, "[SMALL_STATE({})] = {},", i, index);
index += 1 + 2 * state.symbol_count();
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
index = 0;
add_line!(self, "static uint16_t ts_small_parse_table[] = {{");
indent!(self);
for state in self.parse_table.states.iter().skip(self.large_state_count) {
add_line!(self, "[{}] = {},", index, state.symbol_count());
indent!(self);
terminal_entries.clear();
nonterminal_entries.clear();
terminal_entries.extend(state.terminal_entries.iter());
nonterminal_entries.extend(state.nonterminal_entries.iter());
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
nonterminal_entries.sort_unstable_by_key(|k| k.0);
for (symbol, entry) in &terminal_entries {
let entry_id = self.get_parse_action_list_id(
entry,
&mut parse_table_entries,
&mut next_parse_action_list_index,
);
add_line!(self, "{}, ACTIONS({}),", self.symbol_ids[symbol], entry_id);
}
for (symbol, state_id) in &nonterminal_entries {
add_line!(self, "{}, STATE({}),", self.symbol_ids[symbol], *state_id);
}
dedent!(self);
index += 1 + 2 * state.symbol_count();
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
self.add_parse_action_list(parse_table_entries);
}
@ -897,11 +988,28 @@ impl Generator {
add_line!(self, ".symbol_count = SYMBOL_COUNT,");
add_line!(self, ".alias_count = ALIAS_COUNT,");
add_line!(self, ".token_count = TOKEN_COUNT,");
if self.next_abi {
add_line!(self, ".large_state_count = LARGE_STATE_COUNT,");
}
add_line!(self, ".symbol_metadata = ts_symbol_metadata,");
add_line!(
self,
".parse_table = (const unsigned short *)ts_parse_table,"
);
if self.large_state_count < self.parse_table.states.len() {
add_line!(
self,
".small_parse_table = (const uint16_t *)ts_small_parse_table,"
);
add_line!(
self,
".small_parse_table_map = (const uint32_t *)ts_small_parse_table_map,"
);
}
add_line!(self, ".parse_actions = ts_parse_actions,");
add_line!(self, ".lex_modes = ts_lex_modes,");
add_line!(self, ".symbol_names = ts_symbol_names,");
@ -997,16 +1105,6 @@ impl Generator {
result
}
/// Return the id of the external scanner state matching `external_tokens`,
/// registering a new state at the end of the list if no existing one matches.
fn get_external_scanner_state_id(&mut self, external_tokens: TokenSet) -> usize {
    // Reuse an existing state with an identical token set, if there is one.
    for (existing_id, tokens) in self.external_scanner_states.iter().enumerate() {
        if *tokens == external_tokens {
            return existing_id;
        }
    }
    // Otherwise, append a new state; its id is its position in the list.
    self.external_scanner_states.push(external_tokens);
    self.external_scanner_states.len() - 1
}
fn external_token_id(&self, token: &ExternalToken) -> String {
format!(
"ts_external_token_{}",
@ -1152,6 +1250,23 @@ impl Generator {
}
}
/// Returns a String of C code for the given components of a parser.
///
/// # Arguments
///
/// * `name` - A string slice containing the name of the language
/// * `parse_table` - The generated parse table for the language
/// * `main_lex_table` - The generated lexing table for the language
/// * `keyword_lex_table` - The generated keyword lexing table for the language
/// * `keyword_capture_token` - A symbol indicating which token is used
/// for keyword capture, if any.
/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar
/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar
/// * `simple_aliases` - A map describing the global rename rules that should apply.
/// the keys are symbols that are *always* aliased in the same way, and the values
/// are the aliases that are applied to those symbols.
/// * `next_abi` - A boolean indicating whether to opt into the new, unstable parse
/// table format. This is mainly used for testing, when developing Tree-sitter itself.
pub(crate) fn render_c_code(
name: &str,
parse_table: ParseTable,
@ -1161,11 +1276,13 @@ pub(crate) fn render_c_code(
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
simple_aliases: AliasMap,
next_abi: bool,
) -> String {
Generator {
buffer: String::new(),
indent_level: 0,
language_name: name.to_string(),
large_state_count: 0,
parse_table,
main_lex_table,
keyword_lex_table,
@ -1174,10 +1291,11 @@ pub(crate) fn render_c_code(
lexical_grammar,
simple_aliases,
symbol_ids: HashMap::new(),
symbol_order: HashMap::new(),
alias_ids: HashMap::new(),
external_scanner_states: Vec::new(),
alias_map: BTreeMap::new(),
field_names: Vec::new(),
next_abi,
}
.generate()
}

View file

@ -1,7 +1,6 @@
use super::nfa::CharacterSet;
use super::rules::{Alias, Associativity, Symbol};
use super::rules::{Alias, Associativity, Symbol, TokenSet};
use std::collections::{BTreeMap, HashMap};
pub(crate) type ProductionInfoId = usize;
pub(crate) type ParseStateId = usize;
pub(crate) type LexStateId = usize;
@ -37,6 +36,7 @@ pub(crate) struct ParseState {
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
pub lex_state_id: usize,
pub external_lex_state_id: usize,
pub core_id: usize,
}
@ -58,6 +58,7 @@ pub(crate) struct ParseTable {
pub symbols: Vec<Symbol>,
pub production_infos: Vec<ProductionInfo>,
pub max_aliased_production_length: usize,
pub external_lex_states: Vec<TokenSet>,
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -93,6 +94,10 @@ impl Default for LexTable {
}
impl ParseState {
// Total number of symbols (terminal and nonterminal) that have
// entries in this parse state.
pub fn symbol_count(&self) -> usize {
    self.terminal_entries.len() + self.nonterminal_entries.len()
}
pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
self.terminal_entries
.iter()

View file

@ -38,7 +38,14 @@ fn run() -> error::Result<()> {
.about("Generate a parser")
.arg(Arg::with_name("grammar-path").index(1))
.arg(Arg::with_name("log").long("log"))
.arg(Arg::with_name("next-abi").long("next-abi"))
.arg(Arg::with_name("properties-only").long("properties"))
.arg(
Arg::with_name("report-states-for-rule")
.long("report-states-for-rule")
.value_name("rule-name")
.takes_value(true),
)
.arg(Arg::with_name("no-minimize").long("no-minimize")),
)
.subcommand(
@ -121,10 +128,24 @@ fn run() -> error::Result<()> {
} else if let Some(matches) = matches.subcommand_matches("generate") {
let grammar_path = matches.value_of("grammar-path");
let properties_only = matches.is_present("properties-only");
let report_symbol_name = matches.value_of("report-states-for-rule").or_else(|| {
if matches.is_present("report-states") {
Some("")
} else {
None
}
});
if matches.is_present("log") {
logger::init();
}
generate::generate_parser_in_directory(&current_dir, grammar_path, properties_only)?;
let next_abi = matches.is_present("next-abi");
generate::generate_parser_in_directory(
&current_dir,
grammar_path,
properties_only,
next_abi,
report_symbol_name,
)?;
} else if let Some(matches) = matches.subcommand_matches("test") {
let debug = matches.is_present("debug");
let debug_graph = matches.is_present("debug-graph");

View file

@ -591,5 +591,5 @@ extern "C" {
pub fn ts_language_version(arg1: *const TSLanguage) -> u32;
}
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 10;
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 11;
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 9;

View file

@ -14,7 +14,7 @@ extern "C" {
/* Section - ABI Versioning */
/****************************/
#define TREE_SITTER_LANGUAGE_VERSION 10
#define TREE_SITTER_LANGUAGE_VERSION 11
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
/*******************/

View file

@ -114,6 +114,9 @@ struct TSLanguage {
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const char **field_names;
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
};
/*
@ -155,6 +158,8 @@ struct TSLanguage {
* Parse Table Macros
*/
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
#define STATE(id) id
#define ACTIONS(id) id

View file

@ -11,7 +11,7 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state,
result->actions = NULL;
} else {
assert(symbol < self->token_count);
uint32_t action_index = self->parse_table[state * self->symbol_count + symbol];
uint32_t action_index = ts_language_lookup(self, state, symbol);
const TSParseActionEntry *entry = &self->parse_actions[action_index];
result->action_count = entry->count;
result->is_reusable = entry->reusable;

View file

@ -10,6 +10,7 @@ extern "C" {
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
typedef struct {
const TSParseAction *actions;
@ -51,6 +52,30 @@ static inline bool ts_language_has_reduce_action(const TSLanguage *self,
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
}
/* Look up the parse-table value for (state, symbol). Large states use the
 * dense two-dimensional `parse_table`; small states (only present in
 * languages at or above the small-state ABI version) use a compact
 * sparse encoding instead. */
static inline uint16_t ts_language_lookup(
  const TSLanguage *self,
  TSStateId state,
  TSSymbol symbol
) {
  if (
    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
    state >= self->large_state_count
  ) {
    /* small_parse_table_map gives the offset of this state's data within
     * small_parse_table. The data is: a symbol count, followed by
     * (symbol, value) pairs. */
    uint32_t index = self->small_parse_table_map[state - self->large_state_count];
    const uint16_t *state_data = &self->small_parse_table[index];
    uint16_t symbol_count = *state_data;
    state_data++;
    /* Linear scan of the pairs; the early break relies on the pairs being
     * stored in ascending symbol order. */
    for (unsigned i = 0; i < symbol_count; i++) {
      if (state_data[0] == symbol) return state_data[1];
      if (state_data[0] > symbol) break;
      state_data += 2;
    }
    /* No entry for this symbol in this state. */
    return 0;
  } else {
    return self->parse_table[state * self->symbol_count + symbol];
  }
}
static inline TSStateId ts_language_next_state(const TSLanguage *self,
TSStateId state,
TSSymbol symbol) {
@ -67,7 +92,7 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self,
}
return 0;
} else {
return self->parse_table[state * self->symbol_count + symbol];
return ts_language_lookup(self, state, symbol);
}
}