From 759c1d6e6503fcb87df536bde9b6955fa7d0cab9 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Thu, 16 May 2019 15:22:49 -0700
Subject: [PATCH 1/6] Reorder parse states by descending symbol count

---
 .../build_tables/minimize_parse_table.rs | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs
index db7e3961..5ecde0fd 100644
--- a/cli/src/generate/build_tables/minimize_parse_table.rs
+++ b/cli/src/generate/build_tables/minimize_parse_table.rs
@@ -26,6 +26,7 @@ pub(crate) fn minimize_parse_table(
     minimizer.merge_compatible_states();
     minimizer.remove_unit_reductions();
     minimizer.remove_unused_states();
+    minimizer.reorder_states_by_descending_size();
 }
 
 struct Minimizer<'a> {
@@ -454,4 +455,37 @@ impl<'a> Minimizer<'a> {
             original_state_id += 1;
         }
     }
+
+    fn reorder_states_by_descending_size(&mut self) {
+        // Get a mapping of old state index -> new state index
+        let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::<Vec<usize>>();
+        old_ids_by_new_id.sort_unstable_by_key(|i| {
+            // Don't change states 0 (the error state) or 1 (the start state).
+            if *i <= 1 {
+                return *i as i64 - 1_000_000;
+            }
+
+            // Reorder all the other states by descending symbol count.
+            let state = &self.parse_table.states[*i];
+            -((state.terminal_entries.len() + state.nonterminal_entries.len()) as i64)
+        });
+
+        // Get the inverse mapping
+        let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
+        for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
+            new_ids_by_old_id[*old_id] = id;
+        }
+
+        // Reorder the parse states and update their references to reflect
+        // the new ordering.
+        self.parse_table.states = old_ids_by_new_id
+            .iter()
+            .map(|old_id| {
+                let mut state = ParseState::default();
+                mem::swap(&mut state, &mut self.parse_table.states[*old_id]);
+                state.update_referenced_states(|id, _| new_ids_by_old_id[id]);
+                state
+            })
+            .collect();
+    }
 }

From 48a883c1d4f9a49161bd19564252773aa283d636 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Thu, 16 May 2019 16:27:05 -0700
Subject: [PATCH 2/6] Move external token state id computation out of render module

---
 .../build_tables/build_parse_table.rs |  2 +
 cli/src/generate/build_tables/mod.rs  | 39 ++++++++++++++
 cli/src/generate/render.rs            | 52 +++----------------
 cli/src/generate/tables.rs            |  5 +-
 4 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs
index 8c13ae14..7b26892e 100644
--- a/cli/src/generate/build_tables/build_parse_table.rs
+++ b/cli/src/generate/build_tables/build_parse_table.rs
@@ -108,6 +108,7 @@ impl<'a> ParseTableBuilder<'a> {
         self.parse_table.states.push(ParseState {
             id: state_id,
             lex_state_id: 0,
+            external_lex_state_id: 0,
             terminal_entries: HashMap::new(),
             nonterminal_entries: HashMap::new(),
             core_id,
@@ -777,6 +778,7 @@ pub(crate) fn build_parse_table(
         parse_table: ParseTable {
             states: Vec::new(),
             symbols: Vec::new(),
+            external_lex_states: Vec::new(),
             production_infos: Vec::new(),
             max_aliased_production_length: 1,
         },
diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs
index 2c3f47fb..af9483eb 100644
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@@ -18,6 +18,7 @@ use crate::generate::node_types::VariableInfo;
 use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
 use
crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; use log::info; +use std::collections::HashMap; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -62,6 +63,7 @@ pub(crate) fn build_tables( &coincident_token_index, &token_conflict_map, ); + populate_external_lex_states(&mut parse_table, syntax_grammar); mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); Ok(( parse_table, @@ -197,6 +199,43 @@ fn populate_used_symbols( } } +fn populate_external_lex_states(parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar) { + let mut external_tokens_by_corresponding_internal_token = HashMap::new(); + for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() { + if let Some(symbol) = external_token.corresponding_internal_token { + external_tokens_by_corresponding_internal_token.insert(symbol.index, i); + } + } + + // Ensure that external lex state 0 represents the absence of any + // external tokens. + parse_table.external_lex_states.push(TokenSet::new()); + + for i in 0..parse_table.states.len() { + let mut external_tokens = TokenSet::new(); + for token in parse_table.states[i].terminal_entries.keys() { + if token.is_external() { + external_tokens.insert(*token); + } else if token.is_terminal() { + if let Some(index) = + external_tokens_by_corresponding_internal_token.get(&token.index) + { + external_tokens.insert(Symbol::external(*index)); + } + } + } + + parse_table.states[i].external_lex_state_id = parse_table + .external_lex_states + .iter() + .position(|tokens| *tokens == external_tokens) + .unwrap_or_else(|| { + parse_table.external_lex_states.push(external_tokens); + parse_table.external_lex_states.len() - 1 + }); + } +} + fn identify_keywords( lexical_grammar: &LexicalGrammar, parse_table: &ParseTable, diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 7d6976c4..cb4b6fda 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1,6 +1,6 @@ use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; use super::nfa::CharacterSet; -use super::rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet}; +use super::rules::{Alias, AliasMap, Symbol, SymbolType}; use super::tables::{ AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry, }; @@ -58,7 +58,6 @@ struct Generator { simple_aliases: AliasMap, symbol_ids: HashMap, alias_ids: HashMap, - external_scanner_states: Vec, alias_map: BTreeMap>, field_names: Vec, } @@ -633,40 +632,16 @@ impl Generator { } fn add_lex_modes_list(&mut self) { - self.get_external_scanner_state_id(TokenSet::new()); - - let mut external_tokens_by_corresponding_internal_token = HashMap::new(); - for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() { - if let Some(symbol) = external_token.corresponding_internal_token { - external_tokens_by_corresponding_internal_token.insert(symbol.index, i); - } - } - add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{"); indent!(self); - for i in 0..self.parse_table.states.len() { - let mut external_tokens = TokenSet::new(); - for token in self.parse_table.states[i].terminal_entries.keys() { - if token.is_external() { - external_tokens.insert(*token); - } else if token.is_terminal() { - if let Some(external_index) = - external_tokens_by_corresponding_internal_token.get(&token.index) - { - external_tokens.insert(Symbol::external(*external_index)); - } - } - } - - let external_state_id = 
self.get_external_scanner_state_id(external_tokens); - let state = &self.parse_table.states[i]; - if external_state_id > 0 { + for (i, state) in self.parse_table.states.iter().enumerate() { + if state.external_lex_state_id > 0 { add_line!( self, "[{}] = {{.lex_state = {}, .external_lex_state = {}}},", i, state.lex_state_id, - external_state_id + state.external_lex_state_id ); } else { add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id); @@ -720,14 +695,14 @@ impl Generator { add_line!( self, "static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{", - self.external_scanner_states.len(), + self.parse_table.external_lex_states.len(), ); indent!(self); - for i in 0..self.external_scanner_states.len() { - if !self.external_scanner_states[i].is_empty() { + for i in 0..self.parse_table.external_lex_states.len() { + if !self.parse_table.external_lex_states[i].is_empty() { add_line!(self, "[{}] = {{", i); indent!(self); - for token in self.external_scanner_states[i].iter() { + for token in self.parse_table.external_lex_states[i].iter() { add_line!( self, "[{}] = true,", @@ -997,16 +972,6 @@ impl Generator { result } - fn get_external_scanner_state_id(&mut self, external_tokens: TokenSet) -> usize { - self.external_scanner_states - .iter() - .position(|tokens| *tokens == external_tokens) - .unwrap_or_else(|| { - self.external_scanner_states.push(external_tokens); - self.external_scanner_states.len() - 1 - }) - } - fn external_token_id(&self, token: &ExternalToken) -> String { format!( "ts_external_token_{}", @@ -1175,7 +1140,6 @@ pub(crate) fn render_c_code( simple_aliases, symbol_ids: HashMap::new(), alias_ids: HashMap::new(), - external_scanner_states: Vec::new(), alias_map: BTreeMap::new(), field_names: Vec::new(), } diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index 81b31493..fb593953 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -1,7 +1,6 @@ use super::nfa::CharacterSet; -use super::rules::{Alias, Associativity, Symbol}; +use super::rules::{Alias, Associativity, Symbol, TokenSet}; use std::collections::{BTreeMap, HashMap}; - pub(crate) type ProductionInfoId = usize; pub(crate) type ParseStateId = usize; pub(crate) type LexStateId = usize; @@ -37,6 +36,7 @@ pub(crate) struct ParseState { pub terminal_entries: HashMap, pub nonterminal_entries: HashMap, pub lex_state_id: usize, + pub external_lex_state_id: usize, pub core_id: usize, } @@ -58,6 +58,7 @@ pub(crate) struct ParseTable { pub symbols: Vec, pub production_infos: Vec, pub max_aliased_production_length: usize, + pub external_lex_states: Vec, } #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] From 09a2755399ce574f527ece49f27b779fb07ba37d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 16 May 2019 16:59:50 -0700 Subject: [PATCH 3/6] Store parse states with few lookahead symbols in a more compact way --- cli/src/generate/render.rs | 104 ++++++++++++++++++++++++++++--- cli/src/generate/tables.rs | 4 ++ lib/binding_rust/bindings.rs | 2 +- lib/include/tree_sitter/api.h | 2 +- lib/include/tree_sitter/parser.h | 5 ++ lib/src/language.c | 2 +- lib/src/language.h | 27 +++++++- 7 files changed, 135 insertions(+), 11 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index cb4b6fda..bad1d290 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -45,6 +45,8 @@ macro_rules! 
dedent { }; } +const SMALL_STATE_THRESHOLD: usize = 48; + struct Generator { buffer: String, indent_level: usize, @@ -52,10 +54,12 @@ struct Generator { parse_table: ParseTable, main_lex_table: LexTable, keyword_lex_table: LexTable, + large_state_count: usize, keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, + symbol_order: HashMap, symbol_ids: HashMap, alias_ids: HashMap, alias_map: BTreeMap>, @@ -144,6 +148,15 @@ impl Generator { } } + self.large_state_count = self + .parse_table + .states + .iter() + .take_while(|s| { + s.terminal_entries.len() + s.nonterminal_entries.len() > SMALL_STATE_THRESHOLD + }) + .count(); + field_names.sort_unstable(); field_names.dedup(); self.field_names = field_names.into_iter().cloned().collect(); @@ -203,6 +216,7 @@ impl Generator { "#define STATE_COUNT {}", self.parse_table.states.len() ); + add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count); add_line!( self, "#define SYMBOL_COUNT {}", @@ -231,9 +245,11 @@ impl Generator { fn add_symbol_enum(&mut self) { add_line!(self, "enum {{"); indent!(self); + self.symbol_order.insert(Symbol::end(), 0); let mut i = 1; for symbol in self.parse_table.symbols.iter() { if *symbol != Symbol::end() { + self.symbol_order.insert(*symbol, i); add_line!(self, "{} = {},", self.symbol_ids[&symbol], i); i += 1; } @@ -733,25 +749,37 @@ impl Generator { add_line!( self, - "static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{" + "static uint16_t ts_parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = {{" ); indent!(self); let mut terminal_entries = Vec::new(); let mut nonterminal_entries = Vec::new(); - for (i, state) in self.parse_table.states.iter().enumerate() { + for (i, state) in self + .parse_table + .states + .iter() + .enumerate() + .take(self.large_state_count) + { + add_line!(self, "[{}] = {{", i); + indent!(self); + terminal_entries.clear(); nonterminal_entries.clear(); terminal_entries.extend(state.terminal_entries.iter()); nonterminal_entries.extend(state.nonterminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| e.0); - nonterminal_entries.sort_unstable_by_key(|e| e.0); + terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + nonterminal_entries.sort_unstable_by_key(|k| k.0); - add_line!(self, "[{}] = {{", i); - indent!(self); for (symbol, state_id) in &nonterminal_entries { - add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id); + add_line!( + self, + "[{}] = STATE({}),", + self.symbol_ids[symbol], + *state_id + ); } for (symbol, entry) in &terminal_entries { @@ -774,6 +802,57 @@ impl Generator { add_line!(self, "}};"); add_line!(self, ""); + add_line!(self, "static uint32_t ts_small_parse_table_map[] = {{"); + indent!(self); + let mut index = 0; + for (i, state) in self + .parse_table + .states + .iter() + .enumerate() + .skip(self.large_state_count) + { + add_line!(self, "[SMALL_STATE({})] = {},", i, index); + index += 1 + 2 * state.symbol_count(); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + + index = 0; + add_line!(self, "static uint16_t ts_small_parse_table[] = {{"); + indent!(self); + for state in self.parse_table.states.iter().skip(self.large_state_count) { + add_line!(self, "[{}] = {},", index, state.symbol_count()); + indent!(self); + + terminal_entries.clear(); + nonterminal_entries.clear(); + terminal_entries.extend(state.terminal_entries.iter()); + nonterminal_entries.extend(state.nonterminal_entries.iter()); + 
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + nonterminal_entries.sort_unstable_by_key(|k| k.0); + + for (symbol, entry) in &terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!(self, "{}, ACTIONS({}),", self.symbol_ids[symbol], entry_id); + } + + for (symbol, state_id) in &nonterminal_entries { + add_line!(self, "{}, STATE({}),", self.symbol_ids[symbol], *state_id); + } + dedent!(self); + + index += 1 + 2 * state.symbol_count(); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + self.add_parse_action_list(parse_table_entries); } @@ -872,11 +951,20 @@ impl Generator { add_line!(self, ".symbol_count = SYMBOL_COUNT,"); add_line!(self, ".alias_count = ALIAS_COUNT,"); add_line!(self, ".token_count = TOKEN_COUNT,"); + add_line!(self, ".large_state_count = LARGE_STATE_COUNT,"); add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); add_line!( self, ".parse_table = (const unsigned short *)ts_parse_table," ); + add_line!( + self, + ".small_parse_table = (const uint16_t *)ts_small_parse_table," + ); + add_line!( + self, + ".small_parse_table_map = (const uint32_t *)ts_small_parse_table_map," + ); add_line!(self, ".parse_actions = ts_parse_actions,"); add_line!(self, ".lex_modes = ts_lex_modes,"); add_line!(self, ".symbol_names = ts_symbol_names,"); @@ -1131,6 +1219,7 @@ pub(crate) fn render_c_code( buffer: String::new(), indent_level: 0, language_name: name.to_string(), + large_state_count: 0, parse_table, main_lex_table, keyword_lex_table, @@ -1139,6 +1228,7 @@ pub(crate) fn render_c_code( lexical_grammar, simple_aliases, symbol_ids: HashMap::new(), + symbol_order: HashMap::new(), alias_ids: HashMap::new(), alias_map: BTreeMap::new(), field_names: Vec::new(), diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index fb593953..8a8cc089 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -94,6 +94,10 @@ impl Default for LexTable { } impl ParseState { + pub fn symbol_count(&self) -> usize { + self.terminal_entries.len() + self.nonterminal_entries.len() + } + pub fn referenced_states<'a>(&'a self) -> impl Iterator + 'a { self.terminal_entries .iter() diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index c205aeb4..a71b297e 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -591,5 +591,5 @@ extern "C" { pub fn ts_language_version(arg1: *const TSLanguage) -> u32; } -pub const TREE_SITTER_LANGUAGE_VERSION: usize = 10; +pub const TREE_SITTER_LANGUAGE_VERSION: usize = 11; pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 9; diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index 9375db8b..d39d0521 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -14,7 +14,7 @@ extern "C" { /* Section - ABI Versioning */ /****************************/ -#define TREE_SITTER_LANGUAGE_VERSION 10 +#define TREE_SITTER_LANGUAGE_VERSION 11 #define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9 /*******************/ diff --git a/lib/include/tree_sitter/parser.h b/lib/include/tree_sitter/parser.h index a8ee20b6..974a7ca5 100644 --- a/lib/include/tree_sitter/parser.h +++ b/lib/include/tree_sitter/parser.h @@ -114,6 +114,9 @@ struct TSLanguage { const TSFieldMapSlice *field_map_slices; const TSFieldMapEntry *field_map_entries; const char **field_names; + uint32_t large_state_count; + const uint16_t *small_parse_table; + 
const uint32_t *small_parse_table_map; }; /* @@ -155,6 +158,8 @@ struct TSLanguage { * Parse Table Macros */ +#define SMALL_STATE(id) id - LARGE_STATE_COUNT + #define STATE(id) id #define ACTIONS(id) id diff --git a/lib/src/language.c b/lib/src/language.c index ebb47d06..1bfb1a8d 100644 --- a/lib/src/language.c +++ b/lib/src/language.c @@ -11,7 +11,7 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state, result->actions = NULL; } else { assert(symbol < self->token_count); - uint32_t action_index = self->parse_table[state * self->symbol_count + symbol]; + uint32_t action_index = ts_language_lookup(self, state, symbol); const TSParseActionEntry *entry = &self->parse_actions[action_index]; result->action_count = entry->count; result->is_reusable = entry->reusable; diff --git a/lib/src/language.h b/lib/src/language.h index 16e74790..de33d2d7 100644 --- a/lib/src/language.h +++ b/lib/src/language.h @@ -10,6 +10,7 @@ extern "C" { #define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) #define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10 +#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11 typedef struct { const TSParseAction *actions; @@ -51,6 +52,30 @@ static inline bool ts_language_has_reduce_action(const TSLanguage *self, return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; } +static inline uint16_t ts_language_lookup( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + if ( + self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES && + state >= self->large_state_count + ) { + uint32_t index = self->small_parse_table_map[state - self->large_state_count]; + const uint16_t *state_data = &self->small_parse_table[index]; + uint16_t symbol_count = *state_data; + state_data++; + for (unsigned i = 0; i < symbol_count; i++) { + if (state_data[0] == symbol) return state_data[1]; + if (state_data[0] > symbol) break; + state_data += 2; + } + return 0; + } else { + return self->parse_table[state * self->symbol_count + symbol]; + } +} + static inline TSStateId ts_language_next_state(const TSLanguage *self, TSStateId state, TSSymbol symbol) { @@ -67,7 +92,7 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self, } return 0; } else { - return self->parse_table[state * self->symbol_count + symbol]; + return ts_language_lookup(self, state, symbol); } } From 82ff542d3b9df02cee4d9550aa1257d622f67ce3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 16 May 2019 17:19:44 -0700 Subject: [PATCH 4/6] Appease MSVC by avoiding empty arrays --- cli/src/generate/render.rs | 122 ++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 55 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index bad1d290..e806564c 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -5,6 +5,7 @@ use super::tables::{ AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry, }; use core::ops::Range; +use std::cmp; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -148,12 +149,17 @@ impl Generator { } } + let threshold = cmp::min( + SMALL_STATE_THRESHOLD, + self.parse_table.symbols.len() / 2 - 1, + ); self.large_state_count = self .parse_table .states .iter() - .take_while(|s| { - s.terminal_entries.len() + s.nonterminal_entries.len() > SMALL_STATE_THRESHOLD + .enumerate() + .take_while(|(i, s)| { + *i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold }) .count(); 
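
A rough cost check for the threshold above: a row in ts_parse_table always occupies SYMBOL_COUNT uint16_t slots, while the compact encoding added in this series occupies 1 + 2*n uint16_t slots for a state with n populated symbols, plus one uint32_t entry in ts_small_parse_table_map. The compact form therefore only pays off when roughly 2*n + 1 < SYMBOL_COUNT, i.e. n <= SYMBOL_COUNT/2 - 1, which appears to be the reason for the symbols.len() / 2 - 1 term; the fixed SMALL_STATE_THRESHOLD of 48 additionally bounds the linear scan the runtime performs for small states, and states 0 and 1 (the error and start states) always stay in the dense table.
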
@@ -802,56 +808,58 @@ impl Generator { add_line!(self, "}};"); add_line!(self, ""); - add_line!(self, "static uint32_t ts_small_parse_table_map[] = {{"); - indent!(self); - let mut index = 0; - for (i, state) in self - .parse_table - .states - .iter() - .enumerate() - .skip(self.large_state_count) - { - add_line!(self, "[SMALL_STATE({})] = {},", i, index); - index += 1 + 2 * state.symbol_count(); - } - dedent!(self); - add_line!(self, "}};"); - add_line!(self, ""); - - index = 0; - add_line!(self, "static uint16_t ts_small_parse_table[] = {{"); - indent!(self); - for state in self.parse_table.states.iter().skip(self.large_state_count) { - add_line!(self, "[{}] = {},", index, state.symbol_count()); + if self.large_state_count < self.parse_table.states.len() { + add_line!(self, "static uint32_t ts_small_parse_table_map[] = {{"); indent!(self); - - terminal_entries.clear(); - nonterminal_entries.clear(); - terminal_entries.extend(state.terminal_entries.iter()); - nonterminal_entries.extend(state.nonterminal_entries.iter()); - terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); - nonterminal_entries.sort_unstable_by_key(|k| k.0); - - for (symbol, entry) in &terminal_entries { - let entry_id = self.get_parse_action_list_id( - entry, - &mut parse_table_entries, - &mut next_parse_action_list_index, - ); - add_line!(self, "{}, ACTIONS({}),", self.symbol_ids[symbol], entry_id); - } - - for (symbol, state_id) in &nonterminal_entries { - add_line!(self, "{}, STATE({}),", self.symbol_ids[symbol], *state_id); + let mut index = 0; + for (i, state) in self + .parse_table + .states + .iter() + .enumerate() + .skip(self.large_state_count) + { + add_line!(self, "[SMALL_STATE({})] = {},", i, index); + index += 1 + 2 * state.symbol_count(); } dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); - index += 1 + 2 * state.symbol_count(); + index = 0; + add_line!(self, "static uint16_t ts_small_parse_table[] = {{"); + indent!(self); + for state in self.parse_table.states.iter().skip(self.large_state_count) { + add_line!(self, "[{}] = {},", index, state.symbol_count()); + indent!(self); + + terminal_entries.clear(); + nonterminal_entries.clear(); + terminal_entries.extend(state.terminal_entries.iter()); + nonterminal_entries.extend(state.nonterminal_entries.iter()); + terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0)); + nonterminal_entries.sort_unstable_by_key(|k| k.0); + + for (symbol, entry) in &terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!(self, "{}, ACTIONS({}),", self.symbol_ids[symbol], entry_id); + } + + for (symbol, state_id) in &nonterminal_entries { + add_line!(self, "{}, STATE({}),", self.symbol_ids[symbol], *state_id); + } + dedent!(self); + + index += 1 + 2 * state.symbol_count(); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); } - dedent!(self); - add_line!(self, "}};"); - add_line!(self, ""); self.add_parse_action_list(parse_table_entries); } @@ -957,14 +965,18 @@ impl Generator { self, ".parse_table = (const unsigned short *)ts_parse_table," ); - add_line!( - self, - ".small_parse_table = (const uint16_t *)ts_small_parse_table," - ); - add_line!( - self, - ".small_parse_table_map = (const uint32_t *)ts_small_parse_table_map," - ); + + if self.large_state_count < self.parse_table.states.len() { + add_line!( + self, + ".small_parse_table = (const uint16_t *)ts_small_parse_table," + ); + add_line!( + self, + 
".small_parse_table_map = (const uint32_t *)ts_small_parse_table_map," + ); + } + add_line!(self, ".parse_actions = ts_parse_actions,"); add_line!(self, ".lex_modes = ts_lex_modes,"); add_line!(self, ".symbol_names = ts_symbol_names,"); From aeb2f895b458f053015425d7be6eeb8a66523458 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 20 May 2019 13:25:01 -0700 Subject: [PATCH 5/6] Add `--report-states` flag for reporting state counts for each rule --- .../build_tables/build_parse_table.rs | 39 +++---- cli/src/generate/build_tables/item.rs | 37 ++++--- cli/src/generate/build_tables/mod.rs | 101 +++++++++++++++++- cli/src/generate/mod.rs | 5 + cli/src/main.rs | 20 +++- 5 files changed, 165 insertions(+), 37 deletions(-) diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index 7b26892e..41d3932c 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -25,10 +25,11 @@ struct AuxiliarySymbolInfo { type SymbolSequence = Vec; type AuxiliarySymbolSequence = Vec; +pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>); + struct ParseStateQueueEntry { - preceding_symbols: SymbolSequence, - preceding_auxiliary_symbols: AuxiliarySymbolSequence, state_id: ParseStateId, + preceding_auxiliary_symbols: AuxiliarySymbolSequence, } struct ParseTableBuilder<'a> { @@ -38,13 +39,13 @@ struct ParseTableBuilder<'a> { variable_info: &'a Vec, core_ids_by_core: HashMap, usize>, state_ids_by_item_set: HashMap, ParseStateId>, - item_sets_by_state_id: Vec>, + parse_state_info_by_id: Vec>, parse_state_queue: VecDeque, parse_table: ParseTable, } impl<'a> ParseTableBuilder<'a> { - fn build(mut self) -> Result { + fn build(mut self) -> Result<(ParseTable, Vec>)> { // Ensure that the empty alias sequence has index 0. 
self.parse_table .production_infos @@ -70,9 +71,10 @@ impl<'a> ParseTableBuilder<'a> { while let Some(entry) = self.parse_state_queue.pop_front() { let item_set = self .item_set_builder - .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); + .transitive_closure(&self.parse_state_info_by_id[entry.state_id].1); + self.add_actions( - entry.preceding_symbols, + self.parse_state_info_by_id[entry.state_id].0.clone(), entry.preceding_auxiliary_symbols, entry.state_id, item_set, @@ -81,7 +83,7 @@ impl<'a> ParseTableBuilder<'a> { self.remove_precedences(); - Ok(self.parse_table) + Ok((self.parse_table, self.parse_state_info_by_id)) } fn add_parse_state( @@ -104,7 +106,9 @@ impl<'a> ParseTableBuilder<'a> { }; let state_id = self.parse_table.states.len(); - self.item_sets_by_state_id.push(v.key().clone()); + self.parse_state_info_by_id + .push((preceding_symbols.clone(), v.key().clone())); + self.parse_table.states.push(ParseState { id: state_id, lex_state_id: 0, @@ -115,7 +119,6 @@ impl<'a> ParseTableBuilder<'a> { }); self.parse_state_queue.push_back(ParseStateQueueEntry { state_id, - preceding_symbols: preceding_symbols.clone(), preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(), }); v.insert(state_id); @@ -751,12 +754,12 @@ fn populate_following_tokens( } } -pub(crate) fn build_parse_table( - syntax_grammar: &SyntaxGrammar, - lexical_grammar: &LexicalGrammar, - inlines: &InlinedProductionMap, - variable_info: &Vec, -) -> Result<(ParseTable, Vec)> { +pub(crate) fn build_parse_table<'a>( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + inlines: &'a InlinedProductionMap, + variable_info: &'a Vec, +) -> Result<(ParseTable, Vec, Vec>)> { let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines); let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()]; populate_following_tokens( @@ -766,14 +769,14 @@ pub(crate) fn build_parse_table( &item_set_builder, ); - let table = ParseTableBuilder { + let (table, item_sets) = ParseTableBuilder { syntax_grammar, lexical_grammar, item_set_builder, variable_info, state_ids_by_item_set: HashMap::new(), core_ids_by_core: HashMap::new(), - item_sets_by_state_id: Vec::new(), + parse_state_info_by_id: Vec::new(), parse_state_queue: VecDeque::new(), parse_table: ParseTable { states: Vec::new(), @@ -785,5 +788,5 @@ pub(crate) fn build_parse_table( } .build()?; - Ok((table, following_tokens)) + Ok((table, following_tokens, item_sets)) } diff --git a/cli/src/generate/build_tables/item.rs b/cli/src/generate/build_tables/item.rs index df712402..f0e5d381 100644 --- a/cli/src/generate/build_tables/item.rs +++ b/cli/src/generate/build_tables/item.rs @@ -1,5 +1,8 @@ -use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}; -use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet}; +use crate::generate::grammars::{ + LexicalGrammar, Production, ProductionStep, SyntaxGrammar, +}; +use crate::generate::rules::Associativity; +use crate::generate::rules::{Symbol, SymbolType, TokenSet}; use lazy_static::lazy_static; use std::cmp::Ordering; use std::fmt; @@ -161,12 +164,14 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { for (i, step) in self.0.production.steps.iter().enumerate() { if i == self.0.step_index as usize { write!(f, " •")?; - if step.precedence != 0 || step.associativity.is_some() { - write!( - f, - " (prec {:?} assoc {:?})", - step.precedence, step.associativity - )?; + if let Some(associativity) = 
step.associativity { + if step.precedence != 0 { + write!(f, " ({} {:?})", step.precedence, associativity)?; + } else { + write!(f, " ({:?})", associativity)?; + } + } else if step.precedence != 0 { + write!(f, " ({})", step.precedence)?; } } @@ -184,19 +189,21 @@ impl<'a> fmt::Display for ParseItemDisplay<'a> { } if let Some(alias) = &step.alias { - write!(f, " (alias {})", alias.value)?; + write!(f, "@{}", alias.value)?; } } if self.0.is_done() { write!(f, " •")?; if let Some(step) = self.0.production.steps.last() { - if step.precedence != 0 || step.associativity.is_some() { - write!( - f, - " (prec {:?} assoc {:?})", - step.precedence, step.associativity - )?; + if let Some(associativity) = step.associativity { + if step.precedence != 0 { + write!(f, " ({} {:?})", step.precedence, associativity)?; + } else { + write!(f, " ({:?})", associativity)?; + } + } else if step.precedence != 0 { + write!(f, " ({})", step.precedence)?; } } } diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index af9483eb..e0f84244 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -7,7 +7,7 @@ mod minimize_parse_table; mod token_conflicts; use self::build_lex_table::build_lex_table; -use self::build_parse_table::build_parse_table; +use self::build_parse_table::{build_parse_table, ParseStateInfo}; use self::coincident_tokens::CoincidentTokenIndex; use self::minimize_parse_table::minimize_parse_table; use self::token_conflicts::TokenConflictMap; @@ -18,7 +18,7 @@ use crate::generate::node_types::VariableInfo; use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet}; use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; use log::info; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, @@ -26,8 +26,9 @@ pub(crate) fn build_tables( simple_aliases: &AliasMap, variable_info: &Vec, inlines: &InlinedProductionMap, + report_symbol_name: Option<&str>, ) -> Result<(ParseTable, LexTable, LexTable, Option)> { - let (mut parse_table, following_tokens) = + let (mut parse_table, following_tokens, parse_state_info) = build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); @@ -65,6 +66,16 @@ pub(crate) fn build_tables( ); populate_external_lex_states(&mut parse_table, syntax_grammar); mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map); + + if let Some(report_symbol_name) = report_symbol_name { + report_state_info( + &syntax_grammar, + &lexical_grammar, + &parse_table, + &parse_state_info, + report_symbol_name, + ); + } Ok(( parse_table, main_lex_table, @@ -372,6 +383,90 @@ fn mark_fragile_tokens( } } +fn report_state_info<'a>( + syntax_grammar: &SyntaxGrammar, + lexical_grammar: &LexicalGrammar, + parse_table: &ParseTable, + parse_state_info: &Vec>, + report_symbol_name: &'a str, +) { + let mut all_state_indices = BTreeSet::new(); + let mut symbols_with_state_indices = (0..syntax_grammar.variables.len()) + .map(|i| (Symbol::non_terminal(i), BTreeSet::new())) + .collect::>(); + + for (i, state) in parse_table.states.iter().enumerate() { + all_state_indices.insert(i); + let item_set = &parse_state_info[state.id]; + for (item, _) in item_set.1.entries.iter() { + if !item.is_augmented() { + 
symbols_with_state_indices[item.variable_index as usize] + .1 + .insert(i); + } + } + } + + symbols_with_state_indices.sort_unstable_by_key(|(_, states)| -(states.len() as i32)); + + let max_symbol_name_length = syntax_grammar + .variables + .iter() + .map(|v| v.name.len()) + .max() + .unwrap(); + for (symbol, states) in &symbols_with_state_indices { + eprintln!( + "{:width$}\t{}", + syntax_grammar.variables[symbol.index].name, + states.len(), + width = max_symbol_name_length + ); + } + eprintln!(""); + + let state_indices = if report_symbol_name == "*" { + Some(&all_state_indices) + } else { + symbols_with_state_indices + .iter() + .find_map(|(symbol, state_indices)| { + if syntax_grammar.variables[symbol.index].name == report_symbol_name { + Some(state_indices) + } else { + None + } + }) + }; + + if let Some(state_indices) = state_indices { + let mut state_indices = state_indices.into_iter().cloned().collect::>(); + state_indices.sort_unstable_by_key(|i| (parse_table.states[*i].core_id, *i)); + + for state_index in state_indices { + let id = parse_table.states[state_index].id; + let (preceding_symbols, item_set) = &parse_state_info[id]; + eprintln!("state index: {}", state_index); + eprintln!("state id: {}", id); + eprint!("symbol sequence:"); + for symbol in preceding_symbols { + let name = if symbol.is_terminal() { + &lexical_grammar.variables[symbol.index].name + } else if symbol.is_external() { + &syntax_grammar.external_tokens[symbol.index].name + } else { + &syntax_grammar.variables[symbol.index].name + }; + eprint!(" {}", name); + } + eprintln!( + "\nitems:\n{}", + self::item::ParseItemSetDisplay(&item_set, syntax_grammar, lexical_grammar,), + ); + } + } +} + fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool { cursor.transition_chars().all(|(chars, is_sep)| { if is_sep { diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 64de772c..2afab507 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -42,6 +42,7 @@ pub fn generate_parser_in_directory( repo_path: &PathBuf, grammar_path: Option<&str>, properties_only: bool, + report_symbol_name: Option<&str>, ) -> Result<()> { let src_path = repo_path.join("src"); let header_path = src_path.join("tree_sitter"); @@ -102,6 +103,7 @@ pub fn generate_parser_in_directory( lexical_grammar, inlines, simple_aliases, + report_symbol_name, )?; write_file(&src_path.join("parser.c"), c_code)?; @@ -132,6 +134,7 @@ pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String lexical_grammar, inlines, simple_aliases, + None, )?; Ok((input_grammar.name, parser.c_code)) } @@ -142,6 +145,7 @@ fn generate_parser_for_grammar_with_opts( lexical_grammar: LexicalGrammar, inlines: InlinedProductionMap, simple_aliases: AliasMap, + report_symbol_name: Option<&str>, ) -> Result { let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?; let node_types_json = node_types::generate_node_types_json( @@ -156,6 +160,7 @@ fn generate_parser_for_grammar_with_opts( &simple_aliases, &variable_info, &inlines, + report_symbol_name, )?; let c_code = render_c_code( name, diff --git a/cli/src/main.rs b/cli/src/main.rs index f4565f34..80be798a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -39,6 +39,12 @@ fn run() -> error::Result<()> { .arg(Arg::with_name("grammar-path").index(1)) .arg(Arg::with_name("log").long("log")) .arg(Arg::with_name("properties-only").long("properties")) + .arg( + Arg::with_name("report-states-for-rule") + .long("report-states-for-rule") + 
.value_name("rule-name") + .takes_value(true), + ) .arg(Arg::with_name("no-minimize").long("no-minimize")), ) .subcommand( @@ -121,10 +127,22 @@ fn run() -> error::Result<()> { } else if let Some(matches) = matches.subcommand_matches("generate") { let grammar_path = matches.value_of("grammar-path"); let properties_only = matches.is_present("properties-only"); + let report_symbol_name = matches.value_of("report-states-for-rule").or_else(|| { + if matches.is_present("report-states") { + Some("") + } else { + None + } + }); if matches.is_present("log") { logger::init(); } - generate::generate_parser_in_directory(¤t_dir, grammar_path, properties_only)?; + generate::generate_parser_in_directory( + ¤t_dir, + grammar_path, + properties_only, + report_symbol_name, + )?; } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); From 803760758314ae97138bfb201dd04c73c624f39c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 28 Aug 2019 17:14:04 -0700 Subject: [PATCH 6/6] Only generate the new parse table format if --next-abi flag is used --- cli/src/generate/mod.rs | 32 +++++++++++++- cli/src/generate/render.rs | 90 ++++++++++++++++++++++++++++++-------- cli/src/main.rs | 3 ++ 3 files changed, 105 insertions(+), 20 deletions(-) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 2afab507..5446e4af 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -33,6 +33,16 @@ lazy_static! { .unwrap(); } +const NEW_HEADER_PARTS: [&'static str; 2] = [ + " + uint32_t large_state_count; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map;", + " +#define SMALL_STATE(id) id - LARGE_STATE_COUNT +", +]; + struct GeneratedParser { c_code: String, node_types_json: String, @@ -42,6 +52,7 @@ pub fn generate_parser_in_directory( repo_path: &PathBuf, grammar_path: Option<&str>, properties_only: bool, + next_abi: bool, report_symbol_name: Option<&str>, ) -> Result<()> { let src_path = repo_path.join("src"); @@ -103,12 +114,28 @@ pub fn generate_parser_in_directory( lexical_grammar, inlines, simple_aliases, + next_abi, report_symbol_name, )?; write_file(&src_path.join("parser.c"), c_code)?; write_file(&src_path.join("node-types.json"), node_types_json)?; - write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?; + + if next_abi { + write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?; + } else { + let mut header = tree_sitter::PARSER_HEADER.to_string(); + + for part in &NEW_HEADER_PARTS { + let pos = header + .find(part) + .expect("Missing expected part of parser.h header"); + header.replace_range(pos..(pos + part.len()), ""); + } + + write_file(&header_path.join("parser.h"), header)?; + } + ensure_file(&repo_path.join("index.js"), || { npm_files::index_js(&language_name) })?; @@ -134,6 +161,7 @@ pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String lexical_grammar, inlines, simple_aliases, + true, None, )?; Ok((input_grammar.name, parser.c_code)) @@ -145,6 +173,7 @@ fn generate_parser_for_grammar_with_opts( lexical_grammar: LexicalGrammar, inlines: InlinedProductionMap, simple_aliases: AliasMap, + next_abi: bool, report_symbol_name: Option<&str>, ) -> Result { let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &inlines)?; @@ -171,6 +200,7 @@ fn generate_parser_for_grammar_with_opts( syntax_grammar, lexical_grammar, simple_aliases, + next_abi, ); Ok(GeneratedParser { 
c_code, diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index e806564c..c5f1545b 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -9,7 +9,10 @@ use std::cmp; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; -use tree_sitter::LANGUAGE_VERSION; + +// Currently, the library supports a new ABI version that has not yet been +// stabilized, and the parser generation does not use it by default. +const STABLE_LANGUAGE_VERSION: usize = tree_sitter::LANGUAGE_VERSION - 1; macro_rules! add { ($this: tt, $($arg: tt)*) => {{ @@ -65,6 +68,7 @@ struct Generator { alias_ids: HashMap, alias_map: BTreeMap>, field_names: Vec, + next_abi: bool, } impl Generator { @@ -149,23 +153,30 @@ impl Generator { } } - let threshold = cmp::min( - SMALL_STATE_THRESHOLD, - self.parse_table.symbols.len() / 2 - 1, - ); - self.large_state_count = self - .parse_table - .states - .iter() - .enumerate() - .take_while(|(i, s)| { - *i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold - }) - .count(); - field_names.sort_unstable(); field_names.dedup(); self.field_names = field_names.into_iter().cloned().collect(); + + // If we are opting in to the new unstable language ABI, then use the concept of + // "small parse states". Otherwise, use the same representation for all parse + // states. + if self.next_abi { + let threshold = cmp::min( + SMALL_STATE_THRESHOLD, + self.parse_table.symbols.len() / 2 - 1, + ); + self.large_state_count = self + .parse_table + .states + .iter() + .enumerate() + .take_while(|(i, s)| { + *i <= 1 || s.terminal_entries.len() + s.nonterminal_entries.len() > threshold + }) + .count(); + } else { + self.large_state_count = self.parse_table.states.len(); + } } fn add_includes(&mut self) { @@ -216,13 +227,26 @@ impl Generator { }) .count(); - add_line!(self, "#define LANGUAGE_VERSION {}", LANGUAGE_VERSION); + if self.next_abi { + add_line!( + self, + "#define LANGUAGE_VERSION {}", + tree_sitter::LANGUAGE_VERSION + ); + } else { + add_line!(self, "#define LANGUAGE_VERSION {}", STABLE_LANGUAGE_VERSION); + } + add_line!( self, "#define STATE_COUNT {}", self.parse_table.states.len() ); - add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count); + + if self.next_abi { + add_line!(self, "#define LARGE_STATE_COUNT {}", self.large_state_count); + } + add_line!( self, "#define SYMBOL_COUNT {}", @@ -755,7 +779,12 @@ impl Generator { add_line!( self, - "static uint16_t ts_parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = {{" + "static uint16_t ts_parse_table[{}][SYMBOL_COUNT] = {{", + if self.next_abi { + "LARGE_STATE_COUNT" + } else { + "STATE_COUNT" + } ); indent!(self); @@ -959,7 +988,11 @@ impl Generator { add_line!(self, ".symbol_count = SYMBOL_COUNT,"); add_line!(self, ".alias_count = ALIAS_COUNT,"); add_line!(self, ".token_count = TOKEN_COUNT,"); - add_line!(self, ".large_state_count = LARGE_STATE_COUNT,"); + + if self.next_abi { + add_line!(self, ".large_state_count = LARGE_STATE_COUNT,"); + } + add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); add_line!( self, @@ -1217,6 +1250,23 @@ impl Generator { } } +/// Returns a String of C code for the given components of a parser. 
+/// +/// # Arguments +/// +/// * `name` - A string slice containing the name of the language +/// * `parse_table` - The generated parse table for the language +/// * `main_lex_table` - The generated lexing table for the language +/// * `keyword_lex_table` - The generated keyword lexing table for the language +/// * `keyword_capture_token` - A symbol indicating which token is used +/// for keyword capture, if any. +/// * `syntax_grammar` - The syntax grammar extracted from the language's grammar +/// * `lexical_grammar` - The lexical grammar extracted from the language's grammar +/// * `simple_aliases` - A map describing the global rename rules that should apply. +/// the keys are symbols that are *always* aliased in the same way, and the values +/// are the aliases that are applied to those symbols. +/// * `next_abi` - A boolean indicating whether to opt into the new, unstable parse +/// table format. This is mainly used for testing, when developing Tree-sitter itself. pub(crate) fn render_c_code( name: &str, parse_table: ParseTable, @@ -1226,6 +1276,7 @@ pub(crate) fn render_c_code( syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, + next_abi: bool, ) -> String { Generator { buffer: String::new(), @@ -1244,6 +1295,7 @@ pub(crate) fn render_c_code( alias_ids: HashMap::new(), alias_map: BTreeMap::new(), field_names: Vec::new(), + next_abi, } .generate() } diff --git a/cli/src/main.rs b/cli/src/main.rs index 80be798a..59d04a97 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -38,6 +38,7 @@ fn run() -> error::Result<()> { .about("Generate a parser") .arg(Arg::with_name("grammar-path").index(1)) .arg(Arg::with_name("log").long("log")) + .arg(Arg::with_name("next-abi").long("next-abi")) .arg(Arg::with_name("properties-only").long("properties")) .arg( Arg::with_name("report-states-for-rule") @@ -137,10 +138,12 @@ fn run() -> error::Result<()> { if matches.is_present("log") { logger::init(); } + let next_abi = matches.is_present("next-abi"); generate::generate_parser_in_directory( ¤t_dir, grammar_path, properties_only, + next_abi, report_symbol_name, )?; } else if let Some(matches) = matches.subcommand_matches("test") {
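
As an illustration of the table layout that patches 3 and 4 introduce: a "large" state keeps a full row in ts_parse_table, indexed directly by symbol, while every remaining state is packed into the flat ts_small_parse_table as a symbol count followed by (symbol, value) pairs, with ts_small_parse_table_map recording where each small state's record begins. The program below is only a sketch: the table contents are invented toy data, and lookup() is a simplified stand-in for the ts_language_lookup routine added to lib/src/language.h (it drops the TSLanguage indirection and the ABI version check), but it shows the encoding and the linear scan the runtime performs for small states.

#include <stdint.h>
#include <stdio.h>

#define LARGE_STATE_COUNT 2   /* toy value: states 0 and 1 use the full 2D table */
#define SYMBOL_COUNT 4        /* toy grammar with symbols 0..3                   */

/* One full row per "large" state, indexed by [state][symbol]. */
static const uint16_t parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = {
  [0] = {0, 5, 0, 0},
  [1] = {0, 6, 7, 0},
};

/* "Small" states are packed as: entry count, then (symbol, value) pairs,
 * terminal entries (action-list indices) before non-terminal entries (next states). */
static const uint16_t small_parse_table[] = {
  [0] = 2,      /* state 2 stores two symbols           */
    1, 8,       /*   symbol 1 -> parse-action entry 8   */
    3, 4,       /*   symbol 3 -> next state 4           */
  [5] = 1,      /* state 3 stores one symbol            */
    2, 9,       /*   symbol 2 -> parse-action entry 9   */
};

/* Offset of each small state's record within small_parse_table. */
static const uint32_t small_parse_table_map[] = {0, 5};

/* Simplified version of ts_language_lookup (lib/src/language.h, patch 3). */
static uint16_t lookup(uint16_t state, uint16_t symbol) {
  if (state < LARGE_STATE_COUNT) {
    return parse_table[state][symbol];
  }
  const uint16_t *data = &small_parse_table[small_parse_table_map[state - LARGE_STATE_COUNT]];
  uint16_t count = *data++;                       /* first slot is the entry count */
  for (uint16_t i = 0; i < count; i++, data += 2) {
    if (data[0] == symbol) return data[1];        /* found the symbol              */
    if (data[0] > symbol) break;                  /* entries are sorted by symbol  */
  }
  return 0;                                       /* 0 means "no entry"            */
}

int main(void) {
  printf("%u\n", (unsigned)lookup(1, 2));  /* large state: prints 7       */
  printf("%u\n", (unsigned)lookup(2, 3));  /* small state: prints 4       */
  printf("%u\n", (unsigned)lookup(3, 1));  /* missing entry: prints 0     */
  return 0;
}

Because STATE(id) and ACTIONS(id) are identity macros in parser.h, next-state ids and parse-action list indices can be stored directly in either table; whether a stored value names a state or an action list follows from whether the symbol being looked up is a non-terminal or a terminal.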