diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs
index 361e437c..f7bff0d9 100644
--- a/cli/src/generate/build_tables/build_lex_table.rs
+++ b/cli/src/generate/build_tables/build_lex_table.rs
@@ -9,11 +9,19 @@ use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConfl
 use crate::generate::{
     dedup::split_state_id_groups,
     grammars::{LexicalGrammar, SyntaxGrammar},
-    nfa::NfaCursor,
+    nfa::{CharacterSet, NfaCursor},
     rules::{Symbol, TokenSet},
     tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
 };
 
+pub const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
+
+pub struct LexTables {
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_lex_table(
     parse_table: &mut ParseTable,
     syntax_grammar: &SyntaxGrammar,
@@ -21,7 +29,7 @@ pub fn build_lex_table(
     keywords: &TokenSet,
     coincident_token_index: &CoincidentTokenIndex,
     token_conflict_map: &TokenConflictMap,
-) -> (LexTable, LexTable) {
+) -> LexTables {
     let keyword_lex_table = if syntax_grammar.word_token.is_some() {
         let mut builder = LexTableBuilder::new(lexical_grammar);
         builder.add_state_for_tokens(keywords);
@@ -78,10 +86,45 @@ pub fn build_lex_table(
         }
     }
 
-    let mut table = builder.table;
-    minimize_lex_table(&mut table, parse_table);
-    sort_states(&mut table, parse_table);
-    (table, keyword_lex_table)
+    let mut main_lex_table = mem::take(&mut builder.table);
+    minimize_lex_table(&mut main_lex_table, parse_table);
+    sort_states(&mut main_lex_table, parse_table);
+
+    let mut large_character_sets = Vec::new();
+    for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() {
+        let symbol = Symbol::terminal(variable_ix);
+        builder.reset();
+        builder.add_state_for_tokens(&TokenSet::from_iter([symbol]));
+        for state in &builder.table.states {
+            let mut characters = CharacterSet::empty();
+            for (chars, action) in &state.advance_actions {
+                if action.in_main_token {
+                    characters = characters.add(chars);
+                    continue;
+                }
+
+                if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT
+                    && !large_character_sets.iter().any(|(_, set)| set == chars)
+                {
+                    large_character_sets.push((None, chars.clone()));
+                }
+            }
+
+            if characters.range_count() > LARGE_CHARACTER_RANGE_COUNT
+                && !large_character_sets
+                    .iter()
+                    .any(|(_, set)| *set == characters)
+            {
+                large_character_sets.push((Some(symbol), characters));
+            }
+        }
+    }
+
+    LexTables {
+        main_lex_table,
+        keyword_lex_table,
+        large_character_sets,
+    }
 }
 
 struct QueueEntry {
@@ -109,6 +152,12 @@ impl<'a> LexTableBuilder<'a> {
         }
     }
 
+    fn reset(&mut self) {
+        self.table = LexTable::default();
+        self.state_queue.clear();
+        self.state_ids_by_nfa_state_set.clear();
+    }
+
     fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize {
         let mut eof_valid = false;
         let nfa_states = tokens
diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs
index 7946c144..49fc5061 100644
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@@ -1,5 +1,5 @@
-pub mod build_lex_table;
-pub mod build_parse_table;
+mod build_lex_table;
+mod build_parse_table;
 mod coincident_tokens;
 mod item;
 mod item_set_builder;
@@ -20,12 +20,22 @@ use self::{
 };
 use crate::generate::{
     grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
-    nfa::NfaCursor,
+    nfa::{CharacterSet, NfaCursor},
     node_types::VariableInfo,
     rules::{AliasMap, Symbol, SymbolType, TokenSet},
     tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
 };
 
+pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT;
+
+pub struct Tables {
+    pub parse_table: ParseTable,
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub word_token: Option<Symbol>,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_tables(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
@@ -33,7 +43,7 @@ pub fn build_tables(
     variable_info: &[VariableInfo],
     inlines: &InlinedProductionMap,
     report_symbol_name: Option<&str>,
-) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
+) -> Result<Tables> {
     let (mut parse_table, following_tokens, parse_state_info) =
         build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
     let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
@@ -62,7 +72,7 @@ pub fn build_tables(
         &token_conflict_map,
         &keywords,
     );
-    let (main_lex_table, keyword_lex_table) = build_lex_table(
+    let lex_tables = build_lex_table(
        &mut parse_table,
         syntax_grammar,
         lexical_grammar,
@@ -82,12 +92,14 @@ pub fn build_tables(
             report_symbol_name,
         );
     }
-    Ok((
+
+    Ok(Tables {
         parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        syntax_grammar.word_token,
-    ))
+        main_lex_table: lex_tables.main_lex_table,
+        keyword_lex_table: lex_tables.keyword_lex_table,
+        large_character_sets: lex_tables.large_character_sets,
+        word_token: syntax_grammar.word_token,
+    })
 }
 
 fn populate_error_state(
diff --git a/cli/src/generate/char_tree.rs b/cli/src/generate/char_tree.rs
deleted file mode 100644
index 2e28d56f..00000000
--- a/cli/src/generate/char_tree.rs
+++ /dev/null
@@ -1,133 +0,0 @@
-use std::ops::Range;
-
-/// A set of characters represented as a balanced binary tree of comparisons.
-/// This is used as an intermediate step in generating efficient code for
-/// matching a given character set.
-#[derive(PartialEq, Eq)]
-pub enum CharacterTree {
-    Yes,
-    Compare {
-        value: char,
-        operator: Comparator,
-        consequence: Option<Box<CharacterTree>>,
-        alternative: Option<Box<CharacterTree>>,
-    },
-}
-
-#[derive(PartialEq, Eq)]
-pub enum Comparator {
-    Less,
-    LessOrEqual,
-    Equal,
-    GreaterOrEqual,
-}
-
-impl CharacterTree {
-    pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
-        match ranges.len() {
-            0 => None,
-            1 => {
-                let range = &ranges[0];
-                if range.start == range.end {
-                    Some(Self::Compare {
-                        operator: Comparator::Equal,
-                        value: range.start,
-                        consequence: Some(Box::new(Self::Yes)),
-                        alternative: None,
-                    })
-                } else {
-                    Some(Self::Compare {
-                        operator: Comparator::GreaterOrEqual,
-                        value: range.start,
-                        consequence: Some(Box::new(Self::Compare {
-                            operator: Comparator::LessOrEqual,
-                            value: range.end,
-                            consequence: Some(Box::new(Self::Yes)),
-                            alternative: None,
-                        })),
-                        alternative: None,
-                    })
-                }
-            }
-            len => {
-                let mid = len / 2;
-                let mid_range = &ranges[mid];
-                Some(Self::Compare {
-                    operator: Comparator::Less,
-                    value: mid_range.start,
-                    consequence: Self::from_ranges(&ranges[0..mid]).map(Box::new),
-                    alternative: Some(Box::new(Self::Compare {
-                        operator: Comparator::LessOrEqual,
-                        value: mid_range.end,
-                        consequence: Some(Box::new(Self::Yes)),
-                        alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new),
-                    })),
-                })
-            }
-        }
-    }
-
-    #[cfg(test)]
-    fn contains(&self, c: char) -> bool {
-        match self {
-            Self::Yes => true,
-            Self::Compare {
-                value,
-                operator,
-                alternative,
-                consequence,
-            } => {
-                let condition = match operator {
-                    Comparator::Less => c < *value,
-                    Comparator::LessOrEqual => c <= *value,
-                    Comparator::Equal => c == *value,
-                    Comparator::GreaterOrEqual => c >= *value,
-                };
-                if condition { consequence } else { alternative }
-                    .as_ref()
-                    .map_or(false, |a| a.contains(c))
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_character_tree_simple() {
-        let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
-            .unwrap();
-
-        assert!(tree.contains('a'));
-        assert!(tree.contains('b'));
-        assert!(tree.contains('c'));
-        assert!(tree.contains('d'));
-
-        assert!(!tree.contains('e'));
-        assert!(!tree.contains('f'));
-        assert!(!tree.contains('g'));
-
-        assert!(tree.contains('h'));
-        assert!(tree.contains('i'));
-        assert!(tree.contains('j'));
-        assert!(tree.contains('k'));
-        assert!(tree.contains('l'));
-
-        assert!(!tree.contains('m'));
-        assert!(!tree.contains('n'));
-        assert!(!tree.contains('o'));
-
-        assert!(tree.contains('p'));
-        assert!(tree.contains('q'));
-        assert!(tree.contains('r'));
-
-        assert!(!tree.contains('s'));
-        assert!(!tree.contains('s'));
-
-        assert!(tree.contains('u'));
-
-        assert!(!tree.contains('v'));
-    }
-}
diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs
index 2f07361e..e9f2bfee 100644
--- a/cli/src/generate/mod.rs
+++ b/cli/src/generate/mod.rs
@@ -8,17 +8,15 @@ use std::{
 use anyhow::{anyhow, Context, Result};
 use build_tables::build_tables;
 use grammar_files::path_in_ignore;
-use grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
+use grammars::InputGrammar;
 use lazy_static::lazy_static;
 use parse_grammar::parse_grammar;
 use prepare_grammar::prepare_grammar;
 use regex::{Regex, RegexBuilder};
 use render::render_c_code;
-use rules::AliasMap;
 use semver::Version;
 
 mod build_tables;
-mod char_tree;
 mod dedup;
 mod grammar_files;
 mod grammars;
@@ -105,23 +103,12 @@ pub fn generate_parser_in_directory(
 
     // Parse and preprocess the grammar.
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let language_name = input_grammar.name;
 
     // Generate the parser and related files.
     let GeneratedParser {
         c_code,
         node_types_json,
-    } = generate_parser_for_grammar_with_opts(
-        &language_name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        abi_version,
-        report_symbol_name,
-    )?;
+    } = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;
 
     write_file(&src_path.join("parser.c"), c_code)?;
     write_file(&src_path.join("node-types.json"), node_types_json)?;
@@ -130,7 +117,7 @@ pub fn generate_parser_in_directory(
     write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
 
     if !path_in_ignore(&repo_path) {
-        grammar_files::generate_grammar_files(&repo_path, &language_name, generate_bindings)?;
+        grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
     }
 
     Ok(())
@@ -139,29 +126,18 @@ pub fn generate_parser_in_directory(
 pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
     let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let parser = generate_parser_for_grammar_with_opts(
-        &input_grammar.name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        tree_sitter::LANGUAGE_VERSION,
-        None,
-    )?;
-    Ok((input_grammar.name, parser.c_code))
+    let parser =
+        generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
+    Ok((input_grammar.name.clone(), parser.c_code))
 }
 
 fn generate_parser_for_grammar_with_opts(
-    name: &str,
-    syntax_grammar: SyntaxGrammar,
-    lexical_grammar: LexicalGrammar,
-    inlines: &InlinedProductionMap,
-    simple_aliases: AliasMap,
+    input_grammar: &InputGrammar,
     abi_version: usize,
     report_symbol_name: Option<&str>,
 ) -> Result<GeneratedParser> {
+    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
+        prepare_grammar(input_grammar)?;
     let variable_info =
         node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
     let node_types_json = node_types::generate_node_types_json(
@@ -170,20 +146,17 @@ fn generate_parser_for_grammar_with_opts(
         &simple_aliases,
         &variable_info,
     );
-    let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
+    let tables = build_tables(
         &syntax_grammar,
         &lexical_grammar,
         &simple_aliases,
         &variable_info,
-        inlines,
+        &inlines,
         report_symbol_name,
     )?;
     let c_code = render_c_code(
-        name,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        &input_grammar.name,
+        tables,
         syntax_grammar,
         lexical_grammar,
         simple_aliases,
diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs
index e793199f..b247ce4f 100644
--- a/cli/src/generate/nfa.rs
+++ b/cli/src/generate/nfa.rs
@@ -1,14 +1,13 @@
 use std::{
     char,
     cmp::{max, Ordering},
-    collections::HashSet,
     fmt,
-    mem::swap,
-    ops::Range,
+    mem::{self, swap},
+    ops::{Range, RangeInclusive},
 };
 
 /// A set of characters represented as a vector of ranges.
-#[derive(Clone, PartialEq, Eq, Hash)]
+#[derive(Clone, Default, PartialEq, Eq, Hash)]
 pub struct CharacterSet {
     ranges: Vec<Range<u32>>,
 }
@@ -115,6 +114,11 @@ impl CharacterSet {
         self
     }
 
+    pub fn assign(&mut self, other: &Self) {
+        self.ranges.clear();
+        self.ranges.extend_from_slice(&other.ranges);
+    }
+
     fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
         while i < self.ranges.len() {
             let range = &mut self.ranges[i];
@@ -286,12 +290,24 @@ impl CharacterSet {
         self.add(&other)
     }
 
-    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
-        self.ranges.iter().flat_map(std::clone::Clone::clone)
+    pub fn char_codes(&self) -> impl Iterator<Item = u32> + '_ {
+        self.ranges.iter().flat_map(Clone::clone)
     }
 
     pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
-        self.iter().filter_map(char::from_u32)
+        self.char_codes().filter_map(char::from_u32)
+    }
+
+    pub fn range_count(&self) -> usize {
+        self.ranges.len()
+    }
+
+    pub fn ranges(&self) -> impl Iterator<Item = RangeInclusive<char>> + '_ {
+        self.ranges.iter().filter_map(|range| {
+            let start = range.clone().find_map(char::from_u32)?;
+            let end = (range.start..range.end).rev().find_map(char::from_u32)?;
+            Some(start..=end)
+        })
     }
 
     pub fn is_empty(&self) -> bool {
@@ -300,41 +316,57 @@ impl CharacterSet {
 
     /// Get a reduced list of character ranges, assuming that a given
     /// set of characters can be safely ignored.
-    pub fn simplify_ignoring<'a>(
-        &'a self,
-        ruled_out_characters: &'a HashSet<u32>,
-    ) -> Vec<Range<char>> {
-        let mut prev_range: Option<Range<char>> = None;
-        self.chars()
-            .map(|c| (c, false))
-            .chain(Some(('\0', true)))
-            .filter_map(move |(c, done)| {
-                if done {
-                    return prev_range.clone();
-                }
-                if ruled_out_characters.contains(&(c as u32)) {
-                    return None;
-                }
-                if let Some(range) = prev_range.clone() {
-                    let mut prev_range_successor = range.end as u32 + 1;
-                    while prev_range_successor < c as u32 {
-                        if !ruled_out_characters.contains(&prev_range_successor) {
-                            prev_range = Some(c..c);
-                            return Some(range);
+    pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self {
+        let mut prev_range: Option<Range<u32>> = None;
+        Self {
+            ranges: self
+                .ranges
+                .iter()
+                .map(|range| Some(range.clone()))
+                .chain([None])
+                .filter_map(move |range| {
+                    if let Some(range) = &range {
+                        if ruled_out_characters.contains_codepoint_range(range.clone()) {
+                            return None;
+                        }
+
+                        if let Some(prev_range) = &mut prev_range {
+                            if ruled_out_characters
+                                .contains_codepoint_range(prev_range.end..range.start)
+                            {
+                                prev_range.end = range.end;
+                                return None;
+                            }
+                        }
                     }
-                        prev_range_successor += 1;
-                    }
-                    prev_range = Some(range.start..c);
-                } else {
-                    prev_range = Some(c..c);
-                }
-                None
-            })
-            .collect()
+
+                    let result = prev_range.clone();
+                    prev_range = range;
+                    result
+                })
+                .collect(),
+        }
+    }
+
+    pub fn contains_codepoint_range(&self, seek_range: Range<u32>) -> bool {
+        let ix = match self.ranges.binary_search_by(|probe| {
+            if probe.end <= seek_range.start {
+                Ordering::Less
+            } else if probe.start > seek_range.start {
+                Ordering::Greater
+            } else {
+                Ordering::Equal
+            }
+        }) {
+            Ok(ix) | Err(ix) => ix,
+        };
+        self.ranges.get(ix).map_or(false, |range| {
+            range.start <= seek_range.start && range.end >= seek_range.end
+        })
     }
 
     pub fn contains(&self, c: char) -> bool {
-        self.ranges.iter().any(|r| r.contains(&(c as u32)))
+        self.contains_codepoint_range(c as u32..c as u32 + 1)
     }
 }
@@ -387,11 +419,11 @@ impl fmt::Debug for CharacterSet {
             write!(f, "^ ")?;
             set = set.negate();
         }
-        for (i, c) in set.chars().enumerate() {
+        for (i, range) in set.ranges().enumerate() {
             if i > 0 {
                 write!(f, ", ")?;
             }
-            write!(f, "{c:?}")?;
+            write!(f, "{range:?}")?;
         }
         write!(f, "]")?;
         Ok(())
     }
@@ -503,17 +535,17 @@ impl<'a> NfaCursor<'a> {
         result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));
 
         let mut i = 0;
-        'i_loop: while i < result.len() {
+        while i < result.len() {
             for j in 0..i {
                 if result[j].states == result[i].states
                     && result[j].is_separator == result[i].is_separator
                     && result[j].precedence == result[i].precedence
                 {
-                    let mut characters = CharacterSet::empty();
-                    swap(&mut characters, &mut result[j].characters);
+                    let characters = mem::take(&mut result[j].characters);
                     result[j].characters = characters.add(&result[i].characters);
                     result.remove(i);
-                    continue 'i_loop;
+                    i -= 1;
+                    break;
                 }
             }
             i += 1;
@@ -1034,7 +1066,7 @@ mod tests {
 
     #[test]
     #[allow(clippy::single_range_in_vec_init)]
-    fn test_character_set_get_ranges() {
+    fn test_character_set_simplify_ignoring() {
         struct Row {
             chars: Vec<char>,
             ruled_out_chars: Vec<char>,
@@ -1057,6 +1089,21 @@
                 ruled_out_chars: vec!['d', 'f', 'g'],
                 expected_ranges: vec!['a'..'h', 'z'..'z'],
             },
+            Row {
+                chars: vec!['a', 'b', 'c', 'g', 'h', 'i'],
+                ruled_out_chars: vec!['d', 'j'],
+                expected_ranges: vec!['a'..'c', 'g'..'i'],
+            },
+            Row {
+                chars: vec!['c', 'd', 'e', 'g', 'h'],
+                ruled_out_chars: vec!['a', 'b', 'c', 'd', 'e', 'f'],
+                expected_ranges: vec!['g'..'h'],
+            },
+            Row {
+                chars: vec!['I', 'N'],
+                ruled_out_chars: vec!['A', 'I', 'N', 'Z'],
+                expected_ranges: vec![],
+            },
         ];
 
         for Row {
             chars,
             ruled_out_chars,
             expected_ranges,
         } in &table
         {
-            let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect();
+            let ruled_out_chars = ruled_out_chars
+                .iter()
+                .fold(CharacterSet::empty(), |set, c| set.add_char(*c));
             let mut set = CharacterSet::empty();
             for c in chars {
                 set = set.add_char(*c);
             }
-            let ranges = set.simplify_ignoring(&ruled_out_chars);
-            assert_eq!(ranges, *expected_ranges);
+            let actual = set.simplify_ignoring(&ruled_out_chars);
+            let expected = expected_ranges
+                .iter()
+                .fold(CharacterSet::empty(), |set, range| {
+                    set.add_range(range.start, range.end)
+                });
+            assert_eq!(
+                actual, expected,
+                "chars: {chars:?}, ruled out chars: {ruled_out_chars:?}"
+            );
         }
     }
 }
diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs
index ddba0fc1..2ef6c34a 100644
--- a/cli/src/generate/render.rs
+++ b/cli/src/generate/render.rs
@@ -1,4 +1,3 @@
-use core::ops::Range;
 use std::{
     cmp,
     collections::{HashMap, HashSet},
@@ -7,8 +6,9 @@ use std::{
 };
 
 use super::{
-    char_tree::{CharacterTree, Comparator},
+    build_tables::Tables,
     grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
+    nfa::CharacterSet,
     rules::{Alias, AliasMap, Symbol, SymbolType},
     tables::{
         AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
@@ -16,7 +16,6 @@ use super::{
     },
 };
 
-const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
 const SMALL_STATE_THRESHOLD: usize = 64;
 const ABI_VERSION_MIN: usize = 13;
 const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
@@ -64,6 +63,8 @@ struct Generator {
     parse_table: ParseTable,
     main_lex_table: LexTable,
     keyword_lex_table: LexTable,
+    large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+    large_character_set_constant_names: Vec<String>,
     large_state_count: usize,
     keyword_capture_token: Option<Symbol>,
     syntax_grammar: SyntaxGrammar,
@@ -80,18 +81,6 @@ struct Generator {
     abi_version: usize,
 }
 
-struct TransitionSummary {
-    is_included: bool,
-    ranges: Vec<Range<char>>,
-    call_id: Option<usize>,
-}
-
-struct LargeCharacterSetInfo {
-    ranges: Vec<Range<char>>,
-    symbol: Symbol,
-    index: usize,
-}
-
 impl Generator {
     fn generate(mut self) -> String {
         self.init();
@@ -119,14 +108,20 @@ impl Generator {
             self.add_primary_state_id_list();
         }
 
+        // Generate a helper function for each large character set.
+        // let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
+        for ix in 0..self.large_character_sets.len() {
+            self.add_character_set(ix);
+        }
+
         let mut main_lex_table = LexTable::default();
         swap(&mut main_lex_table, &mut self.main_lex_table);
-        self.add_lex_function("ts_lex", main_lex_table, true);
+        self.add_lex_function("ts_lex", main_lex_table);
 
         if self.keyword_capture_token.is_some() {
             let mut keyword_lex_table = LexTable::default();
             swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
-            self.add_lex_function("ts_lex_keywords", keyword_lex_table, false);
+            self.add_lex_function("ts_lex_keywords", keyword_lex_table);
         }
 
         self.add_lex_modes_list();
@@ -664,97 +659,7 @@ impl Generator {
         add_line!(self, "");
     }
 
-    fn add_lex_function(
-        &mut self,
-        name: &str,
-        lex_table: LexTable,
-        extract_helper_functions: bool,
-    ) {
-        let mut ruled_out_chars = HashSet::new();
-        let mut large_character_sets = Vec::<LargeCharacterSetInfo>::new();
-
-        // For each lex state, compute a summary of the code that needs to be
-        // generated.
-        let state_transition_summaries = lex_table
-            .states
-            .iter()
-            .map(|state| {
-                ruled_out_chars.clear();
-
-                // For each state transition, compute the set of character ranges
-                // that need to be checked.
-                state
-                    .advance_actions
-                    .iter()
-                    .map(|(chars, action)| {
-                        let is_included = !chars.contains(char::MAX);
-                        let mut ranges;
-                        if is_included {
-                            ranges = chars.simplify_ignoring(&ruled_out_chars);
-                            ruled_out_chars.extend(chars.iter());
-                        } else {
-                            ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars);
-                            ranges.insert(0, '\0'..'\0');
-                        }
-
-                        // Record any large character sets so that they can be extracted
-                        // into helper functions, reducing code duplication.
-                        let mut call_id = None;
-                        if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT {
-                            let char_set_symbol = self
-                                .symbol_for_advance_action(action, &lex_table)
-                                .expect("No symbol for lex state");
-                            let mut count_for_symbol = 0;
-                            for (i, info) in large_character_sets.iter_mut().enumerate() {
-                                if info.ranges == ranges {
-                                    call_id = Some(i);
-                                    break;
-                                }
-                                if info.symbol == char_set_symbol {
-                                    count_for_symbol += 1;
-                                }
-                            }
-                            if call_id.is_none() {
-                                call_id = Some(large_character_sets.len());
-                                large_character_sets.push(LargeCharacterSetInfo {
-                                    symbol: char_set_symbol,
-                                    index: count_for_symbol + 1,
-                                    ranges: ranges.clone(),
-                                });
-                            }
-                        }
-
-                        TransitionSummary {
-                            is_included,
-                            ranges,
-                            call_id,
-                        }
-                    })
-                    .collect()
-            })
-            .collect::<Vec<Vec<TransitionSummary>>>();
-
-        // Generate a helper function for each large character set.
-        let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
-        sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
-        for info in sorted_large_char_sets {
-            add_line!(
-                self,
-                "static inline bool {}_character_set_{}(int32_t c) {{",
-                self.symbol_ids[&info.symbol],
-                info.index
-            );
-            indent!(self);
-            add_whitespace!(self);
-            add!(self, "return ");
-            let tree = CharacterTree::from_ranges(&info.ranges);
-            self.add_character_tree(tree.as_ref());
-            add!(self, ";\n");
-            dedent!(self);
-            add_line!(self, "}}");
-            add_line!(self, "");
-        }
-
+    fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
         add_line!(
             self,
             "static bool {name}(TSLexer *lexer, TSStateId state) {{",
@@ -769,7 +674,7 @@
         for (i, state) in lex_table.states.into_iter().enumerate() {
             add_line!(self, "case {i}:");
             indent!(self);
-            self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets);
+            self.add_lex_state(i, state);
             dedent!(self);
         }
 
@@ -786,35 +691,7 @@
         add_line!(self, "");
     }
 
-    fn symbol_for_advance_action(
-        &self,
-        action: &AdvanceAction,
-        lex_table: &LexTable,
-    ) -> Option<Symbol> {
-        let mut state_ids = vec![action.state];
-        let mut i = 0;
-        while i < state_ids.len() {
-            let id = state_ids[i];
-            let state = &lex_table.states[id];
-            if let Some(accept) = state.accept_action {
-                return Some(accept);
-            }
-            for (_, action) in &state.advance_actions {
-                if !state_ids.contains(&action.state) {
-                    state_ids.push(action.state);
-                }
-            }
-            i += 1;
-        }
-        None
-    }
-
-    fn add_lex_state(
-        &mut self,
-        state: LexState,
-        transition_info: &[TransitionSummary],
-        large_character_sets: &[LargeCharacterSetInfo],
-    ) {
+    fn add_lex_state(&mut self, _state_ix: usize, state: LexState) {
         if let Some(accept_action) = state.accept_action {
             add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
         }
@@ -823,37 +700,167 @@
             add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
         }
 
-        for (i, (_, action)) in state.advance_actions.into_iter().enumerate() {
-            let transition = &transition_info[i];
+        let mut chars_copy = CharacterSet::empty();
+        let mut large_set = CharacterSet::empty();
+        let mut ruled_out_chars = CharacterSet::empty();
+
+        // The transitions in a lex state are sorted with the single-character
+        // transitions first. If there are many single-character transitions,
+        // then implement them using an array of (lookahead character, state)
+        // pairs, instead of individual if statements, in order to reduce compile
+        // time.
+        let mut leading_simple_transition_count = 0;
+        let mut leading_simple_transition_character_count = 0;
+        for (chars, action) in &state.advance_actions {
+            if action.in_main_token
+                && chars
+                    .ranges()
+                    .all(|r| r.start() == r.end() && *r.start() as u32 <= u16::MAX as u32)
+            {
+                leading_simple_transition_count += 1;
+                leading_simple_transition_character_count += chars.range_count();
+            } else {
+                break;
+            }
+        }
+
+        if leading_simple_transition_character_count >= 8 {
+            add_line!(self, "ADVANCE_MAP(");
+            indent!(self);
+            for (chars, action) in &state.advance_actions[0..leading_simple_transition_count] {
+                for range in chars.ranges() {
+                    add_whitespace!(self);
+                    self.add_character(*range.start());
+                    add!(self, ", {},\n", action.state);
+                }
+                ruled_out_chars = ruled_out_chars.add(chars);
+            }
+            dedent!(self);
+            add_line!(self, ");");
+        } else {
+            leading_simple_transition_count = 0;
+        }
+
+        for (chars, action) in &state.advance_actions[leading_simple_transition_count..]
+        {
             add_whitespace!(self);
 
-            // If there is a helper function for this transition's character
-            // set, then generate a call to that helper function.
-            if let Some(call_id) = transition.call_id {
-                let info = &large_character_sets[call_id];
-                add!(self, "if (");
-                if !transition.is_included {
-                    add!(self, "!");
+            // The lex state's advance actions are represented with disjoint
+            // sets of characters. When translating these disjoint sets into a
+            // sequence of checks, we don't need to re-check conditions that
+            // have already been checked due to previous transitions.
+            //
+            // Note that this simplification may result in an empty character set.
+            // That means that the transition is guaranteed (nothing further needs to
+            // be checked), not that this transition is impossible.
+            let simplified_chars = chars.simplify_ignoring(&ruled_out_chars);
+
+            // For large character sets, find the best matching character set from
+            // a pre-selected list of large character sets, which are based on the
+            // state transitions for individual tokens. This transition may not exactly
+            // match one of the pre-selected character sets. In that case, determine
+            // the additional checks that need to be performed to match this transition.
+            let mut best_large_char_set: Option<(usize, CharacterSet, CharacterSet)> = None;
+            if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT {
+                for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
+                    chars_copy.assign(&simplified_chars);
+                    large_set.assign(set);
+                    let intersection = chars_copy.remove_intersection(&mut large_set);
+                    if !intersection.is_empty() {
+                        let additions = chars_copy.simplify_ignoring(&ruled_out_chars);
+                        let removals = large_set.simplify_ignoring(&ruled_out_chars);
+                        let total_range_count = additions.range_count() + removals.range_count();
+                        if total_range_count >= simplified_chars.range_count() {
+                            continue;
+                        }
+                        if let Some((_, best_additions, best_removals)) = &best_large_char_set {
+                            let best_range_count =
+                                best_additions.range_count() + best_removals.range_count();
+                            if best_range_count < total_range_count {
+                                continue;
+                            }
+                        }
+                        best_large_char_set = Some((ix, additions, removals));
+                    }
                 }
-                add!(
-                    self,
-                    "{}_character_set_{}(lookahead)) ",
-                    self.symbol_ids[&info.symbol],
-                    info.index
-                );
-                self.add_advance_action(&action);
-                add!(self, "\n");
-                continue;
             }
 
-            // Otherwise, generate code to compare the lookahead character
-            // with all of the character ranges.
-            if !transition.ranges.is_empty() {
+            // Add this transition's character set to the set of ruled out characters,
+            // which don't need to be checked for subsequent transitions in this state.
+            ruled_out_chars = ruled_out_chars.add(chars);
+
+            let mut large_char_set_ix = None;
+            let mut asserted_chars = simplified_chars;
+            let mut negated_chars = CharacterSet::empty();
+            if let Some((char_set_ix, additions, removals)) = best_large_char_set {
+                asserted_chars = additions;
+                negated_chars = removals;
+                large_char_set_ix = Some(char_set_ix);
+            }
+
+            let mut line_break = "\n".to_string();
+            for _ in 0..self.indent_level + 2 {
+                line_break.push_str("  ");
+            }
+
+            let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty();
+            let has_negative_condition = !negated_chars.is_empty();
+            let has_condition = has_positive_condition || has_negative_condition;
+            if has_condition {
                 add!(self, "if (");
-                self.add_character_range_conditions(&transition.ranges, transition.is_included, 2);
+                if has_positive_condition && has_negative_condition {
+                    add!(self, "(");
+                }
+            }
+
+            if let Some(large_char_set_ix) = large_char_set_ix {
+                let large_set = &self.large_character_sets[large_char_set_ix].1;
+
+                // If the character set contains the null character, check that we
+                // are not at the end of the file.
+                let check_eof = large_set.contains('\0');
+                if check_eof {
+                    add!(self, "(!eof && ")
+                }
+
+                add!(
+                    self,
+                    "set_contains({}, {}, lookahead)",
+                    &self.large_character_set_constant_names[large_char_set_ix],
+                    large_set.range_count(),
+                );
+                if check_eof {
+                    add!(self, ")");
+                }
+            }
+
+            if !asserted_chars.is_empty() {
+                if large_char_set_ix.is_some() {
+                    add!(self, " ||{line_break}");
+                }
+
+                // If the character set contains the max character, then it probably
+                // corresponds to a negated character class in a regex, so it will be more
+                // concise and readable to express it in terms of negated ranges.
+                let is_included = !asserted_chars.contains(char::MAX);
+                if !is_included {
+                    asserted_chars = asserted_chars.negate().add_char('\0');
+                }
+
+                self.add_character_range_conditions(&asserted_chars, is_included, &line_break);
+            }
+
+            if has_negative_condition {
+                if has_positive_condition {
+                    add!(self, ") &&{line_break}");
+                }
+                self.add_character_range_conditions(&negated_chars, false, &line_break);
+            }
+
+            if has_condition {
                 add!(self, ") ");
             }
-            self.add_advance_action(&action);
+
+            self.add_advance_action(action);
             add!(self, "\n");
         }
 
@@ -862,135 +869,106 @@
     fn add_character_range_conditions(
         &mut self,
-        ranges: &[Range<char>],
+        characters: &CharacterSet,
         is_included: bool,
-        indent_count: usize,
+        line_break: &str,
     ) {
-        let mut line_break = "\n".to_string();
-        for _ in 0..self.indent_level + indent_count {
-            line_break.push_str("  ");
-        }
-
-        for (i, range) in ranges.iter().enumerate() {
+        for (i, range) in characters.ranges().enumerate() {
+            let start = *range.start();
+            let end = *range.end();
             if is_included {
                 if i > 0 {
                     add!(self, " ||{line_break}");
                 }
-                // parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with
-                // precedence of `&&` and `||`
-                let mut close_paren = false;
-                if range.start == '\0' {
+
+                if start == '\0' {
                     add!(self, "(!eof && ");
-                    close_paren = true;
-                }
-                if range.end == range.start {
-                    add!(self, "lookahead == ");
-                    self.add_character(range.start);
-                } else if range.end as u32 == range.start as u32 + 1 {
-                    if close_paren {
-                        add!(self, "(");
+                    if end == '\0' {
+                        add!(self, "lookahead == 0");
+                    } else {
+                        add!(self, "lookahead <= ");
                     }
+                    self.add_character(end);
+                    add!(self, ")");
+                    continue;
+                } else if end == start {
                     add!(self, "lookahead == ");
-                    self.add_character(range.start);
+                    self.add_character(start);
+                } else if end as u32 == start as u32 + 1 {
+                    add!(self, "lookahead == ");
+                    self.add_character(start);
                     add!(self, " ||{line_break}lookahead == ");
-                    self.add_character(range.end);
-                    if close_paren {
-                        add!(self, ")");
-                    }
+                    self.add_character(end);
                 } else {
                     add!(self, "(");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " <= lookahead && lookahead <= ");
-                    self.add_character(range.end);
-                    add!(self, ")");
-                }
-                if close_paren {
+                    self.add_character(end);
                     add!(self, ")");
                 }
             } else {
                 if i > 0 {
                     add!(self, " &&{line_break}");
                 }
-                if range.end == range.start {
+                if end == start {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
-                } else if range.end as u32 == range.start as u32 + 1 {
+                    self.add_character(start);
+                } else if end as u32 == start as u32 + 1 {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " &&{line_break}lookahead != ");
-                    self.add_character(range.end);
-                } else if range.start != '\0' {
+                    self.add_character(end);
+                } else if start != '\0' {
                     add!(self, "(lookahead < ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " || ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                     add!(self, " < lookahead)");
                 } else {
                     add!(self, "lookahead > ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                 }
             }
         }
     }
 
-    fn add_character_tree(&mut self, tree: Option<&CharacterTree>) {
-        match tree {
-            Some(CharacterTree::Compare {
-                value,
-                operator,
-                consequence,
-                alternative,
-            }) => {
-                let op = match operator {
-                    Comparator::Less => "<",
-                    Comparator::LessOrEqual => "<=",
-                    Comparator::Equal => "==",
-                    Comparator::GreaterOrEqual => ">=",
-                };
-                let consequence = consequence.as_ref().map(Box::as_ref);
-                let alternative = alternative.as_ref().map(Box::as_ref);
+    fn add_character_set(&mut self, ix: usize) {
+        let (symbol, characters) = self.large_character_sets[ix].clone();
+        let count = self.large_character_sets[0..ix]
+            .iter()
+            .filter(|(sym, _)| *sym == symbol)
+            .count()
+            + 1;
 
-                let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes);
+        let constant_name = if let Some(symbol) = symbol {
+            format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
+        } else {
+            format!("extras_character_set_{}", count)
+        };
+        add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
+        self.large_character_set_constant_names.push(constant_name);
 
-                if !simple {
-                    add!(self, "(");
-                }
-
-                add!(self, "c {op} ");
-                self.add_character(*value);
-
-                if !simple {
-                    if alternative.is_none() {
-                        add!(self, " && ");
-                        self.add_character_tree(consequence);
-                    } else if consequence == Some(&CharacterTree::Yes) {
-                        add!(self, " || ");
-                        self.add_character_tree(alternative);
-                    } else {
-                        add!(self, "\n");
-                        indent!(self);
-                        add_whitespace!(self);
-                        add!(self, "? ");
-                        self.add_character_tree(consequence);
-                        add!(self, "\n");
-                        add_whitespace!(self);
-                        add!(self, ": ");
-                        self.add_character_tree(alternative);
-                        dedent!(self);
-                    }
-                }
-
-                if !simple {
-                    add!(self, ")");
+        indent!(self);
+        for (ix, range) in characters.ranges().enumerate() {
+            let column = ix % 8;
+            if column == 0 {
+                if ix > 0 {
+                    add!(self, "\n");
                 }
+                add_whitespace!(self);
+            } else {
+                add!(self, " ");
             }
-            Some(CharacterTree::Yes) => {
-                add!(self, "true");
-            }
-            None => {
-                add!(self, "false");
-            }
+            add!(self, "{{");
+            self.add_character(*range.start());
+            add!(self, ", ");
+            self.add_character(*range.end());
+            add!(self, "}},");
         }
+        add!(self, "\n");
+        dedent!(self);
+        add_line!(self, "}};");
+        add_line!(self, "");
     }
 
     fn add_advance_action(&mut self, action: &AdvanceAction) {
@@ -1656,10 +1634,12 @@
             '\t' => add!(self, "'\\t'"),
             '\r' => add!(self, "'\\r'"),
             _ => {
-                if c == ' ' || c.is_ascii_graphic() {
+                if c == '\0' {
+                    add!(self, "0")
+                } else if c == ' ' || c.is_ascii_graphic() {
                     add!(self, "'{c}'");
                 } else {
-                    add!(self, "{}", c as u32);
+                    add!(self, "0x{:02x}", c as u32);
                 }
             }
         }
@@ -1686,10 +1666,7 @@
 #[allow(clippy::too_many_arguments)]
 pub fn render_c_code(
     name: &str,
-    parse_table: ParseTable,
-    main_lex_table: LexTable,
-    keyword_lex_table: LexTable,
-    keyword_capture_token: Option<Symbol>,
+    tables: Tables,
     syntax_grammar: SyntaxGrammar,
     lexical_grammar: LexicalGrammar,
     default_aliases: AliasMap,
@@ -1705,10 +1682,12 @@
         indent_level: 0,
         language_name: name.to_string(),
         large_state_count: 0,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        parse_table: tables.parse_table,
+        main_lex_table: tables.main_lex_table,
+        keyword_lex_table: tables.keyword_lex_table,
+        keyword_capture_token: tables.word_token,
+        large_character_sets: tables.large_character_sets,
+        large_character_set_constant_names: Vec::new(),
         syntax_grammar,
         lexical_grammar,
         default_aliases,
diff --git a/lib/src/parser.h b/lib/src/parser.h
index a6081cc3..17f0e94b 100644
--- a/lib/src/parser.h
+++ b/lib/src/parser.h
@@ -86,6 +86,11 @@ typedef union {
   } entry;
 } TSParseActionEntry;
 
+typedef struct {
+  int32_t start;
+  int32_t end;
+} TSCharacterRange;
+
 struct TSLanguage {
   uint32_t version;
   uint32_t symbol_count;
@@ -125,6 +130,24 @@ struct TSLanguage {
   const TSStateId *primary_state_ids;
 };
 
+static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
+  uint32_t index = 0;
+  uint32_t size = len - index;
+  while (size > 1) {
+    uint32_t half_size = size / 2;
+    uint32_t mid_index = index + half_size;
+    TSCharacterRange *range = &ranges[mid_index];
+    if (lookahead >= range->start && lookahead <= range->end) {
+      return true;
+    } else if (lookahead > range->end) {
+      index = mid_index;
+    }
+    size -= half_size;
+  }
+  TSCharacterRange *range = &ranges[index];
+  return (lookahead >= range->start && lookahead <= range->end);
+}
+
 /*
  *  Lexer Macros
  */
@@ -154,6 +177,17 @@ struct TSLanguage {
     goto next_state; \
   }
 
+#define ADVANCE_MAP(...)                                              \
+  {                                                                   \
+    static const uint16_t map[] = { __VA_ARGS__ };                    \
+    for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) {  \
+      if (map[i] == lookahead) {                                      \
+        state = map[i + 1];                                           \
+        goto next_state;                                              \
+      }                                                               \
+    }                                                                 \
+  }
+
 #define SKIP(state_value) \
   { \
     skip = true; \
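
Reviewer note (not part of the patch): the generated `*_character_set_*` arrays are sorted, non-overlapping, inclusive ranges, and `set_contains` binary-searches them. The standalone C sketch below copies `TSCharacterRange` and `set_contains` exactly as defined in the parser.h hunk above and probes a hypothetical identifier-like range table; the sample ranges, probe values, and array name are made up for illustration only.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  int32_t start;
  int32_t end;
} TSCharacterRange;

// Same binary search as the parser.h addition above: `ranges` must be sorted
// by `start`, non-overlapping, and inclusive on both ends.
static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
  uint32_t index = 0;
  uint32_t size = len - index;
  while (size > 1) {
    uint32_t half_size = size / 2;
    uint32_t mid_index = index + half_size;
    TSCharacterRange *range = &ranges[mid_index];
    if (lookahead >= range->start && lookahead <= range->end) {
      return true;
    } else if (lookahead > range->end) {
      index = mid_index;
    }
    size -= half_size;
  }
  TSCharacterRange *range = &ranges[index];
  return (lookahead >= range->start && lookahead <= range->end);
}

int main(void) {
  // Hypothetical large character set, shaped like a generated
  // `sym_identifier_character_set_1` array (the name and ranges are illustrative).
  TSCharacterRange ranges[] = {
    {'0', '9'}, {'A', 'Z'}, {'_', '_'}, {'a', 'z'},
    {0x00c0, 0x00d6}, {0x00d8, 0x00f6}, {0x00f8, 0x02ff}, {0x0370, 0x037d},
  };
  uint32_t len = sizeof(ranges) / sizeof(ranges[0]);
  int32_t probes[] = {'a', 'Z', '!', 0x00d7, 0x0371};
  for (uint32_t i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
    printf("U+%04X -> %d\n", (unsigned)probes[i], set_contains(ranges, len, probes[i]));
  }
  return 0;
}
```

With this in place, a lex state in the generated parser can emit a single `set_contains(<table>, <len>, lookahead)` call instead of a long chain of range comparisons, and runs of single-character transitions go through the new `ADVANCE_MAP(...)` lookup table rather than individual `if` statements.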