From be34bc9430a247c94a11a30ecef7f0101b35c8c2 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Tue, 9 Apr 2024 17:53:37 -0700
Subject: [PATCH] Identify large char sets for lexer using NFA transitions

---
 .../generate/build_tables/build_lex_table.rs |  62 +++-
 cli/src/generate/build_tables/mod.rs         |  52 +--
 cli/src/generate/mod.rs                      |  72 ++--
 cli/src/generate/nfa.rs                      | 127 ++++---
 cli/src/generate/render.rs                   | 317 +++++++-----
 5 files changed, 313 insertions(+), 317 deletions(-)

diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs
index bc65447c..f9f1b99c 100644
--- a/cli/src/generate/build_tables/build_lex_table.rs
+++ b/cli/src/generate/build_tables/build_lex_table.rs
@@ -1,15 +1,23 @@
-use super::coincident_tokens::CoincidentTokenIndex;
-use super::token_conflicts::TokenConflictMap;
-use crate::generate::dedup::split_state_id_groups;
-use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
-use crate::generate::nfa::NfaCursor;
-use crate::generate::rules::{Symbol, TokenSet};
-use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
+use crate::generate::{
+    build_tables::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap},
+    dedup::split_state_id_groups,
+    grammars::{LexicalGrammar, SyntaxGrammar},
+    nfa::{CharacterSet, NfaCursor, NfaState},
+    rules::{Symbol, TokenSet},
+    tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
+};
 use log::info;
-use std::collections::hash_map::Entry;
-use std::collections::{HashMap, VecDeque};
+use std::collections::{hash_map::Entry, HashMap, VecDeque};
 use std::mem;
 
+const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
+
+pub struct LexTables {
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_lex_table(
     parse_table: &mut ParseTable,
     syntax_grammar: &SyntaxGrammar,
@@ -17,7 +25,7 @@
     keywords: &TokenSet,
     coincident_token_index: &CoincidentTokenIndex,
     token_conflict_map: &TokenConflictMap,
-) -> (LexTable, LexTable) {
+) -> LexTables {
     let keyword_lex_table = if syntax_grammar.word_token.is_some() {
         let mut builder = LexTableBuilder::new(lexical_grammar);
         builder.add_state_for_tokens(keywords);
@@ -74,10 +82,36 @@
         }
     }
 
-    let mut table = builder.table;
-    minimize_lex_table(&mut table, parse_table);
-    sort_states(&mut table, parse_table);
-    (table, keyword_lex_table)
+    let mut main_lex_table = builder.table;
+    minimize_lex_table(&mut main_lex_table, parse_table);
+    sort_states(&mut main_lex_table, parse_table);
+
+    let mut large_character_sets = Vec::new();
+    for (state_ix, state) in lexical_grammar.nfa.states.iter().enumerate() {
+        if let NfaState::Advance { chars, is_sep, .. } = state {
+            if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT {
+                let symbol = if *is_sep {
+                    None
+                } else {
+                    let ix = lexical_grammar
+                        .variables
+                        .iter()
+                        .position(|v| v.start_state >= state_ix as u32);
+                    ix.map(Symbol::terminal)
+                };
+
+                if !large_character_sets.iter().any(|(_, set)| set == chars) {
+                    large_character_sets.push((symbol, chars.clone()));
+                }
+            }
+        }
+    }
+
+    LexTables {
+        main_lex_table,
+        keyword_lex_table,
+        large_character_sets,
+    }
 }
 
 struct QueueEntry {
diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs
index edb13cac..c7d49242 100644
--- a/cli/src/generate/build_tables/mod.rs
+++ b/cli/src/generate/build_tables/mod.rs
@@ -1,25 +1,37 @@
-pub mod build_lex_table;
-pub mod build_parse_table;
+mod build_lex_table;
+mod build_parse_table;
 mod coincident_tokens;
 mod item;
 mod item_set_builder;
 mod minimize_parse_table;
 mod token_conflicts;
 
-use self::build_lex_table::build_lex_table;
-use self::build_parse_table::{build_parse_table, ParseStateInfo};
-use self::coincident_tokens::CoincidentTokenIndex;
-use self::minimize_parse_table::minimize_parse_table;
-use self::token_conflicts::TokenConflictMap;
-use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use crate::generate::nfa::NfaCursor;
-use crate::generate::node_types::VariableInfo;
-use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
-use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
+use self::{
+    build_lex_table::build_lex_table,
+    build_parse_table::{build_parse_table, ParseStateInfo},
+    coincident_tokens::CoincidentTokenIndex,
+    minimize_parse_table::minimize_parse_table,
+    token_conflicts::TokenConflictMap,
+};
+use crate::generate::{
+    grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
+    nfa::{CharacterSet, NfaCursor},
+    node_types::VariableInfo,
+    rules::{AliasMap, Symbol, SymbolType, TokenSet},
+    tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
+};
 use anyhow::Result;
 use log::info;
 use std::collections::{BTreeSet, HashMap};
 
+pub struct Tables {
+    pub parse_table: ParseTable,
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub word_token: Option<Symbol>,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_tables(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
@@ -27,7 +39,7 @@
     variable_info: &[VariableInfo],
     inlines: &InlinedProductionMap,
     report_symbol_name: Option<&str>,
-) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
+) -> Result<Tables> {
     let (mut parse_table, following_tokens, parse_state_info) =
         build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
     let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
@@ -56,7 +68,7 @@
         &token_conflict_map,
         &keywords,
     );
-    let (main_lex_table, keyword_lex_table) = build_lex_table(
+    let lex_tables = build_lex_table(
         &mut parse_table,
         syntax_grammar,
         lexical_grammar,
@@ -76,12 +88,14 @@
             report_symbol_name,
         );
     }
-    Ok((
+
+    Ok(Tables {
         parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        syntax_grammar.word_token,
-    ))
+        main_lex_table: lex_tables.main_lex_table,
+        keyword_lex_table: lex_tables.keyword_lex_table,
+        large_character_sets: lex_tables.large_character_sets,
+        word_token: syntax_grammar.word_token,
+    })
 }
 
 fn populate_error_state(
diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs
index 493dde77..f2ecbcb1 100644
--- a/cli/src/generate/mod.rs
+++ b/cli/src/generate/mod.rs
@@ -1,21 +1,18 @@
+use self::grammars::InputGrammar;
+use anyhow::{anyhow, Context, Result};
+use build_tables::build_tables;
+use grammar_files::path_in_ignore;
+use lazy_static::lazy_static;
+use parse_grammar::parse_grammar;
+use prepare_grammar::prepare_grammar;
+use regex::{Regex, RegexBuilder};
+use render::render_c_code;
+use semver::Version;
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::{env, fs};
 
-use anyhow::{anyhow, Context, Result};
-use lazy_static::lazy_static;
-use regex::{Regex, RegexBuilder};
-use semver::Version;
-
-use build_tables::build_tables;
-use grammar_files::path_in_ignore;
-use grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use parse_grammar::parse_grammar;
-use prepare_grammar::prepare_grammar;
-use render::render_c_code;
-use rules::AliasMap;
-
 mod build_tables;
 mod dedup;
 mod grammar_files;
@@ -103,23 +100,12 @@ pub fn generate_parser_in_directory(
 
     // Parse and preprocess the grammar.
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let language_name = input_grammar.name;
 
     // Generate the parser and related files.
     let GeneratedParser {
         c_code,
         node_types_json,
-    } = generate_parser_for_grammar_with_opts(
-        &language_name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        abi_version,
-        report_symbol_name,
-    )?;
+    } = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;
 
     write_file(&src_path.join("parser.c"), c_code)?;
     write_file(&src_path.join("node-types.json"), node_types_json)?;
@@ -128,7 +114,7 @@ pub fn generate_parser_in_directory(
     write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
 
     if !path_in_ignore(&repo_path) {
-        grammar_files::generate_grammar_files(&repo_path, &language_name, generate_bindings)?;
+        grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
     }
 
     Ok(())
@@ -137,29 +123,18 @@ pub fn generate_parser_in_directory(
 pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
     let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let parser = generate_parser_for_grammar_with_opts(
-        &input_grammar.name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        tree_sitter::LANGUAGE_VERSION,
-        None,
-    )?;
-    Ok((input_grammar.name, parser.c_code))
+    let parser =
+        generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
+    Ok((input_grammar.name.clone(), parser.c_code))
 }
 
 fn generate_parser_for_grammar_with_opts(
-    name: &str,
-    syntax_grammar: SyntaxGrammar,
-    lexical_grammar: LexicalGrammar,
-    inlines: &InlinedProductionMap,
-    simple_aliases: AliasMap,
+    input_grammar: &InputGrammar,
    abi_version: usize,
    report_symbol_name: Option<&str>,
 ) -> Result<GeneratedParser> {
+    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
+        prepare_grammar(input_grammar)?;
     let variable_info =
         node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
     let node_types_json = node_types::generate_node_types_json(
@@ -168,20 +143,17 @@ fn generate_parser_for_grammar_with_opts(
         &simple_aliases,
         &variable_info,
     );
-    let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
+    let tables = build_tables(
         &syntax_grammar,
         &lexical_grammar,
         &simple_aliases,
         &variable_info,
-        inlines,
+        &inlines,
         report_symbol_name,
     )?;
     let c_code = render_c_code(
-        name,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        &input_grammar.name,
+        tables,
         syntax_grammar,
         lexical_grammar,
         simple_aliases,
diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs
index 66f78074..0bfd05e3 100644
--- a/cli/src/generate/nfa.rs
+++ b/cli/src/generate/nfa.rs
@@ -1,10 +1,10 @@
-use std::char;
-use std::cmp::max;
-use std::cmp::Ordering;
-use std::collections::HashSet;
-use std::fmt;
-use std::mem::swap;
-use std::ops::Range;
+use std::{
+    char,
+    cmp::{max, Ordering},
+    fmt,
+    mem::swap,
+    ops::{Range, RangeInclusive},
+};
 
 /// A set of characters represented as a vector of ranges.
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -114,6 +114,11 @@ impl CharacterSet {
         self
     }
 
+    pub fn assign(&mut self, other: &Self) {
+        self.ranges.clear();
+        self.ranges.extend_from_slice(&other.ranges);
+    }
+
     fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
         while i < self.ranges.len() {
             let range = &mut self.ranges[i];
@@ -285,12 +290,24 @@ impl CharacterSet {
         self.add(&other)
     }
 
-    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
-        self.ranges.iter().flat_map(std::clone::Clone::clone)
+    pub fn char_codes(&self) -> impl Iterator<Item = u32> + '_ {
+        self.ranges.iter().flat_map(Clone::clone)
     }
 
     pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
-        self.iter().filter_map(char::from_u32)
+        self.char_codes().filter_map(char::from_u32)
+    }
+
+    pub fn range_count(&self) -> usize {
+        self.ranges.len()
+    }
+
+    pub fn ranges(&self) -> impl Iterator<Item = RangeInclusive<char>> + '_ {
+        self.ranges.iter().filter_map(|range| {
+            let start = range.clone().find_map(char::from_u32)?;
+            let end = (range.start..range.end).rev().find_map(char::from_u32)?;
+            Some(start..=end)
+        })
     }
 
     pub fn is_empty(&self) -> bool {
@@ -299,41 +316,46 @@ impl CharacterSet {
 
     /// Get a reduced list of character ranges, assuming that a given
     /// set of characters can be safely ignored.
-    pub fn simplify_ignoring<'a>(
-        &'a self,
-        ruled_out_characters: &'a HashSet<u32>,
-    ) -> Vec<Range<char>> {
-        let mut prev_range: Option<Range<char>> = None;
-        self.chars()
-            .map(|c| (c, false))
-            .chain(Some(('\0', true)))
-            .filter_map(move |(c, done)| {
-                if done {
-                    return prev_range.clone();
-                }
-                if ruled_out_characters.contains(&(c as u32)) {
-                    return None;
-                }
-                if let Some(range) = prev_range.clone() {
-                    let mut prev_range_successor = range.end as u32 + 1;
-                    while prev_range_successor < c as u32 {
-                        if !ruled_out_characters.contains(&prev_range_successor) {
-                            prev_range = Some(c..c);
-                            return Some(range);
-                        }
-                        prev_range_successor += 1;
-                    }
-                    prev_range = Some(range.start..c);
-                } else {
-                    prev_range = Some(c..c);
-                }
-                None
-            })
-            .collect()
+    pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self {
+        let mut prev_range: Option<Range<u32>> = None;
+        Self {
+            ranges: self
+                .char_codes()
+                .map(|c| (c, false))
+                .chain(Some(('\0' as u32, true)))
+                .filter_map(move |(c, done)| {
+                    if done {
+                        return prev_range.clone();
+                    }
+                    if ruled_out_characters.contains_code(c) {
+                        return None;
+                    }
+                    if let Some(range) = prev_range.clone() {
+                        let mut prev_range_successor = range.end as u32;
+                        while prev_range_successor < c as u32 {
+                            if !ruled_out_characters.contains_code(prev_range_successor) {
+                                prev_range = Some(c..(c + 1));
+                                return Some(range);
+                            }
+                            prev_range_successor += 1;
+                        }
+                        prev_range = Some(range.start..(c + 1));
+                    } else {
+                        prev_range = Some(c..(c + 1));
+                    }
+                    None
+                })
+                .collect(),
+        }
     }
 
     pub fn contains(&self, c: char) -> bool {
-        self.ranges.iter().any(|r| r.contains(&(c as u32)))
+        self.contains_code(c as u32)
+    }
+
+    fn contains_code(&self, c: u32) -> bool {
+        self.ranges.iter().any(|r| r.contains(&c))
     }
 }
 
@@ -1033,7 +1055,7 @@ mod tests {
 
     #[test]
     #[allow(clippy::single_range_in_vec_init)]
-    fn test_character_set_get_ranges() {
+    fn test_character_set_simplify_ignoring() {
         struct Row {
             chars: Vec<char>,
             ruled_out_chars: Vec<char>,
             expected_ranges: Vec<Range<char>>,
         }
@@ -1056,6 +1078,11 @@ mod tests {
                 ruled_out_chars: vec!['d', 'f', 'g'],
                 expected_ranges: vec!['a'..'h', 'z'..'z'],
             },
+            Row {
+                chars: vec!['a', 'b', 'c', 'g', 'h', 'i'],
+                ruled_out_chars: vec!['d', 'j'],
+                expected_ranges: vec!['a'..'c', 'g'..'i'],
+            },
         ];
 
         for Row {
@@ -1064,13 +1091,23 @@ mod tests {
             expected_ranges,
         } in &table
         {
-            let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect();
+            let ruled_out_chars = ruled_out_chars
+                .iter()
+                .fold(CharacterSet::empty(), |set, c| set.add_char(*c));
             let mut set = CharacterSet::empty();
             for c in chars {
                 set = set.add_char(*c);
            }
-            let ranges = set.simplify_ignoring(&ruled_out_chars);
-            assert_eq!(ranges, *expected_ranges);
+            let actual = set.simplify_ignoring(&ruled_out_chars);
+            let expected = expected_ranges
+                .iter()
+                .fold(CharacterSet::empty(), |set, range| {
+                    set.add_range(range.start, range.end)
+                });
+            assert_eq!(
+                actual, expected,
+                "chars: {chars:?}, ruled out chars: {ruled_out_chars:?}"
+            );
         }
     }
 }
diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs
index a8221af1..bb6573b6 100644
--- a/cli/src/generate/render.rs
+++ b/cli/src/generate/render.rs
@@ -1,12 +1,13 @@
 use super::{
+    build_tables::Tables,
     grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
+    nfa::CharacterSet,
     rules::{Alias, AliasMap, Symbol, SymbolType},
     tables::{
         AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
         ParseTableEntry,
     },
 };
-use core::ops::Range;
 use std::{
     cmp,
     collections::{HashMap, HashSet},
@@ -62,6 +63,8 @@ struct Generator {
     parse_table: ParseTable,
     main_lex_table: LexTable,
     keyword_lex_table: LexTable,
+    large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+    large_character_set_constant_names: Vec<String>,
     large_state_count: usize,
     keyword_capture_token: Option<Symbol>,
     syntax_grammar: SyntaxGrammar,
@@ -78,18 +81,6 @@ struct Generator {
     abi_version: usize,
 }
 
-struct TransitionSummary {
-    is_included: bool,
-    ranges: Vec<Range<char>>,
-    call_id: Option<usize>,
-}
-
-struct LargeCharacterSetInfo {
-    ranges: Vec<Range<char>>,
-    symbol: Symbol,
-    index: usize,
-}
-
 impl Generator {
     fn generate(mut self) -> String {
         self.init();
@@ -117,14 +108,20 @@ impl Generator {
             self.add_primary_state_id_list();
         }
 
+        // Generate a helper function for each large character set.
+        for ix in 0..self.large_character_sets.len() {
+            self.add_character_set(ix);
+        }
+
         let mut main_lex_table = LexTable::default();
         swap(&mut main_lex_table, &mut self.main_lex_table);
-        self.add_lex_function("ts_lex", main_lex_table, true);
+        self.add_lex_function("ts_lex", main_lex_table);
 
         if self.keyword_capture_token.is_some() {
             let mut keyword_lex_table = LexTable::default();
             swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
-            self.add_lex_function("ts_lex_keywords", keyword_lex_table, false);
+            self.add_lex_function("ts_lex_keywords", keyword_lex_table);
         }
 
         self.add_lex_modes_list();
@@ -662,83 +659,7 @@ impl Generator {
         add_line!(self, "");
     }
 
-    fn add_lex_function(
-        &mut self,
-        name: &str,
-        lex_table: LexTable,
-        extract_helper_functions: bool,
-    ) {
-        let mut ruled_out_chars = HashSet::new();
-        let mut large_character_sets = Vec::<LargeCharacterSetInfo>::new();
-
-        // For each lex state, compute a summary of the code that needs to be
-        // generated.
-        let state_transition_summaries = lex_table
-            .states
-            .iter()
-            .map(|state| {
-                ruled_out_chars.clear();
-
-                // For each state transition, compute the set of character ranges
-                // that need to be checked.
-                state
-                    .advance_actions
-                    .iter()
-                    .map(|(chars, action)| {
-                        let is_included = !chars.contains(std::char::MAX);
-                        let mut ranges;
-                        if is_included {
-                            ranges = chars.simplify_ignoring(&ruled_out_chars);
-                            ruled_out_chars.extend(chars.iter());
-                        } else {
-                            ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars);
-                            ranges.insert(0, '\0'..'\0');
-                        }
-
-                        // Record any large character sets so that they can be extracted
-                        // into helper functions, reducing code duplication.
-                        let mut call_id = None;
-                        if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT {
-                            let char_set_symbol = self
-                                .symbol_for_advance_action(action, &lex_table)
-                                .expect("No symbol for lex state");
-                            let mut count_for_symbol = 0;
-                            for (i, info) in large_character_sets.iter_mut().enumerate() {
-                                if info.ranges == ranges {
-                                    call_id = Some(i);
-                                    break;
-                                }
-                                if info.symbol == char_set_symbol {
-                                    count_for_symbol += 1;
-                                }
-                            }
-                            if call_id.is_none() {
-                                call_id = Some(large_character_sets.len());
-                                large_character_sets.push(LargeCharacterSetInfo {
-                                    symbol: char_set_symbol,
-                                    index: count_for_symbol + 1,
-                                    ranges: ranges.clone(),
-                                });
-                            }
-                        }
-
-                        TransitionSummary {
-                            is_included,
-                            ranges,
-                            call_id,
-                        }
-                    })
-                    .collect()
-            })
-            .collect::<Vec<Vec<TransitionSummary>>>();
-
-        // Generate a helper function for each large character set.
-        let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
-        sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
-        for info in sorted_large_char_sets {
-            self.add_character_set(info);
-        }
-
+    fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
         add_line!(
             self,
             "static bool {name}(TSLexer *lexer, TSStateId state) {{",
         );
@@ -753,7 +674,7 @@ impl Generator {
         for (i, state) in lex_table.states.into_iter().enumerate() {
             add_line!(self, "case {i}:");
             indent!(self);
-            self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets);
+            self.add_lex_state(state);
             dedent!(self);
         }
 
@@ -770,35 +691,7 @@ impl Generator {
         add_line!(self, "");
     }
 
-    fn symbol_for_advance_action(
-        &self,
-        action: &AdvanceAction,
-        lex_table: &LexTable,
-    ) -> Option<Symbol> {
-        let mut state_ids = vec![action.state];
-        let mut i = 0;
-        while i < state_ids.len() {
-            let id = state_ids[i];
-            let state = &lex_table.states[id];
-            if let Some(accept) = state.accept_action {
-                return Some(accept);
-            }
-            for (_, action) in &state.advance_actions {
-                if !state_ids.contains(&action.state) {
-                    state_ids.push(action.state);
-                }
-            }
-            i += 1;
-        }
-        None
-    }
-
-    fn add_lex_state(
-        &mut self,
-        state: LexState,
-        transition_info: &[TransitionSummary],
-        large_character_sets: &[LargeCharacterSetInfo],
-    ) {
+    fn add_lex_state(&mut self, state: LexState) {
         if let Some(accept_action) = state.accept_action {
             add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
         }
@@ -807,37 +700,69 @@ impl Generator {
             add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
         }
 
-        for (i, (_, action)) in state.advance_actions.into_iter().enumerate() {
-            let transition = &transition_info[i];
+        let mut chars_copy = CharacterSet::empty();
+        let mut large_set = CharacterSet::empty();
+        let mut ruled_out_chars = CharacterSet::empty();
+
+        for (chars, action) in state.advance_actions {
             add_whitespace!(self);
 
+            // For each state transition, compute the set of character ranges
+            // that need to be checked.
+            let simplified = chars.simplify_ignoring(&ruled_out_chars);
+            ruled_out_chars = ruled_out_chars.add(&chars);
+            let mut chars = simplified;
+
+            // Find a large character set that matches the transition's character set,
+            // allowing for ruled-out characters for previous transitions.
+            let mut call_id = None;
+            if chars.range_count() >= LARGE_CHARACTER_RANGE_COUNT {
+                for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
+                    chars_copy.assign(&chars);
+                    large_set.assign(&set);
+                    chars_copy.remove_intersection(&mut large_set);
+                    if chars_copy.is_empty()
+                        && large_set.chars().all(|c| ruled_out_chars.contains(c))
+                    {
+                        call_id = Some(ix);
+                        break;
+                    }
+                }
+            }
+
+            let mut in_condition = false;
+            if call_id.is_some() || !chars.is_empty() {
+                add!(self, "if (");
+                in_condition = true;
+            }
+
             // If there is a helper function for this transition's character
             // set, then generate a call to that helper function.
-            if let Some(call_id) = transition.call_id {
-                let info = &large_character_sets[call_id];
-                add!(self, "if (");
-                if !transition.is_included {
-                    add!(self, "!");
-                }
+            if let Some(call_id) = call_id {
                 add!(
                     self,
-                    "set_contains({}_character_set_{}, {}, lookahead)) ",
-                    self.symbol_ids[&info.symbol],
-                    info.index,
-                    info.ranges.len(),
+                    "set_contains({}, {}, lookahead)",
+                    self.large_character_set_constant_names[call_id],
+                    self.large_character_sets[call_id].1.range_count(),
                 );
-                self.add_advance_action(&action);
-                add!(self, "\n");
-                continue;
             }
-
             // Otherwise, generate code to compare the lookahead character
             // with all of the character ranges.
-            if !transition.ranges.is_empty() {
-                add!(self, "if (");
-                self.add_character_range_conditions(&transition.ranges, transition.is_included, 2);
+            else if !chars.is_empty() {
+                let is_included = !chars.contains(char::MAX);
+                if !is_included {
+                    chars = chars.negate().add_char('\0');
+                }
+                self.add_character_range_conditions(&chars, is_included, 2);
+            }
+
+            if in_condition {
                 add!(self, ") ");
             }
+
             self.add_advance_action(&action);
             add!(self, "\n");
         }
@@ -847,7 +772,7 @@ impl Generator {
 
     fn add_character_range_conditions(
         &mut self,
-        ranges: &[Range<char>],
+        characters: &CharacterSet,
         is_included: bool,
         indent_count: usize,
     ) {
@@ -859,86 +784,99 @@ impl Generator {
         // parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with
         // precedence of `&&` and `||`
         let (mut need_open_paren, mut need_close_paren) = (false, false);
-        for (i, range) in ranges.iter().enumerate() {
+        for (i, range) in characters.ranges().enumerate() {
+            let start = *range.start();
+            let end = *range.end();
             if is_included {
                 if i > 0 {
                     add!(self, " ||{line_break}");
                 }
 
-                if range.start == '\0' {
+                if start == '\0' {
                     add!(self, "!eof && ");
                     (need_open_paren, need_close_paren) = (true, true);
                 }
 
-                if range.end == range.start {
+                if end == start {
                     if need_open_paren {
                         add!(self, "(");
                         need_open_paren = false;
                     }
                     add!(self, "lookahead == ");
-                    self.add_character(range.start);
-                    if need_close_paren && i == ranges.len() - 1 {
+                    self.add_character(start);
+                    if need_close_paren && i == characters.range_count() - 1 {
                         add!(self, ")");
                         need_close_paren = false;
                     }
-                } else if range.end as u32 == range.start as u32 + 1 {
+                } else if end as u32 == start as u32 + 1 {
                     add!(self, "lookahead == ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " ||{line_break}lookahead == ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                 } else {
                     add!(self, "(");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " <= lookahead && lookahead <= ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                     add!(self, ")");
                 }
             } else {
                 if i > 0 {
                     add!(self, " &&{line_break}");
                 }
-                if range.end == range.start {
+                if end == start {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
-                } else if range.end as u32 == range.start as u32 + 1 {
+                    self.add_character(start);
+                } else if end as u32 == start as u32 + 1 {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " &&{line_break}lookahead != ");
-                    self.add_character(range.end);
-                } else if range.start != '\0' {
+                    self.add_character(end);
+                } else if start != '\0' {
                     add!(self, "(lookahead < ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " || ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                     add!(self, " < lookahead)");
                 } else {
                     add!(self, "lookahead > ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                 }
             }
         }
     }
 
-    fn add_character_set(&mut self, info: &LargeCharacterSetInfo) {
-        add_line!(
-            self,
-            "static TSCharacterRange {}_character_set_{}[] = {{",
-            self.symbol_ids[&info.symbol],
-            info.index
-        );
+    fn add_character_set(&mut self, ix: usize) {
+        let (symbol, characters) = self.large_character_sets[ix].clone();
+        let count = self.large_character_sets[0..ix]
+            .iter()
+            .filter(|(sym, _)| *sym == symbol)
+            .count();
+
+        let constant_name = if let Some(symbol) = symbol {
+            format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
+        } else {
+            format!("extras_character_set_{}", count)
+        };
+        add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
+        self.large_character_set_constant_names.push(constant_name);
+
         indent!(self);
-        for chunk in info.ranges.chunks(8) {
-            add_whitespace!(self);
-            for (i, range) in chunk.iter().enumerate() {
-                if i > 0 {
-                    add!(self, " ");
-                }
-                add!(self, "{{");
-                self.add_character(range.start);
-                add!(self, ", ");
-                self.add_character(range.end);
-                add!(self, "}},");
-            }
-            add!(self, "\n");
+        for (ix, range) in characters.ranges().enumerate() {
+            let column = ix % 8;
+            if column == 0 {
+                if ix > 0 {
+                    add!(self, "\n");
+                }
+                add_whitespace!(self);
+            } else {
+                add!(self, " ");
+            }
+            add!(self, "{{");
+            self.add_character(*range.start());
+            add!(self, ", ");
+            self.add_character(*range.end());
+            add!(self, "}},");
         }
+        add!(self, "\n");
         dedent!(self);
         add_line!(self, "}};");
         add_line!(self, "");
@@ -1610,10 +1548,12 @@ impl Generator {
             '\t' => add!(self, "'\\t'"),
             '\r' => add!(self, "'\\r'"),
             _ => {
-                if c == ' ' || c.is_ascii_graphic() {
+                if c == '\0' {
+                    add!(self, "0")
+                } else if c == ' ' || c.is_ascii_graphic() {
                     add!(self, "'{c}'");
                 } else {
-                    add!(self, "{}", c as u32);
+                    add!(self, "0x{:02x}", c as u32);
                 }
             }
         }
@@ -1641,10 +1581,7 @@
 #[allow(clippy::too_many_arguments)]
 pub fn render_c_code(
     name: &str,
-    parse_table: ParseTable,
-    main_lex_table: LexTable,
-    keyword_lex_table: LexTable,
-    keyword_capture_token: Option<Symbol>,
+    tables: Tables,
     syntax_grammar: SyntaxGrammar,
     lexical_grammar: LexicalGrammar,
     default_aliases: AliasMap,
@@ -1660,10 +1597,12 @@ pub fn render_c_code(
         indent_level: 0,
         language_name: name.to_string(),
         large_state_count: 0,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        parse_table: tables.parse_table,
+        main_lex_table: tables.main_lex_table,
+        keyword_lex_table: tables.keyword_lex_table,
+        keyword_capture_token: tables.word_token,
+        large_character_sets: tables.large_character_sets,
+        large_character_set_constant_names: Vec::new(),
         syntax_grammar,
         lexical_grammar,
        default_aliases,