From 39be6972fe97eec0c69543f386877d1d19475457 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 29 Mar 2024 14:30:25 -0700 Subject: [PATCH 1/8] Use static arrays and a fixed binary search for large char set checks --- cli/src/generate/char_tree.rs | 133 ---------------------------------- cli/src/generate/mod.rs | 1 - cli/src/generate/render.rs | 99 +++++++------------------ lib/src/parser.h | 23 ++++++ 4 files changed, 49 insertions(+), 207 deletions(-) delete mode 100644 cli/src/generate/char_tree.rs diff --git a/cli/src/generate/char_tree.rs b/cli/src/generate/char_tree.rs deleted file mode 100644 index 2e28d56f..00000000 --- a/cli/src/generate/char_tree.rs +++ /dev/null @@ -1,133 +0,0 @@ -use std::ops::Range; - -/// A set of characters represented as a balanced binary tree of comparisons. -/// This is used as an intermediate step in generating efficient code for -/// matching a given character set. -#[derive(PartialEq, Eq)] -pub enum CharacterTree { - Yes, - Compare { - value: char, - operator: Comparator, - consequence: Option>, - alternative: Option>, - }, -} - -#[derive(PartialEq, Eq)] -pub enum Comparator { - Less, - LessOrEqual, - Equal, - GreaterOrEqual, -} - -impl CharacterTree { - pub fn from_ranges(ranges: &[Range]) -> Option { - match ranges.len() { - 0 => None, - 1 => { - let range = &ranges[0]; - if range.start == range.end { - Some(Self::Compare { - operator: Comparator::Equal, - value: range.start, - consequence: Some(Box::new(Self::Yes)), - alternative: None, - }) - } else { - Some(Self::Compare { - operator: Comparator::GreaterOrEqual, - value: range.start, - consequence: Some(Box::new(Self::Compare { - operator: Comparator::LessOrEqual, - value: range.end, - consequence: Some(Box::new(Self::Yes)), - alternative: None, - })), - alternative: None, - }) - } - } - len => { - let mid = len / 2; - let mid_range = &ranges[mid]; - Some(Self::Compare { - operator: Comparator::Less, - value: mid_range.start, - consequence: 
Self::from_ranges(&ranges[0..mid]).map(Box::new), - alternative: Some(Box::new(Self::Compare { - operator: Comparator::LessOrEqual, - value: mid_range.end, - consequence: Some(Box::new(Self::Yes)), - alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new), - })), - }) - } - } - } - - #[cfg(test)] - fn contains(&self, c: char) -> bool { - match self { - Self::Yes => true, - Self::Compare { - value, - operator, - alternative, - consequence, - } => { - let condition = match operator { - Comparator::Less => c < *value, - Comparator::LessOrEqual => c <= *value, - Comparator::Equal => c == *value, - Comparator::GreaterOrEqual => c >= *value, - }; - if condition { consequence } else { alternative } - .as_ref() - .map_or(false, |a| a.contains(c)) - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_character_tree_simple() { - let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z']) - .unwrap(); - - assert!(tree.contains('a')); - assert!(tree.contains('b')); - assert!(tree.contains('c')); - assert!(tree.contains('d')); - - assert!(!tree.contains('e')); - assert!(!tree.contains('f')); - assert!(!tree.contains('g')); - - assert!(tree.contains('h')); - assert!(tree.contains('i')); - assert!(tree.contains('j')); - assert!(tree.contains('k')); - assert!(tree.contains('l')); - - assert!(!tree.contains('m')); - assert!(!tree.contains('n')); - assert!(!tree.contains('o')); - - assert!(tree.contains('p')); - assert!(tree.contains('q')); - assert!(tree.contains('r')); - - assert!(!tree.contains('s')); - assert!(!tree.contains('s')); - - assert!(tree.contains('u')); - - assert!(!tree.contains('v')); - } -} diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index ea850c8d..493dde77 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -17,7 +17,6 @@ use render::render_c_code; use rules::AliasMap; mod build_tables; -mod char_tree; mod dedup; mod grammar_files; mod grammars; diff --git 
a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 1de8a069..a8221af1 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1,5 +1,4 @@ use super::{ - char_tree::{CharacterTree, Comparator}, grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}, rules::{Alias, AliasMap, Symbol, SymbolType}, tables::{ @@ -737,21 +736,7 @@ impl Generator { let mut sorted_large_char_sets = large_character_sets.iter().collect::>(); sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index)); for info in sorted_large_char_sets { - add_line!( - self, - "static inline bool {}_character_set_{}(int32_t c) {{", - self.symbol_ids[&info.symbol], - info.index - ); - indent!(self); - add_whitespace!(self); - add!(self, "return "); - let tree = CharacterTree::from_ranges(&info.ranges); - self.add_character_tree(tree.as_ref()); - add!(self, ";\n"); - dedent!(self); - add_line!(self, "}}"); - add_line!(self, ""); + self.add_character_set(info); } add_line!( @@ -836,9 +821,10 @@ impl Generator { } add!( self, - "{}_character_set_{}(lookahead)) ", + "set_contains({}_character_set_{}, {}, lookahead)) ", self.symbol_ids[&info.symbol], - info.index + info.index, + info.ranges.len(), ); self.add_advance_action(&action); add!(self, "\n"); @@ -931,64 +917,31 @@ impl Generator { } } - fn add_character_tree(&mut self, tree: Option<&CharacterTree>) { - match tree { - Some(CharacterTree::Compare { - value, - operator, - consequence, - alternative, - }) => { - let op = match operator { - Comparator::Less => "<", - Comparator::LessOrEqual => "<=", - Comparator::Equal => "==", - Comparator::GreaterOrEqual => ">=", - }; - let consequence = consequence.as_ref().map(Box::as_ref); - let alternative = alternative.as_ref().map(Box::as_ref); - - let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes); - - if !simple { - add!(self, "("); - } - - add!(self, "c {op} "); - self.add_character(*value); - - if !simple { - if 
alternative.is_none() { - add!(self, " && "); - self.add_character_tree(consequence); - } else if consequence == Some(&CharacterTree::Yes) { - add!(self, " || "); - self.add_character_tree(alternative); - } else { - add!(self, "\n"); - indent!(self); - add_whitespace!(self); - add!(self, "? "); - self.add_character_tree(consequence); - add!(self, "\n"); - add_whitespace!(self); - add!(self, ": "); - self.add_character_tree(alternative); - dedent!(self); - } - } - - if !simple { - add!(self, ")"); + fn add_character_set(&mut self, info: &LargeCharacterSetInfo) { + add_line!( + self, + "static TSCharacterRange {}_character_set_{}[] = {{", + self.symbol_ids[&info.symbol], + info.index + ); + indent!(self); + for chunk in info.ranges.chunks(8) { + add_whitespace!(self); + for (i, range) in chunk.iter().enumerate() { + if i > 0 { + add!(self, " "); } + add!(self, "{{"); + self.add_character(range.start); + add!(self, ", "); + self.add_character(range.end); + add!(self, "}},"); } - Some(CharacterTree::Yes) => { - add!(self, "true"); - } - None => { - add!(self, "false"); - } + add!(self, "\n"); } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); } fn add_advance_action(&mut self, action: &AdvanceAction) { diff --git a/lib/src/parser.h b/lib/src/parser.h index 17b4fde9..718f2bd6 100644 --- a/lib/src/parser.h +++ b/lib/src/parser.h @@ -86,6 +86,11 @@ typedef union { } entry; } TSParseActionEntry; +typedef struct { + int32_t start; + int32_t end; +} TSCharacterRange; + struct TSLanguage { uint32_t version; uint32_t symbol_count; @@ -125,6 +130,24 @@ struct TSLanguage { const TSStateId *primary_state_ids; }; +static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) { + uint32_t index = 0; + uint32_t size = len - index; + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = index + half_size; + TSCharacterRange *range = &ranges[mid_index]; + if (lookahead >= range->start && lookahead <= range->end) { + 
return true; + } else if (lookahead > range->end) { + index = mid_index; + } + size -= half_size; + } + TSCharacterRange *range = &ranges[index]; + return (lookahead >= range->start && lookahead <= range->end); +} + /* * Lexer Macros */ From be34bc9430a247c94a11a30ecef7f0101b35c8c2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 9 Apr 2024 17:53:37 -0700 Subject: [PATCH 2/8] Identify large char sets for lexer using NFA transitions --- .../generate/build_tables/build_lex_table.rs | 62 +++- cli/src/generate/build_tables/mod.rs | 52 +-- cli/src/generate/mod.rs | 72 ++-- cli/src/generate/nfa.rs | 127 ++++--- cli/src/generate/render.rs | 317 +++++++----------- 5 files changed, 313 insertions(+), 317 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index bc65447c..f9f1b99c 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -1,15 +1,23 @@ -use super::coincident_tokens::CoincidentTokenIndex; -use super::token_conflicts::TokenConflictMap; -use crate::generate::dedup::split_state_id_groups; -use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::generate::nfa::NfaCursor; -use crate::generate::rules::{Symbol, TokenSet}; -use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}; +use crate::generate::{ + build_tables::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap}, + dedup::split_state_id_groups, + grammars::{LexicalGrammar, SyntaxGrammar}, + nfa::{CharacterSet, NfaCursor, NfaState}, + rules::{Symbol, TokenSet}, + tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}, +}; use log::info; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; +use std::collections::{hash_map::Entry, HashMap, VecDeque}; use std::mem; +const LARGE_CHARACTER_RANGE_COUNT: usize = 8; + +pub struct LexTables { + pub 
main_lex_table: LexTable, + pub keyword_lex_table: LexTable, + pub large_character_sets: Vec<(Option, CharacterSet)>, +} + pub fn build_lex_table( parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar, @@ -17,7 +25,7 @@ pub fn build_lex_table( keywords: &TokenSet, coincident_token_index: &CoincidentTokenIndex, token_conflict_map: &TokenConflictMap, -) -> (LexTable, LexTable) { +) -> LexTables { let keyword_lex_table = if syntax_grammar.word_token.is_some() { let mut builder = LexTableBuilder::new(lexical_grammar); builder.add_state_for_tokens(keywords); @@ -74,10 +82,36 @@ pub fn build_lex_table( } } - let mut table = builder.table; - minimize_lex_table(&mut table, parse_table); - sort_states(&mut table, parse_table); - (table, keyword_lex_table) + let mut main_lex_table = builder.table; + minimize_lex_table(&mut main_lex_table, parse_table); + sort_states(&mut main_lex_table, parse_table); + + let mut large_character_sets = Vec::new(); + for (state_ix, state) in lexical_grammar.nfa.states.iter().enumerate() { + if let NfaState::Advance { chars, is_sep, .. 
} = state { + if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT { + let symbol = if *is_sep { + None + } else { + let ix = lexical_grammar + .variables + .iter() + .position(|v| v.start_state >= state_ix as u32); + ix.map(Symbol::terminal) + }; + + if !large_character_sets.iter().any(|(_, set)| set == chars) { + large_character_sets.push((symbol, chars.clone())); + } + } + } + } + + LexTables { + main_lex_table, + keyword_lex_table, + large_character_sets, + } } struct QueueEntry { diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index edb13cac..c7d49242 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -1,25 +1,37 @@ -pub mod build_lex_table; -pub mod build_parse_table; +mod build_lex_table; +mod build_parse_table; mod coincident_tokens; mod item; mod item_set_builder; mod minimize_parse_table; mod token_conflicts; -use self::build_lex_table::build_lex_table; -use self::build_parse_table::{build_parse_table, ParseStateInfo}; -use self::coincident_tokens::CoincidentTokenIndex; -use self::minimize_parse_table::minimize_parse_table; -use self::token_conflicts::TokenConflictMap; -use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use crate::generate::nfa::NfaCursor; -use crate::generate::node_types::VariableInfo; -use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet}; -use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry}; +use self::{ + build_lex_table::build_lex_table, + build_parse_table::{build_parse_table, ParseStateInfo}, + coincident_tokens::CoincidentTokenIndex, + minimize_parse_table::minimize_parse_table, + token_conflicts::TokenConflictMap, +}; +use crate::generate::{ + grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}, + nfa::{CharacterSet, NfaCursor}, + node_types::VariableInfo, + rules::{AliasMap, Symbol, SymbolType, TokenSet}, + tables::{LexTable, ParseAction, ParseTable, 
ParseTableEntry}, +}; use anyhow::Result; use log::info; use std::collections::{BTreeSet, HashMap}; +pub struct Tables { + pub parse_table: ParseTable, + pub main_lex_table: LexTable, + pub keyword_lex_table: LexTable, + pub word_token: Option, + pub large_character_sets: Vec<(Option, CharacterSet)>, +} + pub fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, @@ -27,7 +39,7 @@ pub fn build_tables( variable_info: &[VariableInfo], inlines: &InlinedProductionMap, report_symbol_name: Option<&str>, -) -> Result<(ParseTable, LexTable, LexTable, Option)> { +) -> Result { let (mut parse_table, following_tokens, parse_state_info) = build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); @@ -56,7 +68,7 @@ pub fn build_tables( &token_conflict_map, &keywords, ); - let (main_lex_table, keyword_lex_table) = build_lex_table( + let lex_tables = build_lex_table( &mut parse_table, syntax_grammar, lexical_grammar, @@ -76,12 +88,14 @@ pub fn build_tables( report_symbol_name, ); } - Ok(( + + Ok(Tables { parse_table, - main_lex_table, - keyword_lex_table, - syntax_grammar.word_token, - )) + main_lex_table: lex_tables.main_lex_table, + keyword_lex_table: lex_tables.keyword_lex_table, + large_character_sets: lex_tables.large_character_sets, + word_token: syntax_grammar.word_token, + }) } fn populate_error_state( diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 493dde77..f2ecbcb1 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -1,21 +1,18 @@ +use self::grammars::InputGrammar; +use anyhow::{anyhow, Context, Result}; +use build_tables::build_tables; +use grammar_files::path_in_ignore; +use lazy_static::lazy_static; +use parse_grammar::parse_grammar; +use prepare_grammar::prepare_grammar; +use regex::{Regex, RegexBuilder}; +use render::render_c_code; +use semver::Version; use std::io::Write; use std::path::{Path, 
PathBuf}; use std::process::{Command, Stdio}; use std::{env, fs}; -use anyhow::{anyhow, Context, Result}; -use lazy_static::lazy_static; -use regex::{Regex, RegexBuilder}; -use semver::Version; - -use build_tables::build_tables; -use grammar_files::path_in_ignore; -use grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; -use parse_grammar::parse_grammar; -use prepare_grammar::prepare_grammar; -use render::render_c_code; -use rules::AliasMap; - mod build_tables; mod dedup; mod grammar_files; @@ -103,23 +100,12 @@ pub fn generate_parser_in_directory( // Parse and preprocess the grammar. let input_grammar = parse_grammar(&grammar_json)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(&input_grammar)?; - let language_name = input_grammar.name; // Generate the parser and related files. let GeneratedParser { c_code, node_types_json, - } = generate_parser_for_grammar_with_opts( - &language_name, - syntax_grammar, - lexical_grammar, - &inlines, - simple_aliases, - abi_version, - report_symbol_name, - )?; + } = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?; write_file(&src_path.join("parser.c"), c_code)?; write_file(&src_path.join("node-types.json"), node_types_json)?; @@ -128,7 +114,7 @@ pub fn generate_parser_in_directory( write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?; if !path_in_ignore(&repo_path) { - grammar_files::generate_grammar_files(&repo_path, &language_name, generate_bindings)?; + grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?; } Ok(()) @@ -137,29 +123,18 @@ pub fn generate_parser_in_directory( pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> { let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n"); let input_grammar = parse_grammar(&grammar_json)?; - let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = - prepare_grammar(&input_grammar)?; - 
let parser = generate_parser_for_grammar_with_opts( - &input_grammar.name, - syntax_grammar, - lexical_grammar, - &inlines, - simple_aliases, - tree_sitter::LANGUAGE_VERSION, - None, - )?; - Ok((input_grammar.name, parser.c_code)) + let parser = + generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?; + Ok((input_grammar.name.clone(), parser.c_code)) } fn generate_parser_for_grammar_with_opts( - name: &str, - syntax_grammar: SyntaxGrammar, - lexical_grammar: LexicalGrammar, - inlines: &InlinedProductionMap, - simple_aliases: AliasMap, + input_grammar: &InputGrammar, abi_version: usize, report_symbol_name: Option<&str>, ) -> Result { + let (syntax_grammar, lexical_grammar, inlines, simple_aliases) = + prepare_grammar(input_grammar)?; let variable_info = node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?; let node_types_json = node_types::generate_node_types_json( @@ -168,20 +143,17 @@ fn generate_parser_for_grammar_with_opts( &simple_aliases, &variable_info, ); - let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables( + let tables = build_tables( &syntax_grammar, &lexical_grammar, &simple_aliases, &variable_info, - inlines, + &inlines, report_symbol_name, )?; let c_code = render_c_code( - name, - parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, + &input_grammar.name, + tables, syntax_grammar, lexical_grammar, simple_aliases, diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 66f78074..0bfd05e3 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs @@ -1,10 +1,10 @@ -use std::char; -use std::cmp::max; -use std::cmp::Ordering; -use std::collections::HashSet; -use std::fmt; -use std::mem::swap; -use std::ops::Range; +use std::{ + char, + cmp::{max, Ordering}, + fmt, + mem::swap, + ops::{Range, RangeInclusive}, +}; /// A set of characters represented as a vector of ranges. 
#[derive(Clone, PartialEq, Eq, Hash)] @@ -114,6 +114,11 @@ impl CharacterSet { self } + pub fn assign(&mut self, other: &Self) { + self.ranges.clear(); + self.ranges.extend_from_slice(&other.ranges); + } + fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize { while i < self.ranges.len() { let range = &mut self.ranges[i]; @@ -285,12 +290,24 @@ impl CharacterSet { self.add(&other) } - pub fn iter(&self) -> impl Iterator + '_ { - self.ranges.iter().flat_map(std::clone::Clone::clone) + pub fn char_codes(&self) -> impl Iterator + '_ { + self.ranges.iter().flat_map(Clone::clone) } pub fn chars(&self) -> impl Iterator + '_ { - self.iter().filter_map(char::from_u32) + self.char_codes().filter_map(char::from_u32) + } + + pub fn range_count(&self) -> usize { + self.ranges.len() + } + + pub fn ranges(&self) -> impl Iterator> + '_ { + self.ranges.iter().filter_map(|range| { + let start = range.clone().find_map(char::from_u32)?; + let end = (range.start..range.end).rev().find_map(char::from_u32)?; + Some(start..=end) + }) } pub fn is_empty(&self) -> bool { @@ -299,41 +316,46 @@ impl CharacterSet { /// Get a reduced list of character ranges, assuming that a given /// set of characters can be safely ignored. 
- pub fn simplify_ignoring<'a>( - &'a self, - ruled_out_characters: &'a HashSet, - ) -> Vec> { - let mut prev_range: Option> = None; - self.chars() - .map(|c| (c, false)) - .chain(Some(('\0', true))) - .filter_map(move |(c, done)| { - if done { - return prev_range.clone(); - } - if ruled_out_characters.contains(&(c as u32)) { - return None; - } - if let Some(range) = prev_range.clone() { - let mut prev_range_successor = range.end as u32 + 1; - while prev_range_successor < c as u32 { - if !ruled_out_characters.contains(&prev_range_successor) { - prev_range = Some(c..c); - return Some(range); - } - prev_range_successor += 1; + pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self { + let mut prev_range: Option> = None; + Self { + ranges: self + .char_codes() + .map(|c| (c, false)) + .chain(Some(('\0' as u32, true))) + .filter_map(move |(c, done)| { + if done { + return prev_range.clone(); } - prev_range = Some(range.start..c); - } else { - prev_range = Some(c..c); - } - None - }) - .collect() + if ruled_out_characters.contains_code(c) { + return None; + } + if let Some(range) = prev_range.clone() { + let mut prev_range_successor = range.end as u32; + while prev_range_successor < c as u32 { + if !ruled_out_characters.contains_code(prev_range_successor) { + prev_range = Some(c..(c + 1)); + return Some(range); + } + prev_range_successor += 1; + } + prev_range = Some(range.start..(c + 1)); + } else { + prev_range = Some(c..(c + 1)); + } + None + }) + .collect(), + } } pub fn contains(&self, c: char) -> bool { - self.ranges.iter().any(|r| r.contains(&(c as u32))) + self.contains_code(c as u32) + } + + fn contains_code(&self, c: u32) -> bool { + // self.ranges.iter().any(|r| r.start <= c && r.end >= c) + self.ranges.iter().any(|r| r.contains(&c)) } } @@ -1033,7 +1055,7 @@ mod tests { #[test] #[allow(clippy::single_range_in_vec_init)] - fn test_character_set_get_ranges() { + fn test_character_set_simplify_ignoring() { struct Row { chars: Vec, ruled_out_chars: 
Vec, @@ -1056,6 +1078,11 @@ mod tests { ruled_out_chars: vec!['d', 'f', 'g'], expected_ranges: vec!['a'..'h', 'z'..'z'], }, + Row { + chars: vec!['a', 'b', 'c', 'g', 'h', 'i'], + ruled_out_chars: vec!['d', 'j'], + expected_ranges: vec!['a'..'c', 'g'..'i'], + }, ]; for Row { @@ -1064,13 +1091,23 @@ mod tests { expected_ranges, } in &table { - let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect(); + let ruled_out_chars = ruled_out_chars + .iter() + .fold(CharacterSet::empty(), |set, c| set.add_char(*c)); let mut set = CharacterSet::empty(); for c in chars { set = set.add_char(*c); } - let ranges = set.simplify_ignoring(&ruled_out_chars); - assert_eq!(ranges, *expected_ranges); + let actual = set.simplify_ignoring(&ruled_out_chars); + let expected = expected_ranges + .iter() + .fold(CharacterSet::empty(), |set, range| { + set.add_range(range.start, range.end) + }); + assert_eq!( + actual, expected, + "chars: {chars:?}, ruled out chars: {ruled_out_chars:?}" + ); } } } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index a8221af1..bb6573b6 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1,12 +1,13 @@ use super::{ + build_tables::Tables, grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}, + nfa::CharacterSet, rules::{Alias, AliasMap, Symbol, SymbolType}, tables::{ AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry, }, }; -use core::ops::Range; use std::{ cmp, collections::{HashMap, HashSet}, @@ -62,6 +63,8 @@ struct Generator { parse_table: ParseTable, main_lex_table: LexTable, keyword_lex_table: LexTable, + large_character_sets: Vec<(Option, CharacterSet)>, + large_character_set_constant_names: Vec, large_state_count: usize, keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, @@ -78,18 +81,6 @@ struct Generator { abi_version: usize, } -struct TransitionSummary { - is_included: bool, - ranges: Vec>, - call_id: 
Option, -} - -struct LargeCharacterSetInfo { - ranges: Vec>, - symbol: Symbol, - index: usize, -} - impl Generator { fn generate(mut self) -> String { self.init(); @@ -117,14 +108,20 @@ impl Generator { self.add_primary_state_id_list(); } + // Generate a helper function for each large character set. + // let mut sorted_large_char_sets = self.large_character_sets.iter().collect::>(); + for ix in 0..self.large_character_sets.len() { + self.add_character_set(ix); + } + let mut main_lex_table = LexTable::default(); swap(&mut main_lex_table, &mut self.main_lex_table); - self.add_lex_function("ts_lex", main_lex_table, true); + self.add_lex_function("ts_lex", main_lex_table); if self.keyword_capture_token.is_some() { let mut keyword_lex_table = LexTable::default(); swap(&mut keyword_lex_table, &mut self.keyword_lex_table); - self.add_lex_function("ts_lex_keywords", keyword_lex_table, false); + self.add_lex_function("ts_lex_keywords", keyword_lex_table); } self.add_lex_modes_list(); @@ -662,83 +659,7 @@ impl Generator { add_line!(self, ""); } - fn add_lex_function( - &mut self, - name: &str, - lex_table: LexTable, - extract_helper_functions: bool, - ) { - let mut ruled_out_chars = HashSet::new(); - let mut large_character_sets = Vec::::new(); - - // For each lex state, compute a summary of the code that needs to be - // generated. - let state_transition_summaries = lex_table - .states - .iter() - .map(|state| { - ruled_out_chars.clear(); - - // For each state transition, compute the set of character ranges - // that need to be checked. 
- state - .advance_actions - .iter() - .map(|(chars, action)| { - let is_included = !chars.contains(std::char::MAX); - let mut ranges; - if is_included { - ranges = chars.simplify_ignoring(&ruled_out_chars); - ruled_out_chars.extend(chars.iter()); - } else { - ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars); - ranges.insert(0, '\0'..'\0'); - } - - // Record any large character sets so that they can be extracted - // into helper functions, reducing code duplication. - let mut call_id = None; - if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT { - let char_set_symbol = self - .symbol_for_advance_action(action, &lex_table) - .expect("No symbol for lex state"); - let mut count_for_symbol = 0; - for (i, info) in large_character_sets.iter_mut().enumerate() { - if info.ranges == ranges { - call_id = Some(i); - break; - } - if info.symbol == char_set_symbol { - count_for_symbol += 1; - } - } - if call_id.is_none() { - call_id = Some(large_character_sets.len()); - large_character_sets.push(LargeCharacterSetInfo { - symbol: char_set_symbol, - index: count_for_symbol + 1, - ranges: ranges.clone(), - }); - } - } - - TransitionSummary { - is_included, - ranges, - call_id, - } - }) - .collect() - }) - .collect::>>(); - - // Generate a helper function for each large character set. 
- let mut sorted_large_char_sets = large_character_sets.iter().collect::>(); - sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index)); - for info in sorted_large_char_sets { - self.add_character_set(info); - } - + fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { add_line!( self, "static bool {name}(TSLexer *lexer, TSStateId state) {{", @@ -753,7 +674,7 @@ impl Generator { for (i, state) in lex_table.states.into_iter().enumerate() { add_line!(self, "case {i}:"); indent!(self); - self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets); + self.add_lex_state(state); dedent!(self); } @@ -770,35 +691,7 @@ impl Generator { add_line!(self, ""); } - fn symbol_for_advance_action( - &self, - action: &AdvanceAction, - lex_table: &LexTable, - ) -> Option { - let mut state_ids = vec![action.state]; - let mut i = 0; - while i < state_ids.len() { - let id = state_ids[i]; - let state = &lex_table.states[id]; - if let Some(accept) = state.accept_action { - return Some(accept); - } - for (_, action) in &state.advance_actions { - if !state_ids.contains(&action.state) { - state_ids.push(action.state); - } - } - i += 1; - } - None - } - - fn add_lex_state( - &mut self, - state: LexState, - transition_info: &[TransitionSummary], - large_character_sets: &[LargeCharacterSetInfo], - ) { + fn add_lex_state(&mut self, state: LexState) { if let Some(accept_action) = state.accept_action { add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } @@ -807,37 +700,69 @@ impl Generator { add_line!(self, "if (eof) ADVANCE({});", eof_action.state); } - for (i, (_, action)) in state.advance_actions.into_iter().enumerate() { - let transition = &transition_info[i]; + let mut chars_copy = CharacterSet::empty(); + let mut large_set = CharacterSet::empty(); + let mut ruled_out_chars = CharacterSet::empty(); + + for (chars, action) in state.advance_actions { add_whitespace!(self); + // For each state transition, compute the set 
of character ranges + // that need to be checked. + let simplified = chars.simplify_ignoring(&ruled_out_chars); + ruled_out_chars = ruled_out_chars.add(&chars); + let mut chars = simplified; + + // Find a large character set that matches the transition's character set, + // allowing for ruled-out characters for previous transitions. + let mut call_id = None; + if chars.range_count() >= LARGE_CHARACTER_RANGE_COUNT { + for (ix, (_, set)) in self.large_character_sets.iter().enumerate() { + chars_copy.assign(&chars); + large_set.assign(&set); + chars_copy.remove_intersection(&mut large_set); + if chars_copy.is_empty() + && large_set.chars().all(|c| ruled_out_chars.contains(c)) + { + call_id = Some(ix); + break; + } + } + } + + let mut in_condition = false; + if call_id.is_some() || !chars.is_empty() { + add!(self, "if ("); + in_condition = true; + } + // If there is a helper function for this transition's character // set, then generate a call to that helper function. - if let Some(call_id) = transition.call_id { - let info = &large_character_sets[call_id]; - add!(self, "if ("); - if !transition.is_included { - add!(self, "!"); - } + if let Some(call_id) = call_id { add!( self, - "set_contains({}_character_set_{}, {}, lookahead)) ", - self.symbol_ids[&info.symbol], - info.index, - info.ranges.len(), + "set_contains({}, {}, lookahead)", + self.large_character_set_constant_names[call_id], + chars.range_count(), ); - self.add_advance_action(&action); - add!(self, "\n"); - continue; } - // Otherwise, generate code to compare the lookahead character // with all of the character ranges. 
- if !transition.ranges.is_empty() { - add!(self, "if ("); - self.add_character_range_conditions(&transition.ranges, transition.is_included, 2); + else if !chars.is_empty() { + if call_id.is_some() { + add!(self, " || "); + } + let is_included = !chars.contains(char::MAX); + if !is_included { + chars = chars.negate().add_char('\0'); + } + self.add_character_range_conditions(&chars, is_included, 2); + } + + if in_condition { add!(self, ") "); } + self.add_advance_action(&action); add!(self, "\n"); } @@ -847,7 +772,7 @@ impl Generator { fn add_character_range_conditions( &mut self, - ranges: &[Range], + characters: &CharacterSet, is_included: bool, indent_count: usize, ) { @@ -859,86 +784,99 @@ impl Generator { // parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with // precedence of `&&` and `||` let (mut need_open_paren, mut need_close_paren) = (false, false); - for (i, range) in ranges.iter().enumerate() { + for (i, range) in characters.ranges().enumerate() { + let start = *range.start(); + let end = *range.end(); if is_included { if i > 0 { add!(self, " ||{line_break}"); } - if range.start == '\0' { + if start == '\0' { add!(self, "!eof && "); (need_open_paren, need_close_paren) = (true, true); } - if range.end == range.start { + if end == start { if need_open_paren { add!(self, "("); need_open_paren = false; } add!(self, "lookahead == "); - self.add_character(range.start); - if need_close_paren && i == ranges.len() - 1 { + self.add_character(start); + if need_close_paren && i == characters.range_count() - 1 { add!(self, ")"); need_close_paren = false; } - } else if range.end as u32 == range.start as u32 + 1 { + } else if end as u32 == start as u32 + 1 { add!(self, "lookahead == "); - self.add_character(range.start); + self.add_character(start); add!(self, " ||{line_break}lookahead == "); - self.add_character(range.end); + self.add_character(end); } else { add!(self, "("); - self.add_character(range.start); + 
self.add_character(start); add!(self, " <= lookahead && lookahead <= "); - self.add_character(range.end); + self.add_character(end); add!(self, ")"); } } else { if i > 0 { add!(self, " &&{line_break}"); } - if range.end == range.start { + if end == start { add!(self, "lookahead != "); - self.add_character(range.start); - } else if range.end as u32 == range.start as u32 + 1 { + self.add_character(start); + } else if end as u32 == start as u32 + 1 { add!(self, "lookahead != "); - self.add_character(range.start); + self.add_character(start); add!(self, " &&{line_break}lookahead != "); - self.add_character(range.end); - } else if range.start != '\0' { + self.add_character(end); + } else if start != '\0' { add!(self, "(lookahead < "); - self.add_character(range.start); + self.add_character(start); add!(self, " || "); - self.add_character(range.end); + self.add_character(end); add!(self, " < lookahead)"); } else { add!(self, "lookahead > "); - self.add_character(range.end); + self.add_character(end); } } } } - fn add_character_set(&mut self, info: &LargeCharacterSetInfo) { - add_line!( - self, - "static TSCharacterRange {}_character_set_{}[] = {{", - self.symbol_ids[&info.symbol], - info.index - ); + fn add_character_set(&mut self, ix: usize) { + let (symbol, characters) = self.large_character_sets[ix].clone(); + let count = self.large_character_sets[0..ix] + .iter() + .filter(|(sym, _)| *sym == symbol) + .count(); + + let constant_name = if let Some(symbol) = symbol { + format!("{}_character_set_{}", self.symbol_ids[&symbol], count) + } else { + format!("extras_character_set_{}", count) + }; + add_line!(self, "static TSCharacterRange {}[] = {{", constant_name); + self.large_character_set_constant_names.push(constant_name); + indent!(self); - for chunk in info.ranges.chunks(8) { - add_whitespace!(self); - for (i, range) in chunk.iter().enumerate() { - if i > 0 { - add!(self, " "); + for (ix, range) in characters.ranges().enumerate() { + let column = ix % 8; + if column 
== 0 { + if ix > 0 { + add!(self, "\n"); } - add!(self, "{{"); - self.add_character(range.start); - add!(self, ", "); - self.add_character(range.end); - add!(self, "}},"); + add_whitespace!(self); + } else { + add!(self, " "); } - add!(self, "\n"); + add!(self, "{{"); + self.add_character(*range.start()); + add!(self, ", "); + self.add_character(*range.end()); + add!(self, "}},"); } + add!(self, "\n"); dedent!(self); add_line!(self, "}};"); add_line!(self, ""); @@ -1610,10 +1548,12 @@ impl Generator { '\t' => add!(self, "'\\t'"), '\r' => add!(self, "'\\r'"), _ => { - if c == ' ' || c.is_ascii_graphic() { + if c == '\0' { + add!(self, "0") + } else if c == ' ' || c.is_ascii_graphic() { add!(self, "'{c}'"); } else { - add!(self, "{}", c as u32); + add!(self, "0x{:02x}", c as u32); } } } @@ -1641,10 +1581,7 @@ impl Generator { #[allow(clippy::too_many_arguments)] pub fn render_c_code( name: &str, - parse_table: ParseTable, - main_lex_table: LexTable, - keyword_lex_table: LexTable, - keyword_capture_token: Option, + tables: Tables, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, default_aliases: AliasMap, @@ -1660,10 +1597,12 @@ pub fn render_c_code( indent_level: 0, language_name: name.to_string(), large_state_count: 0, - parse_table, - main_lex_table, - keyword_lex_table, - keyword_capture_token, + parse_table: tables.parse_table, + main_lex_table: tables.main_lex_table, + keyword_lex_table: tables.keyword_lex_table, + keyword_capture_token: tables.word_token, + large_character_sets: tables.large_character_sets, + large_character_set_constant_names: Vec::new(), syntax_grammar, lexical_grammar, default_aliases, From 3d088888f571d71811825c27660354e42083e985 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 10 Apr 2024 16:02:28 -0700 Subject: [PATCH 3/8] Derive large character sets from lex states for individual tokens --- .../generate/build_tables/build_lex_table.rs | 51 ++++++---- cli/src/generate/build_tables/mod.rs | 2 + cli/src/generate/nfa.rs 
| 86 +++++++++++------ cli/src/generate/render.rs | 94 ++++++++++++------- 4 files changed, 149 insertions(+), 84 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index f9f1b99c..1bb55acd 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -2,7 +2,7 @@ use crate::generate::{ build_tables::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap}, dedup::split_state_id_groups, grammars::{LexicalGrammar, SyntaxGrammar}, - nfa::{CharacterSet, NfaCursor, NfaState}, + nfa::{CharacterSet, NfaCursor}, rules::{Symbol, TokenSet}, tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable}, }; @@ -10,7 +10,7 @@ use log::info; use std::collections::{hash_map::Entry, HashMap, VecDeque}; use std::mem; -const LARGE_CHARACTER_RANGE_COUNT: usize = 8; +pub const LARGE_CHARACTER_RANGE_COUNT: usize = 8; pub struct LexTables { pub main_lex_table: LexTable, @@ -82,27 +82,36 @@ pub fn build_lex_table( } } - let mut main_lex_table = builder.table; + let mut main_lex_table = mem::take(&mut builder.table); minimize_lex_table(&mut main_lex_table, parse_table); sort_states(&mut main_lex_table, parse_table); let mut large_character_sets = Vec::new(); - for (state_ix, state) in lexical_grammar.nfa.states.iter().enumerate() { - if let NfaState::Advance { chars, is_sep, .. 
} = state { - if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT { - let symbol = if *is_sep { - None - } else { - let ix = lexical_grammar - .variables - .iter() - .position(|v| v.start_state >= state_ix as u32); - ix.map(Symbol::terminal) - }; - - if !large_character_sets.iter().any(|(_, set)| set == chars) { - large_character_sets.push((symbol, chars.clone())); + for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() { + let symbol = Symbol::terminal(variable_ix); + builder.reset(); + builder.add_state_for_tokens(&TokenSet::from_iter([symbol])); + for state in &builder.table.states { + let mut characters = CharacterSet::empty(); + for (chars, action) in &state.advance_actions { + if action.in_main_token { + characters = characters.add(&chars); + continue; } + + if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT + && !large_character_sets.iter().any(|(_, set)| set == chars) + { + large_character_sets.push((None, chars.clone())); + } + } + + if characters.range_count() > LARGE_CHARACTER_RANGE_COUNT + && !large_character_sets + .iter() + .any(|(_, set)| *set == characters) + { + large_character_sets.push((Some(symbol), characters)); } } } @@ -139,6 +148,12 @@ impl<'a> LexTableBuilder<'a> { } } + fn reset(&mut self) { + self.table = LexTable::default(); + self.state_queue.clear(); + self.state_ids_by_nfa_state_set.clear(); + } + fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize { let mut eof_valid = false; let nfa_states = tokens diff --git a/cli/src/generate/build_tables/mod.rs b/cli/src/generate/build_tables/mod.rs index c7d49242..e6277742 100644 --- a/cli/src/generate/build_tables/mod.rs +++ b/cli/src/generate/build_tables/mod.rs @@ -24,6 +24,8 @@ use anyhow::Result; use log::info; use std::collections::{BTreeSet, HashMap}; +pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT; + pub struct Tables { pub parse_table: ParseTable, pub main_lex_table: LexTable, diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 
0bfd05e3..4ad9f153 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs @@ -2,12 +2,12 @@ use std::{ char, cmp::{max, Ordering}, fmt, - mem::swap, + mem::{self, swap}, ops::{Range, RangeInclusive}, }; /// A set of characters represented as a vector of ranges. -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Clone, Default, PartialEq, Eq, Hash)] pub struct CharacterSet { ranges: Vec>, } @@ -320,35 +320,51 @@ impl CharacterSet { let mut prev_range: Option> = None; Self { ranges: self - .char_codes() - .map(|c| (c, false)) - .chain(Some(('\0' as u32, true))) - .filter_map(move |(c, done)| { - if done { - return prev_range.clone(); - } - if ruled_out_characters.contains_code(c) { - return None; - } - if let Some(range) = prev_range.clone() { - let mut prev_range_successor = range.end as u32; - while prev_range_successor < c as u32 { - if !ruled_out_characters.contains_code(prev_range_successor) { - prev_range = Some(c..(c + 1)); - return Some(range); - } - prev_range_successor += 1; + .ranges + .iter() + .map(|range| Some(range.clone())) + .chain([None]) + .filter_map(move |range| { + if let Some(range) = &range { + if ruled_out_characters.contains_codepoint_range(range.clone()) { + return None; + } + + if let Some(prev_range) = &mut prev_range { + if ruled_out_characters + .contains_codepoint_range(prev_range.end..range.start) + { + prev_range.end = range.end; + return None; + } } - prev_range = Some(range.start..(c + 1)); - } else { - prev_range = Some(c..(c + 1)); } - None + + let result = prev_range.clone(); + prev_range = range; + result }) .collect(), } } + pub fn contains_codepoint_range(&self, seek_range: Range) -> bool { + let ix = match self.ranges.binary_search_by(|probe| { + if probe.start < seek_range.start { + Ordering::Less + } else if probe.start > seek_range.start { + Ordering::Greater + } else { + Ordering::Equal + } + }) { + Ok(ix) | Err(ix) => ix, + }; + self.ranges.get(ix).map_or(false, |range| { + range.start <= seek_range.start && 
range.end >= seek_range.end + }) + } + pub fn contains(&self, c: char) -> bool { self.contains_code(c as u32) } @@ -408,11 +424,11 @@ impl fmt::Debug for CharacterSet { write!(f, "^ ")?; set = set.negate(); } - for (i, c) in set.chars().enumerate() { + for (i, range) in set.ranges().enumerate() { if i > 0 { write!(f, ", ")?; } - write!(f, "{c:?}")?; + write!(f, "{range:?}")?; } write!(f, "]")?; Ok(()) @@ -524,17 +540,17 @@ impl<'a> NfaCursor<'a> { result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters)); let mut i = 0; - 'i_loop: while i < result.len() { + while i < result.len() { for j in 0..i { if result[j].states == result[i].states && result[j].is_separator == result[i].is_separator && result[j].precedence == result[i].precedence { - let mut characters = CharacterSet::empty(); - swap(&mut characters, &mut result[j].characters); + let characters = mem::take(&mut result[j].characters); result[j].characters = characters.add(&result[i].characters); result.remove(i); - continue 'i_loop; + i -= 1; + break; } } i += 1; @@ -1083,6 +1099,16 @@ mod tests { ruled_out_chars: vec!['d', 'j'], expected_ranges: vec!['a'..'c', 'g'..'i'], }, + Row { + chars: vec!['c', 'd', 'e', 'g', 'h'], + ruled_out_chars: vec!['a', 'b', 'c', 'd', 'e', 'f'], + expected_ranges: vec!['g'..'h'], + }, + Row { + chars: vec!['I', 'N'], + ruled_out_chars: vec!['A', 'I', 'N', 'Z'], + expected_ranges: vec![], + }, ]; for Row { diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index bb6573b6..e3057cbb 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -15,7 +15,6 @@ use std::{ mem::swap, }; -const LARGE_CHARACTER_RANGE_COUNT: usize = 8; const SMALL_STATE_THRESHOLD: usize = 64; const ABI_VERSION_MIN: usize = 13; const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION; @@ -674,7 +673,7 @@ impl Generator { for (i, state) in lex_table.states.into_iter().enumerate() { add_line!(self, "case {i}:"); indent!(self); - self.add_lex_state(state); + 
self.add_lex_state(i, state); dedent!(self); } @@ -691,7 +690,7 @@ impl Generator { add_line!(self, ""); } - fn add_lex_state(&mut self, state: LexState) { + fn add_lex_state(&mut self, _state_ix: usize, state: LexState) { if let Some(accept_action) = state.accept_action { add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]); } @@ -709,54 +708,81 @@ impl Generator { // For each state transition, compute the set of character ranges // that need to be checked. - let simplified = chars.simplify_ignoring(&ruled_out_chars); - ruled_out_chars = ruled_out_chars.add(&chars); - let mut chars = simplified; + let simplified_chars = chars.simplify_ignoring(&ruled_out_chars); // Find a large character set that matches the transition's character set, // allowing for ruled-out characters for previous transitions. - let mut call_id = None; - if chars.range_count() >= LARGE_CHARACTER_RANGE_COUNT { + let mut best_large_char_set: Option<(usize, CharacterSet, CharacterSet)> = None; + if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT { for (ix, (_, set)) in self.large_character_sets.iter().enumerate() { - chars_copy.assign(&chars); + chars_copy.assign(&simplified_chars); large_set.assign(&set); - chars_copy.remove_intersection(&mut large_set); - if chars_copy.is_empty() - && large_set.chars().all(|c| ruled_out_chars.contains(c)) - { - call_id = Some(ix); - break; + let intersection = chars_copy.remove_intersection(&mut large_set); + if !intersection.is_empty() { + let additions = chars_copy.simplify_ignoring(&ruled_out_chars); + let exclusions = large_set.simplify_ignoring(&ruled_out_chars); + if let Some((_, best_additions, best_exclusions)) = &best_large_char_set { + if best_additions.range_count() + best_exclusions.range_count() + < additions.range_count() + exclusions.range_count() + { + continue; + } + } + best_large_char_set = Some((ix, additions, exclusions)); } } } + ruled_out_chars = ruled_out_chars.add(&chars); + + let mut 
large_char_set_ix = None; + let mut asserted_chars = simplified_chars; + let mut negated_chars = CharacterSet::empty(); + if let Some((char_set_ix, additions, exclusions)) = best_large_char_set { + asserted_chars = additions; + negated_chars = exclusions; + large_char_set_ix = Some(char_set_ix); + } + + let mut line_break = "\n".to_string(); + for _ in 0..self.indent_level + 2 { + line_break.push_str(" "); + } + let mut in_condition = false; - if call_id.is_some() || !chars.is_empty() { + if large_char_set_ix.is_some() + || !asserted_chars.is_empty() + || !negated_chars.is_empty() + { add!(self, "if ("); in_condition = true; } - // If there is a helper function for this transition's character - // set, then generate a call to that helper function. - if let Some(call_id) = call_id { + if let Some(large_char_set_ix) = large_char_set_ix { add!( self, "set_contains({}, {}, lookahead)", - self.large_character_set_constant_names[call_id], - chars.range_count(), + self.large_character_set_constant_names[large_char_set_ix], + self.large_character_sets[large_char_set_ix].1.range_count(), ); } - // Otherwise, generate code to compare the lookahead character - // with all of the character ranges. 
- else if !chars.is_empty() { - if call_id.is_some() { - add!(self, " || "); + + if !asserted_chars.is_empty() { + if large_char_set_ix.is_some() { + add!(self, " ||{line_break}"); } - let is_included = !chars.contains(char::MAX); + let is_included = !asserted_chars.contains(char::MAX); if !is_included { - chars = chars.negate().add_char('\0'); + asserted_chars = asserted_chars.negate().add_char('\0'); } - self.add_character_range_conditions(&chars, is_included, 2); + self.add_character_range_conditions(&asserted_chars, is_included, &line_break); + } + + if !negated_chars.is_empty() { + if large_char_set_ix.is_some() || !asserted_chars.is_empty() { + add!(self, " &&{line_break}"); + } + self.add_character_range_conditions(&negated_chars, false, &line_break); } if in_condition { @@ -774,13 +800,8 @@ impl Generator { &mut self, characters: &CharacterSet, is_included: bool, - indent_count: usize, + line_break: &str, ) { - let mut line_break = "\n".to_string(); - for _ in 0..self.indent_level + indent_count { - line_break.push_str(" "); - } - // parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with // precedence of `&&` and `||` let (mut need_open_paren, mut need_close_paren) = (false, false); @@ -849,7 +870,8 @@ impl Generator { let count = self.large_character_sets[0..ix] .iter() .filter(|(sym, _)| *sym == symbol) - .count(); + .count() + + 1; let constant_name = if let Some(symbol) = symbol { format!("{}_character_set_{}", self.symbol_ids[&symbol], count) From b8701fcf180f7f096262c62bce3453f3306e626a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 11 Apr 2024 16:19:21 -0700 Subject: [PATCH 4/8] Check EOF when checking a large char set that contains the null character --- cli/src/generate/render.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index bd482475..df57dcdc 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs 
@@ -759,12 +759,20 @@ impl Generator { } if let Some(large_char_set_ix) = large_char_set_ix { + let large_set = &self.large_character_sets[large_char_set_ix].1; + let check_eof = large_set.contains('\0'); + if check_eof { + add!(self, "(!eof && ") + } add!( self, "set_contains({}, {}, lookahead)", - self.large_character_set_constant_names[large_char_set_ix], - self.large_character_sets[large_char_set_ix].1.range_count(), + &self.large_character_set_constant_names[large_char_set_ix], + large_set.range_count(), ); + if check_eof { + add!(self, ")"); + } } if !asserted_chars.is_empty() { From 1f0707e1ac911e8ff8a4dae10e10689c093531bc Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 11 Apr 2024 16:29:59 -0700 Subject: [PATCH 5/8] Fix clippy warnings --- cli/src/generate/build_tables/build_lex_table.rs | 2 +- cli/src/generate/nfa.rs | 9 ++------- cli/src/generate/render.rs | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index 1bb55acd..72811f5e 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -95,7 +95,7 @@ pub fn build_lex_table( let mut characters = CharacterSet::empty(); for (chars, action) in &state.advance_actions { if action.in_main_token { - characters = characters.add(&chars); + characters = characters.add(chars); continue; } diff --git a/cli/src/generate/nfa.rs b/cli/src/generate/nfa.rs index 4ad9f153..b247ce4f 100644 --- a/cli/src/generate/nfa.rs +++ b/cli/src/generate/nfa.rs @@ -350,7 +350,7 @@ impl CharacterSet { pub fn contains_codepoint_range(&self, seek_range: Range) -> bool { let ix = match self.ranges.binary_search_by(|probe| { - if probe.start < seek_range.start { + if probe.end <= seek_range.start { Ordering::Less } else if probe.start > seek_range.start { Ordering::Greater @@ -366,12 +366,7 @@ impl CharacterSet { } pub fn contains(&self, c: char) -> bool { 
- self.contains_code(c as u32) - } - - fn contains_code(&self, c: u32) -> bool { - // self.ranges.iter().any(|r| r.start <= c && r.end >= c) - self.ranges.iter().any(|r| r.contains(&c)) + self.contains_codepoint_range(c as u32..c as u32 + 1) } } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index df57dcdc..d60c0d5f 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -716,7 +716,7 @@ impl Generator { if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT { for (ix, (_, set)) in self.large_character_sets.iter().enumerate() { chars_copy.assign(&simplified_chars); - large_set.assign(&set); + large_set.assign(set); let intersection = chars_copy.remove_intersection(&mut large_set); if !intersection.is_empty() { let additions = chars_copy.simplify_ignoring(&ruled_out_chars); From 15fe07a20e3cef6faf7c973574a9b823f1838602 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 12 Apr 2024 09:02:33 -0700 Subject: [PATCH 6/8] Clean up code generation for lexer state transitions --- cli/src/generate/render.rs | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index d60c0d5f..77caa348 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -706,12 +706,21 @@ impl Generator { for (chars, action) in state.advance_actions { add_whitespace!(self); - // For each state transition, compute the set of character ranges - // that need to be checked. + // The lex state's advance actions are represented with disjoint + // sets of characters. When translating these disjoint sets into a + // sequence of checks, we don't need to re-check conditions that + // have already been checked due to previous transitions. + // + // Note that this simplification may result in an empty character set. 
+ // That means that the transition is guaranteed (nothing further needs to + // be checked), not that this transition is impossible. let simplified_chars = chars.simplify_ignoring(&ruled_out_chars); - // Find a large character set that matches the transition's character set, - // allowing for ruled-out characters for previous transitions. + // For large character sets, find the best matching character set from + // a pre-selected list of large character sets, which are based on the + // state transitions for individual tokens. This transition may not exactly + // match one of the pre-selected character sets. In that case, determine + // the additional checks that need to be performed to match this transition. let mut best_large_char_set: Option<(usize, CharacterSet, CharacterSet)> = None; if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT { for (ix, (_, set)) in self.large_character_sets.iter().enumerate() { @@ -720,27 +729,29 @@ impl Generator { let intersection = chars_copy.remove_intersection(&mut large_set); if !intersection.is_empty() { let additions = chars_copy.simplify_ignoring(&ruled_out_chars); - let exclusions = large_set.simplify_ignoring(&ruled_out_chars); - if let Some((_, best_additions, best_exclusions)) = &best_large_char_set { - if best_additions.range_count() + best_exclusions.range_count() - < additions.range_count() + exclusions.range_count() + let removals = large_set.simplify_ignoring(&ruled_out_chars); + if let Some((_, best_additions, best_removals)) = &best_large_char_set { + if best_additions.range_count() + best_removals.range_count() + < additions.range_count() + removals.range_count() { continue; } } - best_large_char_set = Some((ix, additions, exclusions)); + best_large_char_set = Some((ix, additions, removals)); } } } + // Add this transition's character set to the set of ruled out characters, + // which don't need to be checked for subsequent transitions in this state. 
ruled_out_chars = ruled_out_chars.add(&chars); let mut large_char_set_ix = None; let mut asserted_chars = simplified_chars; let mut negated_chars = CharacterSet::empty(); - if let Some((char_set_ix, additions, exclusions)) = best_large_char_set { + if let Some((char_set_ix, additions, removals)) = best_large_char_set { asserted_chars = additions; - negated_chars = exclusions; + negated_chars = removals; large_char_set_ix = Some(char_set_ix); } @@ -749,21 +760,26 @@ impl Generator { line_break.push_str(" "); } - let mut in_condition = false; - if large_char_set_ix.is_some() - || !asserted_chars.is_empty() - || !negated_chars.is_empty() - { + let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty(); + let has_negative_condition = !negated_chars.is_empty(); + let has_condition = has_positive_condition || has_negative_condition; + if has_condition { add!(self, "if ("); - in_condition = true; + if has_positive_condition && has_negative_condition { + add!(self, "("); + } } if let Some(large_char_set_ix) = large_char_set_ix { let large_set = &self.large_character_sets[large_char_set_ix].1; + + // If the character set contains the null character, check that we + // are not at the end of the file. let check_eof = large_set.contains('\0'); if check_eof { add!(self, "(!eof && ") } + add!( self, "set_contains({}, {}, lookahead)", @@ -779,21 +795,26 @@ impl Generator { if large_char_set_ix.is_some() { add!(self, " ||{line_break}"); } + + // If the character set contains the max character, then it probably + // corresponds to a negated character class in a regex, so it will be more + // concise and readable to express it in terms of negated ranges. 
let is_included = !asserted_chars.contains(char::MAX); if !is_included { asserted_chars = asserted_chars.negate().add_char('\0'); } + self.add_character_range_conditions(&asserted_chars, is_included, &line_break); } - if !negated_chars.is_empty() { - if large_char_set_ix.is_some() || !asserted_chars.is_empty() { - add!(self, " &&{line_break}"); + if has_negative_condition { + if has_positive_condition { + add!(self, ") &&{line_break}"); } self.add_character_range_conditions(&negated_chars, false, &line_break); } - if in_condition { + if has_condition { add!(self, ") "); } From 3210c7e21fc8894f4515809a17da6ea354e647f6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 12 Apr 2024 12:01:23 -0700 Subject: [PATCH 7/8] Avoid using a large character set constant when it doesn't reduce code size --- cli/src/generate/render.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 00c414ae..f0ac0ab2 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -731,10 +731,14 @@ impl Generator { if !intersection.is_empty() { let additions = chars_copy.simplify_ignoring(&ruled_out_chars); let removals = large_set.simplify_ignoring(&ruled_out_chars); + let total_range_count = additions.range_count() + removals.range_count(); + if total_range_count >= simplified_chars.range_count() { + continue; + } if let Some((_, best_additions, best_removals)) = &best_large_char_set { - if best_additions.range_count() + best_removals.range_count() - < additions.range_count() + removals.range_count() - { + let best_range_count = + best_additions.range_count() + best_removals.range_count(); + if best_range_count < total_range_count { continue; } } From 7ec40b0ab44ad1387bdb79c009ef0fe20de13b3f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 12 Apr 2024 14:34:03 -0700 Subject: [PATCH 8/8] Implement single-char state transitions using a static array and for loop This reduces compile 
time, compared to generating many individual if statements. --- cli/src/generate/render.rs | 43 +++++++++++++++++++++++++++++++++++--- lib/src/parser.h | 11 ++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index f0ac0ab2..2ef6c34a 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -704,7 +704,44 @@ impl Generator { let mut large_set = CharacterSet::empty(); let mut ruled_out_chars = CharacterSet::empty(); - for (chars, action) in state.advance_actions { + // The transitions in a lex state are sorted with the single-character + // transitions first. If there are many single-character transitions, + // then implement them using an array of (lookahead character, state) + // pairs, instead of individual if statements, in order to reduce compile + // time. + let mut leading_simple_transition_count = 0; + let mut leading_simple_transition_character_count = 0; + for (chars, action) in &state.advance_actions { + if action.in_main_token + && chars + .ranges() + .all(|r| r.start() == r.end() && *r.start() as u32 <= u16::MAX as u32) + { + leading_simple_transition_count += 1; + leading_simple_transition_character_count += chars.range_count(); + } else { + break; + } + } + + if leading_simple_transition_character_count >= 8 { + add_line!(self, "ADVANCE_MAP("); + indent!(self); + for (chars, action) in &state.advance_actions[0..leading_simple_transition_count] { + for range in chars.ranges() { + add_whitespace!(self); + self.add_character(*range.start()); + add!(self, ", {},\n", action.state); + } + ruled_out_chars = ruled_out_chars.add(chars); + } + dedent!(self); + add_line!(self, ");"); + } else { + leading_simple_transition_count = 0; + } + + for (chars, action) in &state.advance_actions[leading_simple_transition_count..] 
{ add_whitespace!(self); // The lex state's advance actions are represented with disjoint @@ -749,7 +786,7 @@ impl Generator { // Add this transition's character set to the set of ruled out characters, // which don't need to be checked for subsequent transitions in this state. - ruled_out_chars = ruled_out_chars.add(&chars); + ruled_out_chars = ruled_out_chars.add(chars); let mut large_char_set_ix = None; let mut asserted_chars = simplified_chars; @@ -823,7 +860,7 @@ impl Generator { add!(self, ") "); } - self.add_advance_action(&action); + self.add_advance_action(action); add!(self, "\n"); } diff --git a/lib/src/parser.h b/lib/src/parser.h index 30bb9292..17f0e94b 100644 --- a/lib/src/parser.h +++ b/lib/src/parser.h @@ -177,6 +177,17 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t goto next_state; \ } +#define ADVANCE_MAP(...) \ + { \ + static const uint16_t map[] = { __VA_ARGS__ }; \ + for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \ + if (map[i] == lookahead) { \ + state = map[i + 1]; \ + goto next_state; \ + } \ + } \ + } + #define SKIP(state_value) \ { \ skip = true; \