From 39be6972fe97eec0c69543f386877d1d19475457 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 29 Mar 2024 14:30:25 -0700 Subject: [PATCH] Use static arrays and a fixed binary search for large char set checks --- cli/src/generate/char_tree.rs | 133 ---------------------------------- cli/src/generate/mod.rs | 1 - cli/src/generate/render.rs | 99 +++++++------------------ lib/src/parser.h | 23 ++++++ 4 files changed, 49 insertions(+), 207 deletions(-) delete mode 100644 cli/src/generate/char_tree.rs diff --git a/cli/src/generate/char_tree.rs b/cli/src/generate/char_tree.rs deleted file mode 100644 index 2e28d56f..00000000 --- a/cli/src/generate/char_tree.rs +++ /dev/null @@ -1,133 +0,0 @@ -use std::ops::Range; - -/// A set of characters represented as a balanced binary tree of comparisons. -/// This is used as an intermediate step in generating efficient code for -/// matching a given character set. -#[derive(PartialEq, Eq)] -pub enum CharacterTree { - Yes, - Compare { - value: char, - operator: Comparator, - consequence: Option>, - alternative: Option>, - }, -} - -#[derive(PartialEq, Eq)] -pub enum Comparator { - Less, - LessOrEqual, - Equal, - GreaterOrEqual, -} - -impl CharacterTree { - pub fn from_ranges(ranges: &[Range]) -> Option { - match ranges.len() { - 0 => None, - 1 => { - let range = &ranges[0]; - if range.start == range.end { - Some(Self::Compare { - operator: Comparator::Equal, - value: range.start, - consequence: Some(Box::new(Self::Yes)), - alternative: None, - }) - } else { - Some(Self::Compare { - operator: Comparator::GreaterOrEqual, - value: range.start, - consequence: Some(Box::new(Self::Compare { - operator: Comparator::LessOrEqual, - value: range.end, - consequence: Some(Box::new(Self::Yes)), - alternative: None, - })), - alternative: None, - }) - } - } - len => { - let mid = len / 2; - let mid_range = &ranges[mid]; - Some(Self::Compare { - operator: Comparator::Less, - value: mid_range.start, - consequence: Self::from_ranges(&ranges[0..mid]).map(Box::new), - alternative: Some(Box::new(Self::Compare { - operator: Comparator::LessOrEqual, - value: mid_range.end, - consequence: Some(Box::new(Self::Yes)), - alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new), - })), - }) - } - } - } - - #[cfg(test)] - fn contains(&self, c: char) -> bool { - match self { - Self::Yes => true, - Self::Compare { - value, - operator, - alternative, - consequence, - } => { - let condition = match operator { - Comparator::Less => c < *value, - Comparator::LessOrEqual => c <= *value, - Comparator::Equal => c == *value, - Comparator::GreaterOrEqual => c >= *value, - }; - if condition { consequence } else { alternative } - .as_ref() - .map_or(false, |a| a.contains(c)) - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_character_tree_simple() { - let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z']) - .unwrap(); - - assert!(tree.contains('a')); - assert!(tree.contains('b')); - assert!(tree.contains('c')); - assert!(tree.contains('d')); - - assert!(!tree.contains('e')); - assert!(!tree.contains('f')); - assert!(!tree.contains('g')); - - assert!(tree.contains('h')); - assert!(tree.contains('i')); - assert!(tree.contains('j')); - assert!(tree.contains('k')); - assert!(tree.contains('l')); - - assert!(!tree.contains('m')); - assert!(!tree.contains('n')); - assert!(!tree.contains('o')); - - assert!(tree.contains('p')); - assert!(tree.contains('q')); - assert!(tree.contains('r')); - - assert!(!tree.contains('s')); - assert!(!tree.contains('s')); - - assert!(tree.contains('u')); - - assert!(!tree.contains('v')); - } -} diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index ea850c8d..493dde77 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -17,7 +17,6 @@ use render::render_c_code; use rules::AliasMap; mod build_tables; -mod char_tree; mod dedup; mod grammar_files; mod grammars; diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 1de8a069..a8221af1 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1,5 +1,4 @@ use super::{ - char_tree::{CharacterTree, Comparator}, grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}, rules::{Alias, AliasMap, Symbol, SymbolType}, tables::{ @@ -737,21 +736,7 @@ impl Generator { let mut sorted_large_char_sets = large_character_sets.iter().collect::>(); sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index)); for info in sorted_large_char_sets { - add_line!( - self, - "static inline bool {}_character_set_{}(int32_t c) {{", - self.symbol_ids[&info.symbol], - info.index - ); - indent!(self); - add_whitespace!(self); - add!(self, "return "); - let tree = CharacterTree::from_ranges(&info.ranges); - self.add_character_tree(tree.as_ref()); - add!(self, ";\n"); - dedent!(self); - add_line!(self, "}}"); - add_line!(self, ""); + self.add_character_set(info); } add_line!( @@ -836,9 +821,10 @@ impl Generator { } add!( self, - "{}_character_set_{}(lookahead)) ", + "set_contains({}_character_set_{}, {}, lookahead)) ", self.symbol_ids[&info.symbol], - info.index + info.index, + info.ranges.len(), ); self.add_advance_action(&action); add!(self, "\n"); @@ -931,64 +917,31 @@ impl Generator { } } - fn add_character_tree(&mut self, tree: Option<&CharacterTree>) { - match tree { - Some(CharacterTree::Compare { - value, - operator, - consequence, - alternative, - }) => { - let op = match operator { - Comparator::Less => "<", - Comparator::LessOrEqual => "<=", - Comparator::Equal => "==", - Comparator::GreaterOrEqual => ">=", - }; - let consequence = consequence.as_ref().map(Box::as_ref); - let alternative = alternative.as_ref().map(Box::as_ref); - - let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes); - - if !simple { - add!(self, "("); - } - - add!(self, "c {op} "); - self.add_character(*value); - - if !simple { - if alternative.is_none() { - add!(self, " && "); - self.add_character_tree(consequence); - } else if consequence == Some(&CharacterTree::Yes) { - add!(self, " || "); - self.add_character_tree(alternative); - } else { - add!(self, "\n"); - indent!(self); - add_whitespace!(self); - add!(self, "? "); - self.add_character_tree(consequence); - add!(self, "\n"); - add_whitespace!(self); - add!(self, ": "); - self.add_character_tree(alternative); - dedent!(self); - } - } - - if !simple { - add!(self, ")"); + fn add_character_set(&mut self, info: &LargeCharacterSetInfo) { + add_line!( + self, + "static TSCharacterRange {}_character_set_{}[] = {{", + self.symbol_ids[&info.symbol], + info.index + ); + indent!(self); + for chunk in info.ranges.chunks(8) { + add_whitespace!(self); + for (i, range) in chunk.iter().enumerate() { + if i > 0 { + add!(self, " "); } + add!(self, "{{"); + self.add_character(range.start); + add!(self, ", "); + self.add_character(range.end); + add!(self, "}},"); } - Some(CharacterTree::Yes) => { - add!(self, "true"); - } - None => { - add!(self, "false"); - } + add!(self, "\n"); } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); } fn add_advance_action(&mut self, action: &AdvanceAction) { diff --git a/lib/src/parser.h b/lib/src/parser.h index 17b4fde9..718f2bd6 100644 --- a/lib/src/parser.h +++ b/lib/src/parser.h @@ -86,6 +86,11 @@ typedef union { } entry; } TSParseActionEntry; +typedef struct { + int32_t start; + int32_t end; +} TSCharacterRange; + struct TSLanguage { uint32_t version; uint32_t symbol_count; @@ -125,6 +130,24 @@ struct TSLanguage { const TSStateId *primary_state_ids; }; +static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) { + uint32_t index = 0; + uint32_t size = len - index; + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = index + half_size; + TSCharacterRange *range = &ranges[mid_index]; + if (lookahead >= range->start && lookahead <= range->end) { + return true; + } else if (lookahead > range->end) { + index = mid_index; + } + size -= half_size; + } + TSCharacterRange *range = &ranges[index]; + return (lookahead >= range->start && lookahead <= range->end); +} + /* * Lexer Macros */