Use static arrays and a fixed binary search for large char set checks

This commit is contained in:
Max Brunsfeld 2024-03-29 14:30:25 -07:00
parent 0fc92c9a7d
commit 39be6972fe
4 changed files with 49 additions and 207 deletions

View file

@ -1,133 +0,0 @@
use std::ops::Range;
/// A set of characters represented as a balanced binary tree of comparisons.
///
/// This is used as an intermediate step in generating efficient code for
/// matching a given character set.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharacterTree {
    /// Terminal node: the character being tested belongs to the set.
    Yes,
    /// Interior node: compare the character against `value` using `operator`.
    ///
    /// When the comparison holds, evaluation continues in `consequence`;
    /// otherwise it continues in `alternative`. A missing (`None`) subtree
    /// means the character is not in the set.
    Compare {
        value: char,
        operator: Comparator,
        consequence: Option<Box<CharacterTree>>,
        alternative: Option<Box<CharacterTree>>,
    },
}

/// The relational operator applied by a [`CharacterTree::Compare`] node,
/// read as `c <operator> value`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Comparator {
    /// `c < value`
    Less,
    /// `c <= value`
    LessOrEqual,
    /// `c == value`
    Equal,
    /// `c >= value`
    GreaterOrEqual,
}
impl CharacterTree {
    /// Builds a comparison tree that accepts exactly the characters covered by
    /// `ranges`.
    ///
    /// Each range is treated as *inclusive* on both ends (`start..end` covers
    /// `end` as well). The slice is split around its middle element at every
    /// level, so the ranges are expected to be sorted in ascending order.
    ///
    /// Returns `None` when `ranges` is empty.
    pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
        // Subtree used wherever a successful comparison means "in the set".
        let accept = || Some(Box::new(Self::Yes));

        match ranges {
            [] => None,

            // A single-character range needs only an equality test.
            [only] if only.start == only.end => Some(Self::Compare {
                value: only.start,
                operator: Comparator::Equal,
                consequence: accept(),
                alternative: None,
            }),

            // A wider single range becomes `c >= start && c <= end`.
            [only] => {
                let upper_bound = Self::Compare {
                    value: only.end,
                    operator: Comparator::LessOrEqual,
                    consequence: accept(),
                    alternative: None,
                };
                Some(Self::Compare {
                    value: only.start,
                    operator: Comparator::GreaterOrEqual,
                    consequence: Some(Box::new(upper_bound)),
                    alternative: None,
                })
            }

            // Two or more ranges: pivot on the middle one and recurse on the
            // ranges to either side of it.
            _ => {
                let (below, pivot_and_above) = ranges.split_at(ranges.len() / 2);
                let pivot = &pivot_and_above[0];
                let above = &pivot_and_above[1..];

                let lower_half = Self::from_ranges(below).map(Box::new);
                let upper_half = Self::from_ranges(above).map(Box::new);

                // Not below the pivot: either inside it, or in the upper half.
                let at_or_above_pivot = Self::Compare {
                    value: pivot.end,
                    operator: Comparator::LessOrEqual,
                    consequence: accept(),
                    alternative: upper_half,
                };
                Some(Self::Compare {
                    value: pivot.start,
                    operator: Comparator::Less,
                    consequence: lower_half,
                    alternative: Some(Box::new(at_or_above_pivot)),
                })
            }
        }
    }

    /// Evaluates the tree for `c`, returning whether `c` is in the set.
    ///
    /// Walks from the root, following `consequence` when a node's comparison
    /// holds and `alternative` otherwise. Reaching `Yes` means membership;
    /// running into a missing subtree means non-membership.
    #[cfg(test)]
    fn contains(&self, c: char) -> bool {
        let mut node = self;
        loop {
            match node {
                Self::Yes => return true,
                Self::Compare {
                    value,
                    operator,
                    consequence,
                    alternative,
                } => {
                    let holds = match operator {
                        Comparator::Less => c < *value,
                        Comparator::LessOrEqual => c <= *value,
                        Comparator::Equal => c == *value,
                        Comparator::GreaterOrEqual => c >= *value,
                    };
                    let next = if holds { consequence } else { alternative };
                    match next.as_deref() {
                        Some(child) => node = child,
                        None => return false,
                    }
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks every lowercase ASCII letter (plus the characters immediately
    /// outside that span) against a tree built from several ranges.
    ///
    /// Ranges are inclusive on both ends, so `'a'..'d'` covers `a`, `b`, `c`
    /// and `d`, and `'u'..'u'` covers just `u`.
    #[test]
    fn test_character_tree_simple() {
        let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
            .unwrap();

        for c in 'a'..='z' {
            let expected = matches!(c, 'a'..='d' | 'h'..='l' | 'p'..='r' | 'u' | 'z');
            assert_eq!(tree.contains(c), expected, "wrong membership for {:?}", c);
        }

        // Characters just outside the first and last ranges.
        assert!(!tree.contains('`'));
        assert!(!tree.contains('{'));
    }

    /// An empty list of ranges yields no tree at all.
    #[test]
    fn test_character_tree_empty() {
        assert!(CharacterTree::from_ranges(&[]).is_none());
    }

    /// A lone range is handled by the single-range paths: one for a
    /// single-character range and one for a wider range.
    #[test]
    fn test_character_tree_single_range() {
        let single_char = CharacterTree::from_ranges(&['m'..'m']).unwrap();
        assert!(!single_char.contains('l'));
        assert!(single_char.contains('m'));
        assert!(!single_char.contains('n'));

        let span = CharacterTree::from_ranges(&['b'..'d']).unwrap();
        assert!(!span.contains('a'));
        assert!(span.contains('b'));
        assert!(span.contains('c'));
        assert!(span.contains('d'));
        assert!(!span.contains('e'));
    }

    /// With exactly two ranges there is nothing above the pivot, so the tree
    /// has a missing subtree on that side.
    #[test]
    fn test_character_tree_two_ranges() {
        let tree = CharacterTree::from_ranges(&['a'..'b', 'x'..'y']).unwrap();
        assert!(tree.contains('a'));
        assert!(tree.contains('b'));
        assert!(!tree.contains('c'));
        assert!(!tree.contains('w'));
        assert!(tree.contains('x'));
        assert!(tree.contains('y'));
        assert!(!tree.contains('z'));
    }
}

View file

@ -17,7 +17,6 @@ use render::render_c_code;
use rules::AliasMap;
mod build_tables;
mod char_tree;
mod dedup;
mod grammar_files;
mod grammars;

View file

@ -1,5 +1,4 @@
use super::{
char_tree::{CharacterTree, Comparator},
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
rules::{Alias, AliasMap, Symbol, SymbolType},
tables::{
@ -737,21 +736,7 @@ impl Generator {
let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
for info in sorted_large_char_sets {
add_line!(
self,
"static inline bool {}_character_set_{}(int32_t c) {{",
self.symbol_ids[&info.symbol],
info.index
);
indent!(self);
add_whitespace!(self);
add!(self, "return ");
let tree = CharacterTree::from_ranges(&info.ranges);
self.add_character_tree(tree.as_ref());
add!(self, ";\n");
dedent!(self);
add_line!(self, "}}");
add_line!(self, "");
self.add_character_set(info);
}
add_line!(
@ -836,9 +821,10 @@ impl Generator {
}
add!(
self,
"{}_character_set_{}(lookahead)) ",
"set_contains({}_character_set_{}, {}, lookahead)) ",
self.symbol_ids[&info.symbol],
info.index
info.index,
info.ranges.len(),
);
self.add_advance_action(&action);
add!(self, "\n");
@ -931,64 +917,31 @@ impl Generator {
}
}
fn add_character_tree(&mut self, tree: Option<&CharacterTree>) {
match tree {
Some(CharacterTree::Compare {
value,
operator,
consequence,
alternative,
}) => {
let op = match operator {
Comparator::Less => "<",
Comparator::LessOrEqual => "<=",
Comparator::Equal => "==",
Comparator::GreaterOrEqual => ">=",
};
let consequence = consequence.as_ref().map(Box::as_ref);
let alternative = alternative.as_ref().map(Box::as_ref);
let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes);
if !simple {
add!(self, "(");
}
add!(self, "c {op} ");
self.add_character(*value);
if !simple {
if alternative.is_none() {
add!(self, " && ");
self.add_character_tree(consequence);
} else if consequence == Some(&CharacterTree::Yes) {
add!(self, " || ");
self.add_character_tree(alternative);
} else {
add!(self, "\n");
indent!(self);
add_whitespace!(self);
add!(self, "? ");
self.add_character_tree(consequence);
add!(self, "\n");
add_whitespace!(self);
add!(self, ": ");
self.add_character_tree(alternative);
dedent!(self);
}
}
if !simple {
add!(self, ")");
fn add_character_set(&mut self, info: &LargeCharacterSetInfo) {
add_line!(
self,
"static TSCharacterRange {}_character_set_{}[] = {{",
self.symbol_ids[&info.symbol],
info.index
);
indent!(self);
for chunk in info.ranges.chunks(8) {
add_whitespace!(self);
for (i, range) in chunk.iter().enumerate() {
if i > 0 {
add!(self, " ");
}
add!(self, "{{");
self.add_character(range.start);
add!(self, ", ");
self.add_character(range.end);
add!(self, "}},");
}
Some(CharacterTree::Yes) => {
add!(self, "true");
}
None => {
add!(self, "false");
}
add!(self, "\n");
}
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}
fn add_advance_action(&mut self, action: &AdvanceAction) {

View file

@ -86,6 +86,11 @@ typedef union {
} entry;
} TSParseActionEntry;
/*
 * A contiguous range of character values, used as one element of the
 * character-set arrays searched by `set_contains`.
 */
typedef struct {
/* First value in the range (compared inclusively by `set_contains`). */
int32_t start;
/* Last value in the range (compared inclusively by `set_contains`). */
int32_t end;
} TSCharacterRange;
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
@ -125,6 +130,24 @@ struct TSLanguage {
const TSStateId *primary_state_ids;
};
/*
 * Reports whether `lookahead` lies within any of the `len` ranges stored in
 * `ranges`. Range bounds are inclusive.
 *
 * The lookup is a binary search that halves the remaining window on every
 * iteration, so it assumes `ranges` is sorted in ascending order with no
 * overlapping entries. An empty set (`len == 0`) contains nothing.
 */
static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
  if (len == 0) {
    /* Without this guard the final probe below would read ranges[0]. */
    return false;
  }

  /* The candidate window is ranges[index .. index + size). */
  uint32_t index = 0;
  uint32_t size = len;
  while (size > 1) {
    uint32_t half_size = size / 2;
    uint32_t mid_index = index + half_size;
    TSCharacterRange *range = &ranges[mid_index];
    if (lookahead >= range->start && lookahead <= range->end) {
      return true;
    } else if (lookahead > range->end) {
      /* Target is above the midpoint range: keep the upper part. */
      index = mid_index;
    }
    /* Otherwise the target is below the midpoint: keep the lower part. */
    size -= half_size;
  }

  /* Exactly one candidate remains; test it directly. */
  TSCharacterRange *range = &ranges[index];
  return (lookahead >= range->start && lookahead <= range->end);
}
/*
* Lexer Macros
*/