Merge pull request #3234 from tree-sitter/simpler-large-char-set-code
Generate simpler code for matching large character sets
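Note for context: each large character set used to be rendered into the generated parser as an inline helper built from a balanced tree of comparisons (see the removed char_tree.rs below); it is now rendered as a static, sorted table of character ranges that the lexer searches with the new set_contains helper added to parser.h. A hedged sketch of the new output, with a hypothetical symbol name, range list, and state number (a real "large" set has more than LARGE_CHARACTER_RANGE_COUNT = 8 ranges):

    /* Emitted once per large character set. */
    static TSCharacterRange sym_word_character_set_1[] = {
      {'a', 'd'}, {'h', 'l'}, {'p', 'r'},
    };

    /* Inside ts_lex, a transition over that set becomes a single call;
       set_contains binary-searches the sorted ranges. */
    if (set_contains(sym_word_character_set_1, 3, lookahead)) ADVANCE(12);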
Commit 3b8bfaeaa4
7 changed files with 470 additions and 499 deletions
@@ -9,11 +9,19 @@ use super::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConfl
use crate::generate::{
dedup::split_state_id_groups,
grammars::{LexicalGrammar, SyntaxGrammar},
nfa::NfaCursor,
nfa::{CharacterSet, NfaCursor},
rules::{Symbol, TokenSet},
tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
};

pub const LARGE_CHARACTER_RANGE_COUNT: usize = 8;

pub struct LexTables {
pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}

pub fn build_lex_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
@@ -21,7 +29,7 @@ pub fn build_lex_table(
keywords: &TokenSet,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
) -> (LexTable, LexTable) {
) -> LexTables {
let keyword_lex_table = if syntax_grammar.word_token.is_some() {
let mut builder = LexTableBuilder::new(lexical_grammar);
builder.add_state_for_tokens(keywords);
@@ -78,10 +86,45 @@ pub fn build_lex_table(
}
}

let mut table = builder.table;
minimize_lex_table(&mut table, parse_table);
sort_states(&mut table, parse_table);
(table, keyword_lex_table)
let mut main_lex_table = mem::take(&mut builder.table);
minimize_lex_table(&mut main_lex_table, parse_table);
sort_states(&mut main_lex_table, parse_table);

let mut large_character_sets = Vec::new();
for (variable_ix, _variable) in lexical_grammar.variables.iter().enumerate() {
let symbol = Symbol::terminal(variable_ix);
builder.reset();
builder.add_state_for_tokens(&TokenSet::from_iter([symbol]));
for state in &builder.table.states {
let mut characters = CharacterSet::empty();
for (chars, action) in &state.advance_actions {
if action.in_main_token {
characters = characters.add(chars);
continue;
}

if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT
&& !large_character_sets.iter().any(|(_, set)| set == chars)
{
large_character_sets.push((None, chars.clone()));
}
}

if characters.range_count() > LARGE_CHARACTER_RANGE_COUNT
&& !large_character_sets
.iter()
.any(|(_, set)| *set == characters)
{
large_character_sets.push((Some(symbol), characters));
}
}
}

LexTables {
main_lex_table,
keyword_lex_table,
large_character_sets,
}
}

struct QueueEntry {
@@ -109,6 +152,12 @@ impl<'a> LexTableBuilder<'a> {
}
}

fn reset(&mut self) {
self.table = LexTable::default();
self.state_queue.clear();
self.state_ids_by_nfa_state_set.clear();
}

fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize {
let mut eof_valid = false;
let nfa_states = tokens

@@ -1,5 +1,5 @@
pub mod build_lex_table;
pub mod build_parse_table;
mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
@@ -20,12 +20,22 @@ use self::{
};
use crate::generate::{
grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
nfa::NfaCursor,
nfa::{CharacterSet, NfaCursor},
node_types::VariableInfo,
rules::{AliasMap, Symbol, SymbolType, TokenSet},
tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
};

pub use build_lex_table::LARGE_CHARACTER_RANGE_COUNT;

pub struct Tables {
pub parse_table: ParseTable,
pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable,
pub word_token: Option<Symbol>,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}

pub fn build_tables(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
@@ -33,7 +43,7 @@ pub fn build_tables(
variable_info: &[VariableInfo],
inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
) -> Result<Tables> {
let (mut parse_table, following_tokens, parse_state_info) =
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
@@ -62,7 +72,7 @@ pub fn build_tables(
&token_conflict_map,
&keywords,
);
let (main_lex_table, keyword_lex_table) = build_lex_table(
let lex_tables = build_lex_table(
&mut parse_table,
syntax_grammar,
lexical_grammar,
@@ -82,12 +92,14 @@ pub fn build_tables(
report_symbol_name,
);
}
Ok((

Ok(Tables {
parse_table,
main_lex_table,
keyword_lex_table,
syntax_grammar.word_token,
))
main_lex_table: lex_tables.main_lex_table,
keyword_lex_table: lex_tables.keyword_lex_table,
large_character_sets: lex_tables.large_character_sets,
word_token: syntax_grammar.word_token,
})
}

fn populate_error_state(

@@ -1,133 +0,0 @@
use std::ops::Range;

/// A set of characters represented as a balanced binary tree of comparisons.
/// This is used as an intermediate step in generating efficient code for
/// matching a given character set.
#[derive(PartialEq, Eq)]
pub enum CharacterTree {
Yes,
Compare {
value: char,
operator: Comparator,
consequence: Option<Box<CharacterTree>>,
alternative: Option<Box<CharacterTree>>,
},
}

#[derive(PartialEq, Eq)]
pub enum Comparator {
Less,
LessOrEqual,
Equal,
GreaterOrEqual,
}

impl CharacterTree {
pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
match ranges.len() {
0 => None,
1 => {
let range = &ranges[0];
if range.start == range.end {
Some(Self::Compare {
operator: Comparator::Equal,
value: range.start,
consequence: Some(Box::new(Self::Yes)),
alternative: None,
})
} else {
Some(Self::Compare {
operator: Comparator::GreaterOrEqual,
value: range.start,
consequence: Some(Box::new(Self::Compare {
operator: Comparator::LessOrEqual,
value: range.end,
consequence: Some(Box::new(Self::Yes)),
alternative: None,
})),
alternative: None,
})
}
}
len => {
let mid = len / 2;
let mid_range = &ranges[mid];
Some(Self::Compare {
operator: Comparator::Less,
value: mid_range.start,
consequence: Self::from_ranges(&ranges[0..mid]).map(Box::new),
alternative: Some(Box::new(Self::Compare {
operator: Comparator::LessOrEqual,
value: mid_range.end,
consequence: Some(Box::new(Self::Yes)),
alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new),
})),
})
}
}
}

#[cfg(test)]
fn contains(&self, c: char) -> bool {
match self {
Self::Yes => true,
Self::Compare {
value,
operator,
alternative,
consequence,
} => {
let condition = match operator {
Comparator::Less => c < *value,
Comparator::LessOrEqual => c <= *value,
Comparator::Equal => c == *value,
Comparator::GreaterOrEqual => c >= *value,
};
if condition { consequence } else { alternative }
.as_ref()
.map_or(false, |a| a.contains(c))
}
}
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_character_tree_simple() {
let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
.unwrap();

assert!(tree.contains('a'));
assert!(tree.contains('b'));
assert!(tree.contains('c'));
assert!(tree.contains('d'));

assert!(!tree.contains('e'));
assert!(!tree.contains('f'));
assert!(!tree.contains('g'));

assert!(tree.contains('h'));
assert!(tree.contains('i'));
assert!(tree.contains('j'));
assert!(tree.contains('k'));
assert!(tree.contains('l'));

assert!(!tree.contains('m'));
assert!(!tree.contains('n'));
assert!(!tree.contains('o'));

assert!(tree.contains('p'));
assert!(tree.contains('q'));
assert!(tree.contains('r'));

assert!(!tree.contains('s'));
assert!(!tree.contains('s'));

assert!(tree.contains('u'));

assert!(!tree.contains('v'));
}
}

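For reference, the removed module above is what produced the nested comparison expressions in the old generated code: from_ranges built a balanced tree over the ranges, and render.rs printed it as the body of an inline helper. A hedged sketch for the small range list 'a'..'d', 'h'..'l', 'p'..'r', with a hypothetical symbol name (real extracted sets had more than eight ranges):

    static inline bool sym_word_character_set_1(int32_t c) {
      return (c < 'h'
        ? (c >= 'a' && c <= 'd')
        : (c <= 'l' || (c >= 'p' && c <= 'r')));
    }
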
@@ -8,17 +8,15 @@ use std::{
use anyhow::{anyhow, Context, Result};
use build_tables::build_tables;
use grammar_files::path_in_ignore;
use grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use grammars::InputGrammar;
use lazy_static::lazy_static;
use parse_grammar::parse_grammar;
use prepare_grammar::prepare_grammar;
use regex::{Regex, RegexBuilder};
use render::render_c_code;
use rules::AliasMap;
use semver::Version;

mod build_tables;
mod char_tree;
mod dedup;
mod grammar_files;
mod grammars;
@@ -105,23 +103,12 @@ pub fn generate_parser_in_directory(

// Parse and preprocess the grammar.
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let language_name = input_grammar.name;

// Generate the parser and related files.
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
&inlines,
simple_aliases,
abi_version,
report_symbol_name,
)?;
} = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;

write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
@@ -130,7 +117,7 @@ pub fn generate_parser_in_directory(
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;

if !path_in_ignore(&repo_path) {
grammar_files::generate_grammar_files(&repo_path, &language_name, generate_bindings)?;
grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
}

Ok(())
@@ -139,29 +126,18 @@ pub fn generate_parser_in_directory(
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let parser = generate_parser_for_grammar_with_opts(
&input_grammar.name,
syntax_grammar,
lexical_grammar,
&inlines,
simple_aliases,
tree_sitter::LANGUAGE_VERSION,
None,
)?;
Ok((input_grammar.name, parser.c_code))
let parser =
generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
Ok((input_grammar.name.clone(), parser.c_code))
}

fn generate_parser_for_grammar_with_opts(
name: &str,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
inlines: &InlinedProductionMap,
simple_aliases: AliasMap,
input_grammar: &InputGrammar,
abi_version: usize,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(input_grammar)?;
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
@@ -170,20 +146,17 @@ fn generate_parser_for_grammar_with_opts(
&simple_aliases,
&variable_info,
);
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
let tables = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
inlines,
&inlines,
report_symbol_name,
)?;
let c_code = render_c_code(
name,
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
&input_grammar.name,
tables,
syntax_grammar,
lexical_grammar,
simple_aliases,

@@ -1,14 +1,13 @@
use std::{
char,
cmp::{max, Ordering},
collections::HashSet,
fmt,
mem::swap,
ops::Range,
mem::{self, swap},
ops::{Range, RangeInclusive},
};

/// A set of characters represented as a vector of ranges.
#[derive(Clone, PartialEq, Eq, Hash)]
#[derive(Clone, Default, PartialEq, Eq, Hash)]
pub struct CharacterSet {
ranges: Vec<Range<u32>>,
}
@@ -115,6 +114,11 @@ impl CharacterSet {
self
}

pub fn assign(&mut self, other: &Self) {
self.ranges.clear();
self.ranges.extend_from_slice(&other.ranges);
}

fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
while i < self.ranges.len() {
let range = &mut self.ranges[i];
@@ -286,12 +290,24 @@ impl CharacterSet {
self.add(&other)
}

pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
self.ranges.iter().flat_map(std::clone::Clone::clone)
pub fn char_codes(&self) -> impl Iterator<Item = u32> + '_ {
self.ranges.iter().flat_map(Clone::clone)
}

pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
self.iter().filter_map(char::from_u32)
self.char_codes().filter_map(char::from_u32)
}

pub fn range_count(&self) -> usize {
self.ranges.len()
}

pub fn ranges(&self) -> impl Iterator<Item = RangeInclusive<char>> + '_ {
self.ranges.iter().filter_map(|range| {
let start = range.clone().find_map(char::from_u32)?;
let end = (range.start..range.end).rev().find_map(char::from_u32)?;
Some(start..=end)
})
}

pub fn is_empty(&self) -> bool {
@@ -300,41 +316,57 @@ impl CharacterSet {

/// Get a reduced list of character ranges, assuming that a given
/// set of characters can be safely ignored.
pub fn simplify_ignoring<'a>(
&'a self,
ruled_out_characters: &'a HashSet<u32>,
) -> Vec<Range<char>> {
let mut prev_range: Option<Range<char>> = None;
self.chars()
.map(|c| (c, false))
.chain(Some(('\0', true)))
.filter_map(move |(c, done)| {
if done {
return prev_range.clone();
}
if ruled_out_characters.contains(&(c as u32)) {
return None;
}
if let Some(range) = prev_range.clone() {
let mut prev_range_successor = range.end as u32 + 1;
while prev_range_successor < c as u32 {
if !ruled_out_characters.contains(&prev_range_successor) {
prev_range = Some(c..c);
return Some(range);
pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self {
let mut prev_range: Option<Range<u32>> = None;
Self {
ranges: self
.ranges
.iter()
.map(|range| Some(range.clone()))
.chain([None])
.filter_map(move |range| {
if let Some(range) = &range {
if ruled_out_characters.contains_codepoint_range(range.clone()) {
return None;
}

if let Some(prev_range) = &mut prev_range {
if ruled_out_characters
.contains_codepoint_range(prev_range.end..range.start)
{
prev_range.end = range.end;
return None;
}
}
prev_range_successor += 1;
}
prev_range = Some(range.start..c);
} else {
prev_range = Some(c..c);
}
None
})
.collect()

let result = prev_range.clone();
prev_range = range;
result
})
.collect(),
}
}

pub fn contains_codepoint_range(&self, seek_range: Range<u32>) -> bool {
let ix = match self.ranges.binary_search_by(|probe| {
if probe.end <= seek_range.start {
Ordering::Less
} else if probe.start > seek_range.start {
Ordering::Greater
} else {
Ordering::Equal
}
}) {
Ok(ix) | Err(ix) => ix,
};
self.ranges.get(ix).map_or(false, |range| {
range.start <= seek_range.start && range.end >= seek_range.end
})
}

pub fn contains(&self, c: char) -> bool {
self.ranges.iter().any(|r| r.contains(&(c as u32)))
self.contains_codepoint_range(c as u32..c as u32 + 1)
}
}

@@ -387,11 +419,11 @@ impl fmt::Debug for CharacterSet {
write!(f, "^ ")?;
set = set.negate();
}
for (i, c) in set.chars().enumerate() {
for (i, range) in set.ranges().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{c:?}")?;
write!(f, "{range:?}")?;
}
write!(f, "]")?;
Ok(())
@@ -503,17 +535,17 @@ impl<'a> NfaCursor<'a> {
result.sort_unstable_by(|a, b| a.characters.cmp(&b.characters));

let mut i = 0;
'i_loop: while i < result.len() {
while i < result.len() {
for j in 0..i {
if result[j].states == result[i].states
&& result[j].is_separator == result[i].is_separator
&& result[j].precedence == result[i].precedence
{
let mut characters = CharacterSet::empty();
swap(&mut characters, &mut result[j].characters);
let characters = mem::take(&mut result[j].characters);
result[j].characters = characters.add(&result[i].characters);
result.remove(i);
continue 'i_loop;
i -= 1;
break;
}
}
i += 1;
@@ -1034,7 +1066,7 @@ mod tests {

#[test]
#[allow(clippy::single_range_in_vec_init)]
fn test_character_set_get_ranges() {
fn test_character_set_simplify_ignoring() {
struct Row {
chars: Vec<char>,
ruled_out_chars: Vec<char>,
@@ -1057,6 +1089,21 @@ mod tests {
ruled_out_chars: vec!['d', 'f', 'g'],
expected_ranges: vec!['a'..'h', 'z'..'z'],
},
Row {
chars: vec!['a', 'b', 'c', 'g', 'h', 'i'],
ruled_out_chars: vec!['d', 'j'],
expected_ranges: vec!['a'..'c', 'g'..'i'],
},
Row {
chars: vec!['c', 'd', 'e', 'g', 'h'],
ruled_out_chars: vec!['a', 'b', 'c', 'd', 'e', 'f'],
expected_ranges: vec!['g'..'h'],
},
Row {
chars: vec!['I', 'N'],
ruled_out_chars: vec!['A', 'I', 'N', 'Z'],
expected_ranges: vec![],
},
];

for Row {
@@ -1065,13 +1112,23 @@ mod tests {
expected_ranges,
} in &table
{
let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect();
let ruled_out_chars = ruled_out_chars
.iter()
.fold(CharacterSet::empty(), |set, c| set.add_char(*c));
let mut set = CharacterSet::empty();
for c in chars {
set = set.add_char(*c);
}
let ranges = set.simplify_ignoring(&ruled_out_chars);
assert_eq!(ranges, *expected_ranges);
let actual = set.simplify_ignoring(&ruled_out_chars);
let expected = expected_ranges
.iter()
.fold(CharacterSet::empty(), |set, range| {
set.add_range(range.start, range.end)
});
assert_eq!(
actual, expected,
"chars: {chars:?}, ruled out chars: {ruled_out_chars:?}"
);
}
}
}

@@ -1,4 +1,3 @@
use core::ops::Range;
use std::{
cmp,
collections::{HashMap, HashSet},
@@ -7,8 +6,9 @@ use std::{
};

use super::{
char_tree::{CharacterTree, Comparator},
build_tables::Tables,
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
nfa::CharacterSet,
rules::{Alias, AliasMap, Symbol, SymbolType},
tables::{
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
@@ -16,7 +16,6 @@ use super::{
},
};

const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
const SMALL_STATE_THRESHOLD: usize = 64;
const ABI_VERSION_MIN: usize = 13;
const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
@@ -64,6 +63,8 @@ struct Generator {
parse_table: ParseTable,
main_lex_table: LexTable,
keyword_lex_table: LexTable,
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
large_character_set_constant_names: Vec<String>,
large_state_count: usize,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
@@ -80,18 +81,6 @@ struct Generator {
abi_version: usize,
}

struct TransitionSummary {
is_included: bool,
ranges: Vec<Range<char>>,
call_id: Option<usize>,
}

struct LargeCharacterSetInfo {
ranges: Vec<Range<char>>,
symbol: Symbol,
index: usize,
}

impl Generator {
fn generate(mut self) -> String {
self.init();
@@ -119,14 +108,20 @@ impl Generator {
self.add_primary_state_id_list();
}

// Generate a helper function for each large character set.
// let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
for ix in 0..self.large_character_sets.len() {
self.add_character_set(ix);
}

let mut main_lex_table = LexTable::default();
swap(&mut main_lex_table, &mut self.main_lex_table);
self.add_lex_function("ts_lex", main_lex_table, true);
self.add_lex_function("ts_lex", main_lex_table);

if self.keyword_capture_token.is_some() {
let mut keyword_lex_table = LexTable::default();
swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
self.add_lex_function("ts_lex_keywords", keyword_lex_table, false);
self.add_lex_function("ts_lex_keywords", keyword_lex_table);
}

self.add_lex_modes_list();
@@ -664,97 +659,7 @@ impl Generator {
add_line!(self, "");
}

fn add_lex_function(
&mut self,
name: &str,
lex_table: LexTable,
extract_helper_functions: bool,
) {
let mut ruled_out_chars = HashSet::new();
let mut large_character_sets = Vec::<LargeCharacterSetInfo>::new();

// For each lex state, compute a summary of the code that needs to be
// generated.
let state_transition_summaries = lex_table
.states
.iter()
.map(|state| {
ruled_out_chars.clear();

// For each state transition, compute the set of character ranges
// that need to be checked.
state
.advance_actions
.iter()
.map(|(chars, action)| {
let is_included = !chars.contains(char::MAX);
let mut ranges;
if is_included {
ranges = chars.simplify_ignoring(&ruled_out_chars);
ruled_out_chars.extend(chars.iter());
} else {
ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars);
ranges.insert(0, '\0'..'\0');
}

// Record any large character sets so that they can be extracted
// into helper functions, reducing code duplication.
let mut call_id = None;
if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT {
let char_set_symbol = self
.symbol_for_advance_action(action, &lex_table)
.expect("No symbol for lex state");
let mut count_for_symbol = 0;
for (i, info) in large_character_sets.iter_mut().enumerate() {
if info.ranges == ranges {
call_id = Some(i);
break;
}
if info.symbol == char_set_symbol {
count_for_symbol += 1;
}
}
if call_id.is_none() {
call_id = Some(large_character_sets.len());
large_character_sets.push(LargeCharacterSetInfo {
symbol: char_set_symbol,
index: count_for_symbol + 1,
ranges: ranges.clone(),
});
}
}

TransitionSummary {
is_included,
ranges,
call_id,
}
})
.collect()
})
.collect::<Vec<Vec<_>>>();

// Generate a helper function for each large character set.
let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
for info in sorted_large_char_sets {
add_line!(
self,
"static inline bool {}_character_set_{}(int32_t c) {{",
self.symbol_ids[&info.symbol],
info.index
);
indent!(self);
add_whitespace!(self);
add!(self, "return ");
let tree = CharacterTree::from_ranges(&info.ranges);
self.add_character_tree(tree.as_ref());
add!(self, ";\n");
dedent!(self);
add_line!(self, "}}");
add_line!(self, "");
}

fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
add_line!(
self,
"static bool {name}(TSLexer *lexer, TSStateId state) {{",
@@ -769,7 +674,7 @@ impl Generator {
for (i, state) in lex_table.states.into_iter().enumerate() {
add_line!(self, "case {i}:");
indent!(self);
self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets);
self.add_lex_state(i, state);
dedent!(self);
}

@@ -786,35 +691,7 @@ impl Generator {
add_line!(self, "");
}

fn symbol_for_advance_action(
&self,
action: &AdvanceAction,
lex_table: &LexTable,
) -> Option<Symbol> {
let mut state_ids = vec![action.state];
let mut i = 0;
while i < state_ids.len() {
let id = state_ids[i];
let state = &lex_table.states[id];
if let Some(accept) = state.accept_action {
return Some(accept);
}
for (_, action) in &state.advance_actions {
if !state_ids.contains(&action.state) {
state_ids.push(action.state);
}
}
i += 1;
}
None
}

fn add_lex_state(
&mut self,
state: LexState,
transition_info: &[TransitionSummary],
large_character_sets: &[LargeCharacterSetInfo],
) {
fn add_lex_state(&mut self, _state_ix: usize, state: LexState) {
if let Some(accept_action) = state.accept_action {
add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
}
@@ -823,37 +700,167 @@ impl Generator {
add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
}

for (i, (_, action)) in state.advance_actions.into_iter().enumerate() {
let transition = &transition_info[i];
let mut chars_copy = CharacterSet::empty();
let mut large_set = CharacterSet::empty();
let mut ruled_out_chars = CharacterSet::empty();

// The transitions in a lex state are sorted with the single-character
// transitions first. If there are many single-character transitions,
// then implement them using an array of (lookahead character, state)
// pairs, instead of individual if statements, in order to reduce compile
// time.
let mut leading_simple_transition_count = 0;
let mut leading_simple_transition_character_count = 0;
for (chars, action) in &state.advance_actions {
if action.in_main_token
&& chars
.ranges()
.all(|r| r.start() == r.end() && *r.start() as u32 <= u16::MAX as u32)
{
leading_simple_transition_count += 1;
leading_simple_transition_character_count += chars.range_count();
} else {
break;
}
}

if leading_simple_transition_character_count >= 8 {
add_line!(self, "ADVANCE_MAP(");
indent!(self);
for (chars, action) in &state.advance_actions[0..leading_simple_transition_count] {
for range in chars.ranges() {
add_whitespace!(self);
self.add_character(*range.start());
add!(self, ", {},\n", action.state);
}
ruled_out_chars = ruled_out_chars.add(chars);
}
dedent!(self);
add_line!(self, ");");
} else {
leading_simple_transition_count = 0;
}

for (chars, action) in &state.advance_actions[leading_simple_transition_count..] {
add_whitespace!(self);

// If there is a helper function for this transition's character
// set, then generate a call to that helper function.
if let Some(call_id) = transition.call_id {
let info = &large_character_sets[call_id];
add!(self, "if (");
if !transition.is_included {
add!(self, "!");
// The lex state's advance actions are represented with disjoint
// sets of characters. When translating these disjoint sets into a
// sequence of checks, we don't need to re-check conditions that
// have already been checked due to previous transitions.
//
// Note that this simplification may result in an empty character set.
// That means that the transition is guaranteed (nothing further needs to
// be checked), not that this transition is impossible.
let simplified_chars = chars.simplify_ignoring(&ruled_out_chars);

// For large character sets, find the best matching character set from
// a pre-selected list of large character sets, which are based on the
// state transitions for individual tokens. This transition may not exactly
// match one of the pre-selected character sets. In that case, determine
// the additional checks that need to be performed to match this transition.
let mut best_large_char_set: Option<(usize, CharacterSet, CharacterSet)> = None;
if simplified_chars.range_count() >= super::build_tables::LARGE_CHARACTER_RANGE_COUNT {
for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
chars_copy.assign(&simplified_chars);
large_set.assign(set);
let intersection = chars_copy.remove_intersection(&mut large_set);
if !intersection.is_empty() {
let additions = chars_copy.simplify_ignoring(&ruled_out_chars);
let removals = large_set.simplify_ignoring(&ruled_out_chars);
let total_range_count = additions.range_count() + removals.range_count();
if total_range_count >= simplified_chars.range_count() {
continue;
}
if let Some((_, best_additions, best_removals)) = &best_large_char_set {
let best_range_count =
best_additions.range_count() + best_removals.range_count();
if best_range_count < total_range_count {
continue;
}
}
best_large_char_set = Some((ix, additions, removals));
}
}
add!(
self,
"{}_character_set_{}(lookahead)) ",
self.symbol_ids[&info.symbol],
info.index
);
self.add_advance_action(&action);
add!(self, "\n");
continue;
}

// Otherwise, generate code to compare the lookahead character
// with all of the character ranges.
if !transition.ranges.is_empty() {
// Add this transition's character set to the set of ruled out characters,
// which don't need to be checked for subsequent transitions in this state.
ruled_out_chars = ruled_out_chars.add(chars);

let mut large_char_set_ix = None;
let mut asserted_chars = simplified_chars;
let mut negated_chars = CharacterSet::empty();
if let Some((char_set_ix, additions, removals)) = best_large_char_set {
asserted_chars = additions;
negated_chars = removals;
large_char_set_ix = Some(char_set_ix);
}

let mut line_break = "\n".to_string();
for _ in 0..self.indent_level + 2 {
line_break.push_str(" ");
}

let has_positive_condition = large_char_set_ix.is_some() || !asserted_chars.is_empty();
let has_negative_condition = !negated_chars.is_empty();
let has_condition = has_positive_condition || has_negative_condition;
if has_condition {
add!(self, "if (");
self.add_character_range_conditions(&transition.ranges, transition.is_included, 2);
if has_positive_condition && has_negative_condition {
add!(self, "(");
}
}

if let Some(large_char_set_ix) = large_char_set_ix {
let large_set = &self.large_character_sets[large_char_set_ix].1;

// If the character set contains the null character, check that we
// are not at the end of the file.
let check_eof = large_set.contains('\0');
if check_eof {
add!(self, "(!eof && ")
}

add!(
self,
"set_contains({}, {}, lookahead)",
&self.large_character_set_constant_names[large_char_set_ix],
large_set.range_count(),
);
if check_eof {
add!(self, ")");
}
}

if !asserted_chars.is_empty() {
if large_char_set_ix.is_some() {
add!(self, " ||{line_break}");
}

// If the character set contains the max character, then it probably
// corresponds to a negated character class in a regex, so it will be more
// concise and readable to express it in terms of negated ranges.
let is_included = !asserted_chars.contains(char::MAX);
if !is_included {
asserted_chars = asserted_chars.negate().add_char('\0');
}

self.add_character_range_conditions(&asserted_chars, is_included, &line_break);
}

if has_negative_condition {
if has_positive_condition {
add!(self, ") &&{line_break}");
}
self.add_character_range_conditions(&negated_chars, false, &line_break);
}

if has_condition {
add!(self, ") ");
}
self.add_advance_action(&action);

self.add_advance_action(action);
add!(self, "\n");
}

@@ -862,135 +869,106 @@ impl Generator {

fn add_character_range_conditions(
&mut self,
ranges: &[Range<char>],
characters: &CharacterSet,
is_included: bool,
indent_count: usize,
line_break: &str,
) {
let mut line_break = "\n".to_string();
for _ in 0..self.indent_level + indent_count {
line_break.push_str(" ");
}

for (i, range) in ranges.iter().enumerate() {
for (i, range) in characters.ranges().enumerate() {
let start = *range.start();
let end = *range.end();
if is_included {
if i > 0 {
add!(self, " ||{line_break}");
}
// parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with
// precedence of `&&` and `||`
let mut close_paren = false;
if range.start == '\0' {

if start == '\0' {
add!(self, "(!eof && ");
close_paren = true;
}
if range.end == range.start {
add!(self, "lookahead == ");
self.add_character(range.start);
} else if range.end as u32 == range.start as u32 + 1 {
if close_paren {
add!(self, "(");
if end == '\0' {
add!(self, "lookahead == 0");
} else {
add!(self, "lookahead <= ");
}
self.add_character(end);
add!(self, ")");
continue;
} else if end == start {
add!(self, "lookahead == ");
self.add_character(range.start);
self.add_character(start);
} else if end as u32 == start as u32 + 1 {
add!(self, "lookahead == ");
self.add_character(start);
add!(self, " ||{line_break}lookahead == ");
self.add_character(range.end);
if close_paren {
add!(self, ")");
}
self.add_character(end);
} else {
add!(self, "(");
self.add_character(range.start);
self.add_character(start);
add!(self, " <= lookahead && lookahead <= ");
self.add_character(range.end);
add!(self, ")");
}
if close_paren {
self.add_character(end);
add!(self, ")");
}
} else {
if i > 0 {
add!(self, " &&{line_break}");
}
if range.end == range.start {
if end == start {
add!(self, "lookahead != ");
self.add_character(range.start);
} else if range.end as u32 == range.start as u32 + 1 {
self.add_character(start);
} else if end as u32 == start as u32 + 1 {
add!(self, "lookahead != ");
self.add_character(range.start);
self.add_character(start);
add!(self, " &&{line_break}lookahead != ");
self.add_character(range.end);
} else if range.start != '\0' {
self.add_character(end);
} else if start != '\0' {
add!(self, "(lookahead < ");
self.add_character(range.start);
self.add_character(start);
add!(self, " || ");
self.add_character(range.end);
self.add_character(end);
add!(self, " < lookahead)");
} else {
add!(self, "lookahead > ");
self.add_character(range.end);
self.add_character(end);
}
}
}
}

fn add_character_tree(&mut self, tree: Option<&CharacterTree>) {
match tree {
Some(CharacterTree::Compare {
value,
operator,
consequence,
alternative,
}) => {
let op = match operator {
Comparator::Less => "<",
Comparator::LessOrEqual => "<=",
Comparator::Equal => "==",
Comparator::GreaterOrEqual => ">=",
};
let consequence = consequence.as_ref().map(Box::as_ref);
let alternative = alternative.as_ref().map(Box::as_ref);
fn add_character_set(&mut self, ix: usize) {
let (symbol, characters) = self.large_character_sets[ix].clone();
let count = self.large_character_sets[0..ix]
.iter()
.filter(|(sym, _)| *sym == symbol)
.count()
+ 1;

let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes);
let constant_name = if let Some(symbol) = symbol {
format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
} else {
format!("extras_character_set_{}", count)
};
add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
self.large_character_set_constant_names.push(constant_name);

if !simple {
add!(self, "(");
}

add!(self, "c {op} ");
self.add_character(*value);

if !simple {
if alternative.is_none() {
add!(self, " && ");
self.add_character_tree(consequence);
} else if consequence == Some(&CharacterTree::Yes) {
add!(self, " || ");
self.add_character_tree(alternative);
} else {
add!(self, "\n");
indent!(self);
add_whitespace!(self);
add!(self, "? ");
self.add_character_tree(consequence);
add!(self, "\n");
add_whitespace!(self);
add!(self, ": ");
self.add_character_tree(alternative);
dedent!(self);
}
}

if !simple {
add!(self, ")");
indent!(self);
for (ix, range) in characters.ranges().enumerate() {
let column = ix % 8;
if column == 0 {
if ix > 0 {
add!(self, "\n");
}
add_whitespace!(self);
} else {
add!(self, " ");
}
Some(CharacterTree::Yes) => {
add!(self, "true");
}
None => {
add!(self, "false");
}
add!(self, "{{");
self.add_character(*range.start());
add!(self, ", ");
self.add_character(*range.end());
add!(self, "}},");
}
add!(self, "\n");
dedent!(self);
add_line!(self, "}};");
add_line!(self, "");
}

fn add_advance_action(&mut self, action: &AdvanceAction) {
@@ -1656,10 +1634,12 @@ impl Generator {
'\t' => add!(self, "'\\t'"),
'\r' => add!(self, "'\\r'"),
_ => {
if c == ' ' || c.is_ascii_graphic() {
if c == '\0' {
add!(self, "0")
} else if c == ' ' || c.is_ascii_graphic() {
add!(self, "'{c}'");
} else {
add!(self, "{}", c as u32);
add!(self, "0x{:02x}", c as u32);
}
}
}
@@ -1686,10 +1666,7 @@ impl Generator {
#[allow(clippy::too_many_arguments)]
pub fn render_c_code(
name: &str,
parse_table: ParseTable,
main_lex_table: LexTable,
keyword_lex_table: LexTable,
keyword_capture_token: Option<Symbol>,
tables: Tables,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
default_aliases: AliasMap,
@@ -1705,10 +1682,12 @@ pub fn render_c_code(
indent_level: 0,
language_name: name.to_string(),
large_state_count: 0,
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
parse_table: tables.parse_table,
main_lex_table: tables.main_lex_table,
keyword_lex_table: tables.keyword_lex_table,
keyword_capture_token: tables.word_token,
large_character_sets: tables.large_character_sets,
large_character_set_constant_names: Vec::new(),
syntax_grammar,
lexical_grammar,
default_aliases,

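A hedged sketch of the kind of condition the new add_lex_state emits when a transition mostly overlaps one of the precomputed large sets (symbol name, range count, characters, and state number are hypothetical): the shared table covers the bulk of the set, any extra "addition" ranges are OR'd in, and "removal" ranges are AND'd out:

    if ((set_contains(sym_identifier_character_set_1, 246, lookahead) ||
        lookahead == '$') &&
        lookahead != 0x2028) ADVANCE(34);
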
@@ -86,6 +86,11 @@ typedef union {
} entry;
} TSParseActionEntry;

typedef struct {
int32_t start;
int32_t end;
} TSCharacterRange;

struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
@@ -125,6 +130,24 @@ struct TSLanguage {
const TSStateId *primary_state_ids;
};

static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
uint32_t index = 0;
uint32_t size = len - index;
while (size > 1) {
uint32_t half_size = size / 2;
uint32_t mid_index = index + half_size;
TSCharacterRange *range = &ranges[mid_index];
if (lookahead >= range->start && lookahead <= range->end) {
return true;
} else if (lookahead > range->end) {
index = mid_index;
}
size -= half_size;
}
TSCharacterRange *range = &ranges[index];
return (lookahead >= range->start && lookahead <= range->end);
}

/*
* Lexer Macros
*/
@@ -154,6 +177,17 @@ struct TSLanguage {
goto next_state; \
}

#define ADVANCE_MAP(...) \
{ \
static const uint16_t map[] = { __VA_ARGS__ }; \
for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \
if (map[i] == lookahead) { \
state = map[i + 1]; \
goto next_state; \
} \
} \
}

#define SKIP(state_value) \
{ \
skip = true; \
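Finally, a hedged sketch of how a generated ts_lex state is expected to use the new macro (characters and state numbers are hypothetical): when a state begins with eight or more single-character transitions, they are emitted as one flat lookup table via ADVANCE_MAP instead of a chain of if statements, and set_contains (a binary search over the sorted ranges) handles the large sets that follow:

    case 5:
      ADVANCE_MAP(
        '(', 20,
        ')', 21,
        '+', 27,
        ',', 22,
        '-', 28,
        ':', 24,
        ';', 23,
        '=', 26,
      );
      if (set_contains(sym_identifier_character_set_1, 246, lookahead)) ADVANCE(40);
      END_STATE();  /* usual end-of-state macro in generated lexers */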