Identify large char sets for lexer using NFA transitions

Max Brunsfeld 2024-04-09 17:53:37 -07:00
parent 39be6972fe
commit be34bc9430
5 changed files with 313 additions and 317 deletions
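In outline: instead of detecting large character sets per lex state while rendering C code, the generator now scans the lexical grammar's NFA once, records every distinct transition character set spanning more than eight ranges, and threads that list from build_lex_table through build_tables to the renderer. A minimal, self-contained sketch of the detection idea — a toy Vec<Range<u32>> stands in for the crate's CharacterSet; the real pass below also records which token (if any) each set belongs to:

use std::ops::Range;

const LARGE_CHARACTER_RANGE_COUNT: usize = 8; // mirrors the constant added below

// Collect each distinct character set that spans more than eight ranges,
// deduplicated, in the order its transitions are first seen.
fn collect_large_sets(transition_sets: &[Vec<Range<u32>>]) -> Vec<Vec<Range<u32>>> {
    let mut large_sets = Vec::new();
    for set in transition_sets {
        if set.len() > LARGE_CHARACTER_RANGE_COUNT && !large_sets.contains(set) {
            large_sets.push(set.clone());
        }
    }
    large_sets
}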

View file

@@ -1,15 +1,23 @@
-use super::coincident_tokens::CoincidentTokenIndex;
-use super::token_conflicts::TokenConflictMap;
-use crate::generate::dedup::split_state_id_groups;
-use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
-use crate::generate::nfa::NfaCursor;
-use crate::generate::rules::{Symbol, TokenSet};
-use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
+use crate::generate::{
+    build_tables::{coincident_tokens::CoincidentTokenIndex, token_conflicts::TokenConflictMap},
+    dedup::split_state_id_groups,
+    grammars::{LexicalGrammar, SyntaxGrammar},
+    nfa::{CharacterSet, NfaCursor, NfaState},
+    rules::{Symbol, TokenSet},
+    tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable},
+};
 use log::info;
-use std::collections::hash_map::Entry;
-use std::collections::{HashMap, VecDeque};
+use std::collections::{hash_map::Entry, HashMap, VecDeque};
 use std::mem;

+const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
+
+pub struct LexTables {
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_lex_table(
     parse_table: &mut ParseTable,
     syntax_grammar: &SyntaxGrammar,
@@ -17,7 +25,7 @@ pub fn build_lex_table(
     keywords: &TokenSet,
     coincident_token_index: &CoincidentTokenIndex,
     token_conflict_map: &TokenConflictMap,
-) -> (LexTable, LexTable) {
+) -> LexTables {
     let keyword_lex_table = if syntax_grammar.word_token.is_some() {
         let mut builder = LexTableBuilder::new(lexical_grammar);
         builder.add_state_for_tokens(keywords);
@@ -74,10 +82,36 @@
         }
     }
-    let mut table = builder.table;
-    minimize_lex_table(&mut table, parse_table);
-    sort_states(&mut table, parse_table);
-    (table, keyword_lex_table)
+    let mut main_lex_table = builder.table;
+    minimize_lex_table(&mut main_lex_table, parse_table);
+    sort_states(&mut main_lex_table, parse_table);
+
+    let mut large_character_sets = Vec::new();
+    for (state_ix, state) in lexical_grammar.nfa.states.iter().enumerate() {
+        if let NfaState::Advance { chars, is_sep, .. } = state {
+            if chars.range_count() > LARGE_CHARACTER_RANGE_COUNT {
+                let symbol = if *is_sep {
+                    None
+                } else {
+                    let ix = lexical_grammar
+                        .variables
+                        .iter()
+                        .position(|v| v.start_state >= state_ix as u32);
+                    ix.map(Symbol::terminal)
+                };
+                if !large_character_sets.iter().any(|(_, set)| set == chars) {
+                    large_character_sets.push((symbol, chars.clone()));
+                }
+            }
+        }
+    }
+
+    LexTables {
+        main_lex_table,
+        keyword_lex_table,
+        large_character_sets,
+    }
 }

 struct QueueEntry {

View file

@@ -1,25 +1,37 @@
-pub mod build_lex_table;
-pub mod build_parse_table;
+mod build_lex_table;
+mod build_parse_table;
 mod coincident_tokens;
 mod item;
 mod item_set_builder;
 mod minimize_parse_table;
 mod token_conflicts;

-use self::build_lex_table::build_lex_table;
-use self::build_parse_table::{build_parse_table, ParseStateInfo};
-use self::coincident_tokens::CoincidentTokenIndex;
-use self::minimize_parse_table::minimize_parse_table;
-use self::token_conflicts::TokenConflictMap;
-use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use crate::generate::nfa::NfaCursor;
-use crate::generate::node_types::VariableInfo;
-use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
-use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
+use self::{
+    build_lex_table::build_lex_table,
+    build_parse_table::{build_parse_table, ParseStateInfo},
+    coincident_tokens::CoincidentTokenIndex,
+    minimize_parse_table::minimize_parse_table,
+    token_conflicts::TokenConflictMap,
+};
+use crate::generate::{
+    grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
+    nfa::{CharacterSet, NfaCursor},
+    node_types::VariableInfo,
+    rules::{AliasMap, Symbol, SymbolType, TokenSet},
+    tables::{LexTable, ParseAction, ParseTable, ParseTableEntry},
+};
 use anyhow::Result;
 use log::info;
 use std::collections::{BTreeSet, HashMap};

+pub struct Tables {
+    pub parse_table: ParseTable,
+    pub main_lex_table: LexTable,
+    pub keyword_lex_table: LexTable,
+    pub word_token: Option<Symbol>,
+    pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+}
+
 pub fn build_tables(
     syntax_grammar: &SyntaxGrammar,
     lexical_grammar: &LexicalGrammar,
@@ -27,7 +39,7 @@ pub fn build_tables(
     variable_info: &[VariableInfo],
     inlines: &InlinedProductionMap,
     report_symbol_name: Option<&str>,
-) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
+) -> Result<Tables> {
     let (mut parse_table, following_tokens, parse_state_info) =
         build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
     let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
@@ -56,7 +68,7 @@ pub fn build_tables(
         &token_conflict_map,
         &keywords,
     );
-    let (main_lex_table, keyword_lex_table) = build_lex_table(
+    let lex_tables = build_lex_table(
         &mut parse_table,
         syntax_grammar,
         lexical_grammar,
@@ -76,12 +88,14 @@ pub fn build_tables(
             report_symbol_name,
         );
     }
-    Ok((
+    Ok(Tables {
         parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        syntax_grammar.word_token,
-    ))
+        main_lex_table: lex_tables.main_lex_table,
+        keyword_lex_table: lex_tables.keyword_lex_table,
+        large_character_sets: lex_tables.large_character_sets,
+        word_token: syntax_grammar.word_token,
+    })
 }

 fn populate_error_state(

View file

@@ -1,21 +1,18 @@
+use self::grammars::InputGrammar;
+use anyhow::{anyhow, Context, Result};
+use build_tables::build_tables;
+use grammar_files::path_in_ignore;
+use lazy_static::lazy_static;
+use parse_grammar::parse_grammar;
+use prepare_grammar::prepare_grammar;
+use regex::{Regex, RegexBuilder};
+use render::render_c_code;
+use semver::Version;
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::{env, fs};
-
-use anyhow::{anyhow, Context, Result};
-use lazy_static::lazy_static;
-use regex::{Regex, RegexBuilder};
-use semver::Version;
-
-use build_tables::build_tables;
-use grammar_files::path_in_ignore;
-use grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
-use parse_grammar::parse_grammar;
-use prepare_grammar::prepare_grammar;
-use render::render_c_code;
-use rules::AliasMap;

 mod build_tables;
 mod dedup;
 mod grammar_files;
@@ -103,23 +100,12 @@ pub fn generate_parser_in_directory(
     // Parse and preprocess the grammar.
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let language_name = input_grammar.name;

     // Generate the parser and related files.
     let GeneratedParser {
         c_code,
         node_types_json,
-    } = generate_parser_for_grammar_with_opts(
-        &language_name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        abi_version,
-        report_symbol_name,
-    )?;
+    } = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;

     write_file(&src_path.join("parser.c"), c_code)?;
     write_file(&src_path.join("node-types.json"), node_types_json)?;
@@ -128,7 +114,7 @@ pub fn generate_parser_in_directory(
     write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;

     if !path_in_ignore(&repo_path) {
-        grammar_files::generate_grammar_files(&repo_path, &language_name, generate_bindings)?;
+        grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
     }

     Ok(())
@@ -137,29 +123,18 @@ pub fn generate_parser_in_directory(
 pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
     let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
     let input_grammar = parse_grammar(&grammar_json)?;
-    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
-        prepare_grammar(&input_grammar)?;
-    let parser = generate_parser_for_grammar_with_opts(
-        &input_grammar.name,
-        syntax_grammar,
-        lexical_grammar,
-        &inlines,
-        simple_aliases,
-        tree_sitter::LANGUAGE_VERSION,
-        None,
-    )?;
-    Ok((input_grammar.name, parser.c_code))
+    let parser =
+        generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
+    Ok((input_grammar.name.clone(), parser.c_code))
 }

 fn generate_parser_for_grammar_with_opts(
-    name: &str,
-    syntax_grammar: SyntaxGrammar,
-    lexical_grammar: LexicalGrammar,
-    inlines: &InlinedProductionMap,
-    simple_aliases: AliasMap,
+    input_grammar: &InputGrammar,
     abi_version: usize,
     report_symbol_name: Option<&str>,
 ) -> Result<GeneratedParser> {
+    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
+        prepare_grammar(input_grammar)?;
     let variable_info =
         node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
     let node_types_json = node_types::generate_node_types_json(
@@ -168,20 +143,17 @@ fn generate_parser_for_grammar_with_opts(
         &simple_aliases,
         &variable_info,
     );
-    let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
+    let tables = build_tables(
         &syntax_grammar,
         &lexical_grammar,
         &simple_aliases,
         &variable_info,
-        inlines,
+        &inlines,
         report_symbol_name,
     )?;
     let c_code = render_c_code(
-        name,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        &input_grammar.name,
+        tables,
         syntax_grammar,
         lexical_grammar,
         simple_aliases,
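Net effect for callers of this module: generate_parser_for_grammar_with_opts now takes the parsed InputGrammar and runs prepare_grammar itself, while the public entry point keeps its signature. A minimal usage sketch, assuming generate_parser_for_grammar is in scope; the output path is illustrative:

use anyhow::Result;

fn demo(grammar_json: &str) -> Result<()> {
    // Returns the grammar's name plus the rendered parser.c source, as
    // before; only the internal plumbing changed in this commit.
    let (name, c_code) = generate_parser_for_grammar(grammar_json)?;
    std::fs::write(format!("{name}-parser.c"), c_code)?; // illustrative path
    Ok(())
}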

View file

@@ -1,10 +1,10 @@
-use std::char;
-use std::cmp::max;
-use std::cmp::Ordering;
-use std::collections::HashSet;
-use std::fmt;
-use std::mem::swap;
-use std::ops::Range;
+use std::{
+    char,
+    cmp::{max, Ordering},
+    fmt,
+    mem::swap,
+    ops::{Range, RangeInclusive},
+};

 /// A set of characters represented as a vector of ranges.
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -114,6 +114,11 @@ impl CharacterSet {
         self
     }

+    pub fn assign(&mut self, other: &Self) {
+        self.ranges.clear();
+        self.ranges.extend_from_slice(&other.ranges);
+    }
+
     fn add_int_range(&mut self, mut i: usize, start: u32, end: u32) -> usize {
         while i < self.ranges.len() {
             let range = &mut self.ranges[i];
@@ -285,12 +290,24 @@ impl CharacterSet {
         self.add(&other)
     }

-    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
-        self.ranges.iter().flat_map(std::clone::Clone::clone)
+    pub fn char_codes(&self) -> impl Iterator<Item = u32> + '_ {
+        self.ranges.iter().flat_map(Clone::clone)
     }

     pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
-        self.iter().filter_map(char::from_u32)
+        self.char_codes().filter_map(char::from_u32)
     }

+    pub fn range_count(&self) -> usize {
+        self.ranges.len()
+    }
+
+    pub fn ranges(&self) -> impl Iterator<Item = RangeInclusive<char>> + '_ {
+        self.ranges.iter().filter_map(|range| {
+            let start = range.clone().find_map(char::from_u32)?;
+            let end = (range.start..range.end).rev().find_map(char::from_u32)?;
+            Some(start..=end)
+        })
+    }
+
     pub fn is_empty(&self) -> bool {
@@ -299,41 +316,46 @@ impl CharacterSet {
     /// Get a reduced list of character ranges, assuming that a given
     /// set of characters can be safely ignored.
-    pub fn simplify_ignoring<'a>(
-        &'a self,
-        ruled_out_characters: &'a HashSet<u32>,
-    ) -> Vec<Range<char>> {
-        let mut prev_range: Option<Range<char>> = None;
-        self.chars()
-            .map(|c| (c, false))
-            .chain(Some(('\0', true)))
-            .filter_map(move |(c, done)| {
-                if done {
-                    return prev_range.clone();
-                }
-                if ruled_out_characters.contains(&(c as u32)) {
-                    return None;
-                }
-                if let Some(range) = prev_range.clone() {
-                    let mut prev_range_successor = range.end as u32 + 1;
-                    while prev_range_successor < c as u32 {
-                        if !ruled_out_characters.contains(&prev_range_successor) {
-                            prev_range = Some(c..c);
-                            return Some(range);
-                        }
-                        prev_range_successor += 1;
-                    }
-                    prev_range = Some(range.start..c);
-                } else {
-                    prev_range = Some(c..c);
-                }
-                None
-            })
-            .collect()
+    pub fn simplify_ignoring(&self, ruled_out_characters: &Self) -> Self {
+        let mut prev_range: Option<Range<u32>> = None;
+        Self {
+            ranges: self
+                .char_codes()
+                .map(|c| (c, false))
+                .chain(Some(('\0' as u32, true)))
+                .filter_map(move |(c, done)| {
+                    if done {
+                        return prev_range.clone();
+                    }
+                    if ruled_out_characters.contains_code(c) {
+                        return None;
+                    }
+                    if let Some(range) = prev_range.clone() {
+                        let mut prev_range_successor = range.end as u32;
+                        while prev_range_successor < c as u32 {
+                            if !ruled_out_characters.contains_code(prev_range_successor) {
+                                prev_range = Some(c..(c + 1));
+                                return Some(range);
+                            }
+                            prev_range_successor += 1;
+                        }
+                        prev_range = Some(range.start..(c + 1));
+                    } else {
+                        prev_range = Some(c..(c + 1));
+                    }
+                    None
+                })
+                .collect(),
+        }
     }

     pub fn contains(&self, c: char) -> bool {
-        self.ranges.iter().any(|r| r.contains(&(c as u32)))
+        self.contains_code(c as u32)
+    }
+
+    fn contains_code(&self, c: u32) -> bool {
+        // self.ranges.iter().any(|r| r.start <= c && r.end >= c)
+        self.ranges.iter().any(|r| r.contains(&c))
     }
 }
@@ -1033,7 +1055,7 @@ mod tests {
     #[test]
     #[allow(clippy::single_range_in_vec_init)]
-    fn test_character_set_get_ranges() {
+    fn test_character_set_simplify_ignoring() {
         struct Row {
             chars: Vec<char>,
             ruled_out_chars: Vec<char>,
@@ -1056,6 +1078,11 @@ mod tests {
                 ruled_out_chars: vec!['d', 'f', 'g'],
                 expected_ranges: vec!['a'..'h', 'z'..'z'],
             },
+            Row {
+                chars: vec!['a', 'b', 'c', 'g', 'h', 'i'],
+                ruled_out_chars: vec!['d', 'j'],
+                expected_ranges: vec!['a'..'c', 'g'..'i'],
+            },
         ];

         for Row {
@@ -1064,13 +1091,23 @@ mod tests {
             expected_ranges,
         } in &table
         {
-            let ruled_out_chars = ruled_out_chars.iter().map(|c: &char| *c as u32).collect();
+            let ruled_out_chars = ruled_out_chars
+                .iter()
+                .fold(CharacterSet::empty(), |set, c| set.add_char(*c));
             let mut set = CharacterSet::empty();
             for c in chars {
                 set = set.add_char(*c);
             }
-            let ranges = set.simplify_ignoring(&ruled_out_chars);
-            assert_eq!(ranges, *expected_ranges);
+            let actual = set.simplify_ignoring(&ruled_out_chars);
+            let expected = expected_ranges
+                .iter()
+                .fold(CharacterSet::empty(), |set, range| {
+                    set.add_range(range.start, range.end)
+                });
+            assert_eq!(
+                actual, expected,
+                "chars: {chars:?}, ruled out chars: {ruled_out_chars:?}"
+            );
         }
     }
 }
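The new test row pins down the key subtlety of simplify_ignoring: a ruled-out character only lets two ranges merge if every character in the gap between them is ruled out. Roughly, using the same CharacterSet API as the diff above:

// {a, b, c, g, h, i} ignoring {d, j} stays as two ranges ('a'..='c' and
// 'g'..='i'): 'd' alone cannot bridge the gap, because 'e' and 'f' are
// not ruled out. Ruling out d, e, and f would merge it into 'a'..='i'.
let set = ['a', 'b', 'c', 'g', 'h', 'i']
    .iter()
    .fold(CharacterSet::empty(), |set, c| set.add_char(*c));
let ruled_out = CharacterSet::empty().add_char('d').add_char('j');
assert_eq!(set.simplify_ignoring(&ruled_out).range_count(), 2);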

View file

@@ -1,12 +1,13 @@
 use super::{
+    build_tables::Tables,
     grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
     nfa::CharacterSet,
     rules::{Alias, AliasMap, Symbol, SymbolType},
     tables::{
         AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
         ParseTableEntry,
     },
 };
 use core::ops::Range;
 use std::{
     cmp,
     collections::{HashMap, HashSet},
@@ -62,6 +63,8 @@ struct Generator {
     parse_table: ParseTable,
     main_lex_table: LexTable,
     keyword_lex_table: LexTable,
+    large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
+    large_character_set_constant_names: Vec<String>,
     large_state_count: usize,
     keyword_capture_token: Option<Symbol>,
     syntax_grammar: SyntaxGrammar,
@@ -78,18 +81,6 @@ struct Generator {
     abi_version: usize,
 }

-struct TransitionSummary {
-    is_included: bool,
-    ranges: Vec<Range<char>>,
-    call_id: Option<usize>,
-}
-
-struct LargeCharacterSetInfo {
-    ranges: Vec<Range<char>>,
-    symbol: Symbol,
-    index: usize,
-}
-
 impl Generator {
     fn generate(mut self) -> String {
         self.init();
@@ -117,14 +108,20 @@ impl Generator {
             self.add_primary_state_id_list();
         }

+        // Generate a helper function for each large character set.
+        // let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
+        for ix in 0..self.large_character_sets.len() {
+            self.add_character_set(ix);
+        }
+
         let mut main_lex_table = LexTable::default();
         swap(&mut main_lex_table, &mut self.main_lex_table);
-        self.add_lex_function("ts_lex", main_lex_table, true);
+        self.add_lex_function("ts_lex", main_lex_table);

         if self.keyword_capture_token.is_some() {
             let mut keyword_lex_table = LexTable::default();
             swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
-            self.add_lex_function("ts_lex_keywords", keyword_lex_table, false);
+            self.add_lex_function("ts_lex_keywords", keyword_lex_table);
         }

         self.add_lex_modes_list();
@@ -662,83 +659,7 @@ impl Generator {
         add_line!(self, "");
     }

-    fn add_lex_function(
-        &mut self,
-        name: &str,
-        lex_table: LexTable,
-        extract_helper_functions: bool,
-    ) {
-        let mut ruled_out_chars = HashSet::new();
-        let mut large_character_sets = Vec::<LargeCharacterSetInfo>::new();
-
-        // For each lex state, compute a summary of the code that needs to be
-        // generated.
-        let state_transition_summaries = lex_table
-            .states
-            .iter()
-            .map(|state| {
-                ruled_out_chars.clear();
-
-                // For each state transition, compute the set of character ranges
-                // that need to be checked.
-                state
-                    .advance_actions
-                    .iter()
-                    .map(|(chars, action)| {
-                        let is_included = !chars.contains(std::char::MAX);
-                        let mut ranges;
-                        if is_included {
-                            ranges = chars.simplify_ignoring(&ruled_out_chars);
-                            ruled_out_chars.extend(chars.iter());
-                        } else {
-                            ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars);
-                            ranges.insert(0, '\0'..'\0');
-                        }
-
-                        // Record any large character sets so that they can be extracted
-                        // into helper functions, reducing code duplication.
-                        let mut call_id = None;
-                        if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT {
-                            let char_set_symbol = self
-                                .symbol_for_advance_action(action, &lex_table)
-                                .expect("No symbol for lex state");
-                            let mut count_for_symbol = 0;
-                            for (i, info) in large_character_sets.iter_mut().enumerate() {
-                                if info.ranges == ranges {
-                                    call_id = Some(i);
-                                    break;
-                                }
-                                if info.symbol == char_set_symbol {
-                                    count_for_symbol += 1;
-                                }
-                            }
-                            if call_id.is_none() {
-                                call_id = Some(large_character_sets.len());
-                                large_character_sets.push(LargeCharacterSetInfo {
-                                    symbol: char_set_symbol,
-                                    index: count_for_symbol + 1,
-                                    ranges: ranges.clone(),
-                                });
-                            }
-                        }
-
-                        TransitionSummary {
-                            is_included,
-                            ranges,
-                            call_id,
-                        }
-                    })
-                    .collect()
-            })
-            .collect::<Vec<Vec<_>>>();
-
-        // Generate a helper function for each large character set.
-        let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
-        sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
-        for info in sorted_large_char_sets {
-            self.add_character_set(info);
-        }
-
+    fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
         add_line!(
             self,
             "static bool {name}(TSLexer *lexer, TSStateId state) {{",
@@ -753,7 +674,7 @@ impl Generator {
         for (i, state) in lex_table.states.into_iter().enumerate() {
             add_line!(self, "case {i}:");
             indent!(self);
-            self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets);
+            self.add_lex_state(state);
             dedent!(self);
         }
@@ -770,35 +691,7 @@ impl Generator {
         add_line!(self, "");
     }

-    fn symbol_for_advance_action(
-        &self,
-        action: &AdvanceAction,
-        lex_table: &LexTable,
-    ) -> Option<Symbol> {
-        let mut state_ids = vec![action.state];
-        let mut i = 0;
-        while i < state_ids.len() {
-            let id = state_ids[i];
-            let state = &lex_table.states[id];
-            if let Some(accept) = state.accept_action {
-                return Some(accept);
-            }
-            for (_, action) in &state.advance_actions {
-                if !state_ids.contains(&action.state) {
-                    state_ids.push(action.state);
-                }
-            }
-            i += 1;
-        }
-        None
-    }
-
-    fn add_lex_state(
-        &mut self,
-        state: LexState,
-        transition_info: &[TransitionSummary],
-        large_character_sets: &[LargeCharacterSetInfo],
-    ) {
+    fn add_lex_state(&mut self, state: LexState) {
         if let Some(accept_action) = state.accept_action {
             add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
         }
@@ -807,37 +700,69 @@ impl Generator {
             add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
         }

-        for (i, (_, action)) in state.advance_actions.into_iter().enumerate() {
-            let transition = &transition_info[i];
+        let mut chars_copy = CharacterSet::empty();
+        let mut large_set = CharacterSet::empty();
+        let mut ruled_out_chars = CharacterSet::empty();
+        for (chars, action) in state.advance_actions {
             add_whitespace!(self);

+            // For each state transition, compute the set of character ranges
+            // that need to be checked.
+            let simplified = chars.simplify_ignoring(&ruled_out_chars);
+            ruled_out_chars = ruled_out_chars.add(&chars);
+            let mut chars = simplified;
+
+            // Find a large character set that matches the transition's character set,
+            // allowing for ruled-out characters for previous transitions.
+            let mut call_id = None;
+            if chars.range_count() >= LARGE_CHARACTER_RANGE_COUNT {
+                for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
+                    chars_copy.assign(&chars);
+                    large_set.assign(&set);
+                    chars_copy.remove_intersection(&mut large_set);
+                    if chars_copy.is_empty()
+                        && large_set.chars().all(|c| ruled_out_chars.contains(c))
+                    {
+                        call_id = Some(ix);
+                        break;
+                    }
+                }
+            }
+
+            let mut in_condition = false;
+            if call_id.is_some() || !chars.is_empty() {
+                add!(self, "if (");
+                in_condition = true;
+            }
+
             // If there is a helper function for this transition's character
             // set, then generate a call to that helper function.
-            if let Some(call_id) = transition.call_id {
-                let info = &large_character_sets[call_id];
-                add!(self, "if (");
-                if !transition.is_included {
-                    add!(self, "!");
-                }
+            if let Some(call_id) = call_id {
                 add!(
                     self,
-                    "set_contains({}_character_set_{}, {}, lookahead)) ",
-                    self.symbol_ids[&info.symbol],
-                    info.index,
-                    info.ranges.len(),
+                    "set_contains({}, {}, lookahead)",
+                    self.large_character_set_constant_names[call_id],
+                    chars.range_count(),
                 );
-                self.add_advance_action(&action);
-                add!(self, "\n");
-                continue;
             }
-
             // Otherwise, generate code to compare the lookahead character
             // with all of the character ranges.
-            if !transition.ranges.is_empty() {
-                add!(self, "if (");
-                self.add_character_range_conditions(&transition.ranges, transition.is_included, 2);
+            else if !chars.is_empty() {
+                if call_id.is_some() {
+                    add!(self, " || ");
+                }
+                let is_included = !chars.contains(char::MAX);
+                if !is_included {
+                    chars = chars.negate().add_char('\0');
+                }
+                self.add_character_range_conditions(&chars, is_included, 2);
+            }
+
+            if in_condition {
+                add!(self, ") ");
             }
             self.add_advance_action(&action);
             add!(self, "\n");
         }
@@ -847,7 +772,7 @@ impl Generator {
     fn add_character_range_conditions(
         &mut self,
-        ranges: &[Range<char>],
+        characters: &CharacterSet,
         is_included: bool,
         indent_count: usize,
     ) {
@@ -859,86 +784,99 @@ impl Generator {
         // parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with
         // precedence of `&&` and `||`
         let (mut need_open_paren, mut need_close_paren) = (false, false);
-        for (i, range) in ranges.iter().enumerate() {
+        for (i, range) in characters.ranges().enumerate() {
+            let start = *range.start();
+            let end = *range.end();
             if is_included {
                 if i > 0 {
                     add!(self, " ||{line_break}");
                 }
-                if range.start == '\0' {
+                if start == '\0' {
                     add!(self, "!eof && ");
                     (need_open_paren, need_close_paren) = (true, true);
                 }
-                if range.end == range.start {
+                if end == start {
                     if need_open_paren {
                         add!(self, "(");
                         need_open_paren = false;
                     }
                     add!(self, "lookahead == ");
-                    self.add_character(range.start);
-                    if need_close_paren && i == ranges.len() - 1 {
+                    self.add_character(start);
+                    if need_close_paren && i == characters.range_count() - 1 {
                         add!(self, ")");
                         need_close_paren = false;
                     }
-                } else if range.end as u32 == range.start as u32 + 1 {
+                } else if end as u32 == start as u32 + 1 {
                     add!(self, "lookahead == ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " ||{line_break}lookahead == ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                 } else {
                     add!(self, "(");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " <= lookahead && lookahead <= ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                     add!(self, ")");
                 }
             } else {
                 if i > 0 {
                     add!(self, " &&{line_break}");
                 }
-                if range.end == range.start {
+                if end == start {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
-                } else if range.end as u32 == range.start as u32 + 1 {
+                    self.add_character(start);
+                } else if end as u32 == start as u32 + 1 {
                     add!(self, "lookahead != ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " &&{line_break}lookahead != ");
-                    self.add_character(range.end);
-                } else if range.start != '\0' {
+                    self.add_character(end);
+                } else if start != '\0' {
                     add!(self, "(lookahead < ");
-                    self.add_character(range.start);
+                    self.add_character(start);
                     add!(self, " || ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                     add!(self, " < lookahead)");
                 } else {
                     add!(self, "lookahead > ");
-                    self.add_character(range.end);
+                    self.add_character(end);
                 }
             }
         }
     }

-    fn add_character_set(&mut self, info: &LargeCharacterSetInfo) {
-        add_line!(
-            self,
-            "static TSCharacterRange {}_character_set_{}[] = {{",
-            self.symbol_ids[&info.symbol],
-            info.index
-        );
+    fn add_character_set(&mut self, ix: usize) {
+        let (symbol, characters) = self.large_character_sets[ix].clone();
+        let count = self.large_character_sets[0..ix]
+            .iter()
+            .filter(|(sym, _)| *sym == symbol)
+            .count();
+        let constant_name = if let Some(symbol) = symbol {
+            format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
+        } else {
+            format!("extras_character_set_{}", count)
+        };
+
+        add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
+        self.large_character_set_constant_names.push(constant_name);
         indent!(self);
-        for chunk in info.ranges.chunks(8) {
-            add_whitespace!(self);
-            for (i, range) in chunk.iter().enumerate() {
-                if i > 0 {
-                    add!(self, " ");
-                }
-                add!(self, "{{");
-                self.add_character(range.start);
-                add!(self, ", ");
-                self.add_character(range.end);
-                add!(self, "}},");
-            }
-            add!(self, "\n");
+        for (ix, range) in characters.ranges().enumerate() {
+            let column = ix % 8;
+            if column == 0 {
+                if ix > 0 {
+                    add!(self, "\n");
+                }
+                add_whitespace!(self);
+            } else {
+                add!(self, " ");
+            }
+            add!(self, "{{");
+            self.add_character(*range.start());
+            add!(self, ", ");
+            self.add_character(*range.end());
+            add!(self, "}},");
         }
+        add!(self, "\n");
         dedent!(self);
         add_line!(self, "}};");
         add_line!(self, "");
@@ -1610,10 +1548,12 @@ impl Generator {
             '\t' => add!(self, "'\\t'"),
             '\r' => add!(self, "'\\r'"),
             _ => {
-                if c == ' ' || c.is_ascii_graphic() {
+                if c == '\0' {
+                    add!(self, "0")
+                } else if c == ' ' || c.is_ascii_graphic() {
                     add!(self, "'{c}'");
                 } else {
-                    add!(self, "{}", c as u32);
+                    add!(self, "0x{:02x}", c as u32);
                 }
             }
         }
@@ -1641,10 +1581,7 @@
 #[allow(clippy::too_many_arguments)]
 pub fn render_c_code(
     name: &str,
-    parse_table: ParseTable,
-    main_lex_table: LexTable,
-    keyword_lex_table: LexTable,
-    keyword_capture_token: Option<Symbol>,
+    tables: Tables,
     syntax_grammar: SyntaxGrammar,
     lexical_grammar: LexicalGrammar,
     default_aliases: AliasMap,
@@ -1660,10 +1597,12 @@ pub fn render_c_code(
         indent_level: 0,
         language_name: name.to_string(),
         large_state_count: 0,
-        parse_table,
-        main_lex_table,
-        keyword_lex_table,
-        keyword_capture_token,
+        parse_table: tables.parse_table,
+        main_lex_table: tables.main_lex_table,
+        keyword_lex_table: tables.keyword_lex_table,
+        keyword_capture_token: tables.word_token,
+        large_character_sets: tables.large_character_sets,
+        large_character_set_constant_names: Vec::new(),
         syntax_grammar,
         lexical_grammar,
         default_aliases,
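For reference, the matching step in add_lex_state above boils down to a subset test: a transition may call a large set's set_contains helper when the large set covers all of the transition's characters and anything extra it matches was already claimed by an earlier transition. A sketch of that predicate using the CharacterSet operations from this diff — a hypothetical standalone helper; the real code inlines it and reuses two scratch sets to avoid per-transition allocation:

fn can_reuse_large_set(
    chars: &CharacterSet,     // this transition's simplified character set
    large_set: &CharacterSet, // a candidate shared character set
    ruled_out: &CharacterSet, // characters consumed by earlier transitions
) -> bool {
    let mut chars_copy = chars.clone();
    let mut large_copy = large_set.clone();
    // remove_intersection strips the shared characters from both sides:
    // afterwards `chars_copy` holds characters the large set misses, and
    // `large_copy` holds extras that the large set would wrongly match.
    chars_copy.remove_intersection(&mut large_copy);
    chars_copy.is_empty() && large_copy.chars().all(|c| ruled_out.contains(c))
}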