Identify large char sets for lexer using NFA transitions
This commit is contained in:
parent
39be6972fe
commit
be34bc9430
5 changed files with 313 additions and 317 deletions
|
|
@ -1,12 +1,13 @@
|
|||
use super::{
|
||||
build_tables::Tables,
|
||||
grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
|
||||
nfa::CharacterSet,
|
||||
rules::{Alias, AliasMap, Symbol, SymbolType},
|
||||
tables::{
|
||||
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
|
||||
ParseTableEntry,
|
||||
},
|
||||
};
|
||||
use core::ops::Range;
|
||||
use std::{
|
||||
cmp,
|
||||
collections::{HashMap, HashSet},
|
||||
|
|
@ -62,6 +63,8 @@ struct Generator {
|
|||
parse_table: ParseTable,
|
||||
main_lex_table: LexTable,
|
||||
keyword_lex_table: LexTable,
|
||||
large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
|
||||
large_character_set_constant_names: Vec<String>,
|
||||
large_state_count: usize,
|
||||
keyword_capture_token: Option<Symbol>,
|
||||
syntax_grammar: SyntaxGrammar,
|
||||
|
|
@ -78,18 +81,6 @@ struct Generator {
|
|||
abi_version: usize,
|
||||
}
|
||||
|
||||
struct TransitionSummary {
|
||||
is_included: bool,
|
||||
ranges: Vec<Range<char>>,
|
||||
call_id: Option<usize>,
|
||||
}
|
||||
|
||||
struct LargeCharacterSetInfo {
|
||||
ranges: Vec<Range<char>>,
|
||||
symbol: Symbol,
|
||||
index: usize,
|
||||
}
|
||||
|
||||
impl Generator {
|
||||
fn generate(mut self) -> String {
|
||||
self.init();
|
||||
|
|
@ -117,14 +108,20 @@ impl Generator {
|
|||
self.add_primary_state_id_list();
|
||||
}
|
||||
|
||||
// Generate a helper function for each large character set.
|
||||
// let mut sorted_large_char_sets = self.large_character_sets.iter().collect::<Vec<_>>();
|
||||
for ix in 0..self.large_character_sets.len() {
|
||||
self.add_character_set(ix);
|
||||
}
|
||||
|
||||
let mut main_lex_table = LexTable::default();
|
||||
swap(&mut main_lex_table, &mut self.main_lex_table);
|
||||
self.add_lex_function("ts_lex", main_lex_table, true);
|
||||
self.add_lex_function("ts_lex", main_lex_table);
|
||||
|
||||
if self.keyword_capture_token.is_some() {
|
||||
let mut keyword_lex_table = LexTable::default();
|
||||
swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
|
||||
self.add_lex_function("ts_lex_keywords", keyword_lex_table, false);
|
||||
self.add_lex_function("ts_lex_keywords", keyword_lex_table);
|
||||
}
|
||||
|
||||
self.add_lex_modes_list();
|
||||
|
|
@ -662,83 +659,7 @@ impl Generator {
|
|||
add_line!(self, "");
|
||||
}
|
||||
|
||||
fn add_lex_function(
|
||||
&mut self,
|
||||
name: &str,
|
||||
lex_table: LexTable,
|
||||
extract_helper_functions: bool,
|
||||
) {
|
||||
let mut ruled_out_chars = HashSet::new();
|
||||
let mut large_character_sets = Vec::<LargeCharacterSetInfo>::new();
|
||||
|
||||
// For each lex state, compute a summary of the code that needs to be
|
||||
// generated.
|
||||
let state_transition_summaries = lex_table
|
||||
.states
|
||||
.iter()
|
||||
.map(|state| {
|
||||
ruled_out_chars.clear();
|
||||
|
||||
// For each state transition, compute the set of character ranges
|
||||
// that need to be checked.
|
||||
state
|
||||
.advance_actions
|
||||
.iter()
|
||||
.map(|(chars, action)| {
|
||||
let is_included = !chars.contains(std::char::MAX);
|
||||
let mut ranges;
|
||||
if is_included {
|
||||
ranges = chars.simplify_ignoring(&ruled_out_chars);
|
||||
ruled_out_chars.extend(chars.iter());
|
||||
} else {
|
||||
ranges = chars.clone().negate().simplify_ignoring(&ruled_out_chars);
|
||||
ranges.insert(0, '\0'..'\0');
|
||||
}
|
||||
|
||||
// Record any large character sets so that they can be extracted
|
||||
// into helper functions, reducing code duplication.
|
||||
let mut call_id = None;
|
||||
if extract_helper_functions && ranges.len() > LARGE_CHARACTER_RANGE_COUNT {
|
||||
let char_set_symbol = self
|
||||
.symbol_for_advance_action(action, &lex_table)
|
||||
.expect("No symbol for lex state");
|
||||
let mut count_for_symbol = 0;
|
||||
for (i, info) in large_character_sets.iter_mut().enumerate() {
|
||||
if info.ranges == ranges {
|
||||
call_id = Some(i);
|
||||
break;
|
||||
}
|
||||
if info.symbol == char_set_symbol {
|
||||
count_for_symbol += 1;
|
||||
}
|
||||
}
|
||||
if call_id.is_none() {
|
||||
call_id = Some(large_character_sets.len());
|
||||
large_character_sets.push(LargeCharacterSetInfo {
|
||||
symbol: char_set_symbol,
|
||||
index: count_for_symbol + 1,
|
||||
ranges: ranges.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
TransitionSummary {
|
||||
is_included,
|
||||
ranges,
|
||||
call_id,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect::<Vec<Vec<_>>>();
|
||||
|
||||
// Generate a helper function for each large character set.
|
||||
let mut sorted_large_char_sets = large_character_sets.iter().collect::<Vec<_>>();
|
||||
sorted_large_char_sets.sort_unstable_by_key(|info| (info.symbol, info.index));
|
||||
for info in sorted_large_char_sets {
|
||||
self.add_character_set(info);
|
||||
}
|
||||
|
||||
fn add_lex_function(&mut self, name: &str, lex_table: LexTable) {
|
||||
add_line!(
|
||||
self,
|
||||
"static bool {name}(TSLexer *lexer, TSStateId state) {{",
|
||||
|
|
@ -753,7 +674,7 @@ impl Generator {
|
|||
for (i, state) in lex_table.states.into_iter().enumerate() {
|
||||
add_line!(self, "case {i}:");
|
||||
indent!(self);
|
||||
self.add_lex_state(state, &state_transition_summaries[i], &large_character_sets);
|
||||
self.add_lex_state(state);
|
||||
dedent!(self);
|
||||
}
|
||||
|
||||
|
|
@ -770,35 +691,7 @@ impl Generator {
|
|||
add_line!(self, "");
|
||||
}
|
||||
|
||||
fn symbol_for_advance_action(
|
||||
&self,
|
||||
action: &AdvanceAction,
|
||||
lex_table: &LexTable,
|
||||
) -> Option<Symbol> {
|
||||
let mut state_ids = vec![action.state];
|
||||
let mut i = 0;
|
||||
while i < state_ids.len() {
|
||||
let id = state_ids[i];
|
||||
let state = &lex_table.states[id];
|
||||
if let Some(accept) = state.accept_action {
|
||||
return Some(accept);
|
||||
}
|
||||
for (_, action) in &state.advance_actions {
|
||||
if !state_ids.contains(&action.state) {
|
||||
state_ids.push(action.state);
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn add_lex_state(
|
||||
&mut self,
|
||||
state: LexState,
|
||||
transition_info: &[TransitionSummary],
|
||||
large_character_sets: &[LargeCharacterSetInfo],
|
||||
) {
|
||||
fn add_lex_state(&mut self, state: LexState) {
|
||||
if let Some(accept_action) = state.accept_action {
|
||||
add_line!(self, "ACCEPT_TOKEN({});", self.symbol_ids[&accept_action]);
|
||||
}
|
||||
|
|
@ -807,37 +700,69 @@ impl Generator {
|
|||
add_line!(self, "if (eof) ADVANCE({});", eof_action.state);
|
||||
}
|
||||
|
||||
for (i, (_, action)) in state.advance_actions.into_iter().enumerate() {
|
||||
let transition = &transition_info[i];
|
||||
let mut chars_copy = CharacterSet::empty();
|
||||
let mut large_set = CharacterSet::empty();
|
||||
let mut ruled_out_chars = CharacterSet::empty();
|
||||
|
||||
for (chars, action) in state.advance_actions {
|
||||
add_whitespace!(self);
|
||||
|
||||
// For each state transition, compute the set of character ranges
|
||||
// that need to be checked.
|
||||
let simplified = chars.simplify_ignoring(&ruled_out_chars);
|
||||
ruled_out_chars = ruled_out_chars.add(&chars);
|
||||
let mut chars = simplified;
|
||||
|
||||
// Find a large character set that matches the transition's character set,
|
||||
// allowing for ruled-out characters for previous transitions.
|
||||
let mut call_id = None;
|
||||
if chars.range_count() >= LARGE_CHARACTER_RANGE_COUNT {
|
||||
for (ix, (_, set)) in self.large_character_sets.iter().enumerate() {
|
||||
chars_copy.assign(&chars);
|
||||
large_set.assign(&set);
|
||||
chars_copy.remove_intersection(&mut large_set);
|
||||
if chars_copy.is_empty()
|
||||
&& large_set.chars().all(|c| ruled_out_chars.contains(c))
|
||||
{
|
||||
call_id = Some(ix);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut in_condition = false;
|
||||
if call_id.is_some() || !chars.is_empty() {
|
||||
add!(self, "if (");
|
||||
in_condition = true;
|
||||
}
|
||||
|
||||
// If there is a helper function for this transition's character
|
||||
// set, then generate a call to that helper function.
|
||||
if let Some(call_id) = transition.call_id {
|
||||
let info = &large_character_sets[call_id];
|
||||
add!(self, "if (");
|
||||
if !transition.is_included {
|
||||
add!(self, "!");
|
||||
}
|
||||
if let Some(call_id) = call_id {
|
||||
add!(
|
||||
self,
|
||||
"set_contains({}_character_set_{}, {}, lookahead)) ",
|
||||
self.symbol_ids[&info.symbol],
|
||||
info.index,
|
||||
info.ranges.len(),
|
||||
"set_contains({}, {}, lookahead)",
|
||||
self.large_character_set_constant_names[call_id],
|
||||
chars.range_count(),
|
||||
);
|
||||
self.add_advance_action(&action);
|
||||
add!(self, "\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise, generate code to compare the lookahead character
|
||||
// with all of the character ranges.
|
||||
if !transition.ranges.is_empty() {
|
||||
add!(self, "if (");
|
||||
self.add_character_range_conditions(&transition.ranges, transition.is_included, 2);
|
||||
else if !chars.is_empty() {
|
||||
if call_id.is_some() {
|
||||
add!(self, " || ");
|
||||
}
|
||||
let is_included = !chars.contains(char::MAX);
|
||||
if !is_included {
|
||||
chars = chars.negate().add_char('\0');
|
||||
}
|
||||
self.add_character_range_conditions(&chars, is_included, 2);
|
||||
}
|
||||
|
||||
if in_condition {
|
||||
add!(self, ") ");
|
||||
}
|
||||
|
||||
self.add_advance_action(&action);
|
||||
add!(self, "\n");
|
||||
}
|
||||
|
|
@ -847,7 +772,7 @@ impl Generator {
|
|||
|
||||
fn add_character_range_conditions(
|
||||
&mut self,
|
||||
ranges: &[Range<char>],
|
||||
characters: &CharacterSet,
|
||||
is_included: bool,
|
||||
indent_count: usize,
|
||||
) {
|
||||
|
|
@ -859,86 +784,99 @@ impl Generator {
|
|||
// parenthesis needed if we add the `!eof` condition to explicitly avoid confusion with
|
||||
// precedence of `&&` and `||`
|
||||
let (mut need_open_paren, mut need_close_paren) = (false, false);
|
||||
for (i, range) in ranges.iter().enumerate() {
|
||||
for (i, range) in characters.ranges().enumerate() {
|
||||
let start = *range.start();
|
||||
let end = *range.end();
|
||||
if is_included {
|
||||
if i > 0 {
|
||||
add!(self, " ||{line_break}");
|
||||
}
|
||||
if range.start == '\0' {
|
||||
if start == '\0' {
|
||||
add!(self, "!eof && ");
|
||||
(need_open_paren, need_close_paren) = (true, true);
|
||||
}
|
||||
if range.end == range.start {
|
||||
if end == start {
|
||||
if need_open_paren {
|
||||
add!(self, "(");
|
||||
need_open_paren = false;
|
||||
}
|
||||
add!(self, "lookahead == ");
|
||||
self.add_character(range.start);
|
||||
if need_close_paren && i == ranges.len() - 1 {
|
||||
self.add_character(start);
|
||||
if need_close_paren && i == characters.range_count() - 1 {
|
||||
add!(self, ")");
|
||||
need_close_paren = false;
|
||||
}
|
||||
} else if range.end as u32 == range.start as u32 + 1 {
|
||||
} else if end as u32 == start as u32 + 1 {
|
||||
add!(self, "lookahead == ");
|
||||
self.add_character(range.start);
|
||||
self.add_character(start);
|
||||
add!(self, " ||{line_break}lookahead == ");
|
||||
self.add_character(range.end);
|
||||
self.add_character(end);
|
||||
} else {
|
||||
add!(self, "(");
|
||||
self.add_character(range.start);
|
||||
self.add_character(start);
|
||||
add!(self, " <= lookahead && lookahead <= ");
|
||||
self.add_character(range.end);
|
||||
self.add_character(end);
|
||||
add!(self, ")");
|
||||
}
|
||||
} else {
|
||||
if i > 0 {
|
||||
add!(self, " &&{line_break}");
|
||||
}
|
||||
if range.end == range.start {
|
||||
if end == start {
|
||||
add!(self, "lookahead != ");
|
||||
self.add_character(range.start);
|
||||
} else if range.end as u32 == range.start as u32 + 1 {
|
||||
self.add_character(start);
|
||||
} else if end as u32 == start as u32 + 1 {
|
||||
add!(self, "lookahead != ");
|
||||
self.add_character(range.start);
|
||||
self.add_character(start);
|
||||
add!(self, " &&{line_break}lookahead != ");
|
||||
self.add_character(range.end);
|
||||
} else if range.start != '\0' {
|
||||
self.add_character(end);
|
||||
} else if start != '\0' {
|
||||
add!(self, "(lookahead < ");
|
||||
self.add_character(range.start);
|
||||
self.add_character(start);
|
||||
add!(self, " || ");
|
||||
self.add_character(range.end);
|
||||
self.add_character(end);
|
||||
add!(self, " < lookahead)");
|
||||
} else {
|
||||
add!(self, "lookahead > ");
|
||||
self.add_character(range.end);
|
||||
self.add_character(end);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add_character_set(&mut self, info: &LargeCharacterSetInfo) {
|
||||
add_line!(
|
||||
self,
|
||||
"static TSCharacterRange {}_character_set_{}[] = {{",
|
||||
self.symbol_ids[&info.symbol],
|
||||
info.index
|
||||
);
|
||||
fn add_character_set(&mut self, ix: usize) {
|
||||
let (symbol, characters) = self.large_character_sets[ix].clone();
|
||||
let count = self.large_character_sets[0..ix]
|
||||
.iter()
|
||||
.filter(|(sym, _)| *sym == symbol)
|
||||
.count();
|
||||
|
||||
let constant_name = if let Some(symbol) = symbol {
|
||||
format!("{}_character_set_{}", self.symbol_ids[&symbol], count)
|
||||
} else {
|
||||
format!("extras_character_set_{}", count)
|
||||
};
|
||||
add_line!(self, "static TSCharacterRange {}[] = {{", constant_name);
|
||||
self.large_character_set_constant_names.push(constant_name);
|
||||
|
||||
indent!(self);
|
||||
for chunk in info.ranges.chunks(8) {
|
||||
add_whitespace!(self);
|
||||
for (i, range) in chunk.iter().enumerate() {
|
||||
if i > 0 {
|
||||
add!(self, " ");
|
||||
for (ix, range) in characters.ranges().enumerate() {
|
||||
let column = ix % 8;
|
||||
if column == 0 {
|
||||
if ix > 0 {
|
||||
add!(self, "\n");
|
||||
}
|
||||
add!(self, "{{");
|
||||
self.add_character(range.start);
|
||||
add!(self, ", ");
|
||||
self.add_character(range.end);
|
||||
add!(self, "}},");
|
||||
add_whitespace!(self);
|
||||
} else {
|
||||
add!(self, " ");
|
||||
}
|
||||
add!(self, "\n");
|
||||
add!(self, "{{");
|
||||
self.add_character(*range.start());
|
||||
add!(self, ", ");
|
||||
self.add_character(*range.end());
|
||||
add!(self, "}},");
|
||||
}
|
||||
add!(self, "\n");
|
||||
dedent!(self);
|
||||
add_line!(self, "}};");
|
||||
add_line!(self, "");
|
||||
|
|
@ -1610,10 +1548,12 @@ impl Generator {
|
|||
'\t' => add!(self, "'\\t'"),
|
||||
'\r' => add!(self, "'\\r'"),
|
||||
_ => {
|
||||
if c == ' ' || c.is_ascii_graphic() {
|
||||
if c == '\0' {
|
||||
add!(self, "0")
|
||||
} else if c == ' ' || c.is_ascii_graphic() {
|
||||
add!(self, "'{c}'");
|
||||
} else {
|
||||
add!(self, "{}", c as u32);
|
||||
add!(self, "0x{:02x}", c as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1641,10 +1581,7 @@ impl Generator {
|
|||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn render_c_code(
|
||||
name: &str,
|
||||
parse_table: ParseTable,
|
||||
main_lex_table: LexTable,
|
||||
keyword_lex_table: LexTable,
|
||||
keyword_capture_token: Option<Symbol>,
|
||||
tables: Tables,
|
||||
syntax_grammar: SyntaxGrammar,
|
||||
lexical_grammar: LexicalGrammar,
|
||||
default_aliases: AliasMap,
|
||||
|
|
@ -1660,10 +1597,12 @@ pub fn render_c_code(
|
|||
indent_level: 0,
|
||||
language_name: name.to_string(),
|
||||
large_state_count: 0,
|
||||
parse_table,
|
||||
main_lex_table,
|
||||
keyword_lex_table,
|
||||
keyword_capture_token,
|
||||
parse_table: tables.parse_table,
|
||||
main_lex_table: tables.main_lex_table,
|
||||
keyword_lex_table: tables.keyword_lex_table,
|
||||
keyword_capture_token: tables.word_token,
|
||||
large_character_sets: tables.large_character_sets,
|
||||
large_character_set_constant_names: Vec::new(),
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
default_aliases,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue