From 82fda8929e0019f6ba676f659677e84000ae1632 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 3 Jan 2019 10:31:14 -0800 Subject: [PATCH] Add EOF actions to lex table --- src/build_tables/build_lex_table.rs | 97 +++++++++++++++++++++------ src/build_tables/coincident_tokens.rs | 11 +-- src/render/mod.rs | 19 +++--- src/rules.rs | 4 ++ src/tables.rs | 2 +- 5 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index c002f427..66a4fe43 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -1,7 +1,8 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::nfa::NfaCursor; +use crate::nfa::{CharacterSet, NfaCursor}; +use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, VecDeque}; @@ -23,7 +24,6 @@ pub(crate) fn build_lex_table( let mut builder = LexTableBuilder::new(lexical_grammar); for (i, state) in parse_table.states.iter_mut().enumerate() { - info!("populate lex state for parse state {}", i); let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { @@ -31,10 +31,13 @@ pub(crate) fn build_lex_table( } else { Some(*token) } + } else if token.is_eof() { + Some(*token) } else { None } })); + info!("populate lex state for parse state {}", i); state.lex_state_id = builder.add_state_for_tokens(&tokens); } @@ -44,12 +47,18 @@ pub(crate) fn build_lex_table( (table, keyword_lex_table) } +struct QueueEntry { + state_id: usize, + nfa_states: Vec, + eof_valid: bool, +} + struct LexTableBuilder<'a> { lexical_grammar: &'a LexicalGrammar, cursor: NfaCursor<'a>, table: LexTable, - state_queue: VecDeque<(usize, Vec)>, - state_ids_by_nfa_state_set: HashMap, usize>, + state_queue: VecDeque, + state_ids_by_nfa_state_set: HashMap<(Vec, bool), usize>, } impl<'a> LexTableBuilder<'a> { @@ -64,11 +73,19 @@ impl<'a> LexTableBuilder<'a> { } fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { + let mut eof_valid = false; let nfa_states = tokens .iter() - .map(|token| self.lexical_grammar.variables[token.index].start_state) + .filter_map(|token| { + if token.is_terminal() { + Some(self.lexical_grammar.variables[token.index].start_state) + } else { + eof_valid = true; + None + } + }) .collect(); - let (state_id, is_new) = self.add_state(nfa_states); + let (state_id, is_new) = self.add_state(nfa_states, eof_valid); if is_new { info!( @@ -81,32 +98,42 @@ impl<'a> LexTableBuilder<'a> { ); } - while let Some((state_id, nfa_states)) = self.state_queue.pop_back() { - self.populate_state(state_id, nfa_states); + while let Some(QueueEntry { + state_id, + nfa_states, + eof_valid, + }) = self.state_queue.pop_front() + { + self.populate_state(state_id, nfa_states, eof_valid); } state_id } - fn add_state(&mut self, nfa_states: Vec) -> (usize, bool) { + fn add_state(&mut self, nfa_states: Vec, eof_valid: bool) -> (usize, bool) { self.cursor.reset(nfa_states); match self .state_ids_by_nfa_state_set - .entry(self.cursor.state_ids.clone()) + .entry((self.cursor.state_ids.clone(), eof_valid)) { Entry::Occupied(o) => (*o.get(), false), Entry::Vacant(v) => { let state_id = self.table.states.len(); self.table.states.push(LexState::default()); - self.state_queue.push_back((state_id, v.key().clone())); + self.state_queue.push_back(QueueEntry { + state_id, + nfa_states: v.key().0.clone(), + eof_valid, + }); v.insert(state_id); (state_id, true) } } } - fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { + fn populate_state(&mut self, state_id: usize, nfa_states: Vec, eof_valid: bool) { self.cursor.force_reset(nfa_states); + // The EOF state is represented as an empty list of NFA states. let mut completion = None; for (id, prec) in self.cursor.completions() { if let Some((prev_id, prev_precedence)) = completion { @@ -121,7 +148,24 @@ impl<'a> LexTableBuilder<'a> { completion = Some((id, prec)); } - for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + info!("raw successors: {:?}", self.cursor.successors().collect::>()); + let successors = self.cursor.grouped_successors(); + + // If EOF is a valid lookahead token, add a transition predicated on the null + // character that leads to the empty set of NFA states. + if eof_valid { + let (next_state_id, _) = self.add_state(Vec::new(), false); + info!("populate state: {}, character: EOF", state_id); + self.table.states[state_id].advance_actions.push(( + CharacterSet::empty().add_char('\0'), + AdvanceAction { + state: next_state_id, + in_main_token: true, + }, + )); + } + + for (chars, advance_precedence, next_states, is_sep) in successors { info!( "populate state: {}, characters: {:?}, precedence: {:?}", state_id, chars, advance_precedence @@ -131,7 +175,7 @@ impl<'a> LexTableBuilder<'a> { continue; } } - let (next_state_id, _) = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -141,8 +185,10 @@ impl<'a> LexTableBuilder<'a> { )); } - if let Some((completion_index, _)) = completion { - self.table.states[state_id].accept_action = Some(completion_index); + if let Some((complete_id, _)) = completion { + self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id)); + } else if self.cursor.state_ids.is_empty() { + self.table.states[state_id].accept_action = Some(Symbol::end()); } } } @@ -179,11 +225,20 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { } } - let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { - let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); - let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count(); - replacement - prior_removed - }).collect::>(); + let final_state_replacements = (0..table.states.len()) + .into_iter() + .map(|state_id| { + let replacement = state_replacements + .get(&state_id) + .cloned() + .unwrap_or(state_id); + let prior_removed = state_replacements + .iter() + .take_while(|i| *i.0 < replacement) + .count(); + replacement - prior_removed + }) + .collect::>(); for state in parse_table.states.iter_mut() { state.lex_state_id = final_state_replacements[state.lex_state_id]; diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 5f2bb3ec..ac5931e1 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,10 +1,9 @@ use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: Vec>, + entries: Vec>, n: usize, } @@ -13,20 +12,22 @@ impl CoincidentTokenIndex { let n = lexical_grammar.variables.len(); let mut result = Self { n, - entries: vec![HashSet::new(); n * n], + entries: vec![Vec::new(); n * n], }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { let index = result.index(*symbol, *other_symbol); - result.entries[index].insert(i); + if result.entries[index].last().cloned() != Some(i) { + result.entries[index].push(i); + } } } } result } - pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { + pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec { &self.entries[self.index(a, b)] } diff --git a/src/render/mod.rs b/src/render/mod.rs index 250218c1..624fa1e0 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -125,7 +125,7 @@ impl Generator { .symbols .iter() .filter(|symbol| { - if symbol.is_terminal() { + if symbol.is_terminal() || symbol.is_eof() { true } else if symbol.is_external() { self.syntax_grammar.external_tokens[symbol.index] @@ -359,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&Symbol::terminal(accept_action)] + self.symbol_ids[&accept_action] ); } @@ -462,18 +462,16 @@ impl Generator { let mut prev_range: Option> = None; chars .iter() - .cloned() - .chain(Some('\0')) - .filter_map(move |c| { + .map(|c| (*c, false)) + .chain(Some(('\0', true))) + .filter_map(move |(c, done)| { + if done { + return prev_range.clone(); + } if ruled_out_characters.contains(&(c as u32)) { return None; } if let Some(range) = prev_range.clone() { - if c == '\0' { - prev_range = Some(c..c); - return Some(range); - } - let mut prev_range_successor = range.end as u32 + 1; while prev_range_successor < c as u32 { if !ruled_out_characters.contains(&prev_range_successor) { @@ -948,6 +946,7 @@ impl Generator { fn add_character(&mut self, c: char) { if c.is_ascii() { match c { + '\0' => add!(self, "'\\0'"), '\'' => add!(self, "'\\''"), '\\' => add!(self, "'\\\\'"), '\t' => add!(self, "'\\t'"), diff --git a/src/rules.rs b/src/rules.rs index ad16c632..bd0340fc 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -162,6 +162,10 @@ impl Symbol { self.kind == SymbolType::External } + pub fn is_eof(&self) -> bool { + self.kind == SymbolType::End + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, diff --git a/src/tables.rs b/src/tables.rs index 21222135..f400d25c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -55,7 +55,7 @@ pub(crate) struct AdvanceAction { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct LexState { pub advance_actions: Vec<(CharacterSet, AdvanceAction)>, - pub accept_action: Option, + pub accept_action: Option, } #[derive(Debug, PartialEq, Eq)]