Add EOF actions to lex table
This commit is contained in:
parent
92d4fe419c
commit
82fda8929e
5 changed files with 96 additions and 37 deletions
|
|
@ -1,7 +1,8 @@
|
|||
use super::item::LookaheadSet;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
|
||||
use crate::nfa::NfaCursor;
|
||||
use crate::nfa::{CharacterSet, NfaCursor};
|
||||
use crate::rules::Symbol;
|
||||
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||
|
|
@ -23,7 +24,6 @@ pub(crate) fn build_lex_table(
|
|||
|
||||
let mut builder = LexTableBuilder::new(lexical_grammar);
|
||||
for (i, state) in parse_table.states.iter_mut().enumerate() {
|
||||
info!("populate lex state for parse state {}", i);
|
||||
let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| {
|
||||
if token.is_terminal() {
|
||||
if keywords.contains(&token) {
|
||||
|
|
@ -31,10 +31,13 @@ pub(crate) fn build_lex_table(
|
|||
} else {
|
||||
Some(*token)
|
||||
}
|
||||
} else if token.is_eof() {
|
||||
Some(*token)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}));
|
||||
info!("populate lex state for parse state {}", i);
|
||||
state.lex_state_id = builder.add_state_for_tokens(&tokens);
|
||||
}
|
||||
|
||||
|
|
@ -44,12 +47,18 @@ pub(crate) fn build_lex_table(
|
|||
(table, keyword_lex_table)
|
||||
}
|
||||
|
||||
struct QueueEntry {
|
||||
state_id: usize,
|
||||
nfa_states: Vec<u32>,
|
||||
eof_valid: bool,
|
||||
}
|
||||
|
||||
struct LexTableBuilder<'a> {
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
cursor: NfaCursor<'a>,
|
||||
table: LexTable,
|
||||
state_queue: VecDeque<(usize, Vec<u32>)>,
|
||||
state_ids_by_nfa_state_set: HashMap<Vec<u32>, usize>,
|
||||
state_queue: VecDeque<QueueEntry>,
|
||||
state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
|
||||
}
|
||||
|
||||
impl<'a> LexTableBuilder<'a> {
|
||||
|
|
@ -64,11 +73,19 @@ impl<'a> LexTableBuilder<'a> {
|
|||
}
|
||||
|
||||
fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize {
|
||||
let mut eof_valid = false;
|
||||
let nfa_states = tokens
|
||||
.iter()
|
||||
.map(|token| self.lexical_grammar.variables[token.index].start_state)
|
||||
.filter_map(|token| {
|
||||
if token.is_terminal() {
|
||||
Some(self.lexical_grammar.variables[token.index].start_state)
|
||||
} else {
|
||||
eof_valid = true;
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let (state_id, is_new) = self.add_state(nfa_states);
|
||||
let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
|
||||
|
||||
if is_new {
|
||||
info!(
|
||||
|
|
@ -81,32 +98,42 @@ impl<'a> LexTableBuilder<'a> {
|
|||
);
|
||||
}
|
||||
|
||||
while let Some((state_id, nfa_states)) = self.state_queue.pop_back() {
|
||||
self.populate_state(state_id, nfa_states);
|
||||
while let Some(QueueEntry {
|
||||
state_id,
|
||||
nfa_states,
|
||||
eof_valid,
|
||||
}) = self.state_queue.pop_front()
|
||||
{
|
||||
self.populate_state(state_id, nfa_states, eof_valid);
|
||||
}
|
||||
state_id
|
||||
}
|
||||
|
||||
fn add_state(&mut self, nfa_states: Vec<u32>) -> (usize, bool) {
|
||||
fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
|
||||
self.cursor.reset(nfa_states);
|
||||
match self
|
||||
.state_ids_by_nfa_state_set
|
||||
.entry(self.cursor.state_ids.clone())
|
||||
.entry((self.cursor.state_ids.clone(), eof_valid))
|
||||
{
|
||||
Entry::Occupied(o) => (*o.get(), false),
|
||||
Entry::Vacant(v) => {
|
||||
let state_id = self.table.states.len();
|
||||
self.table.states.push(LexState::default());
|
||||
self.state_queue.push_back((state_id, v.key().clone()));
|
||||
self.state_queue.push_back(QueueEntry {
|
||||
state_id,
|
||||
nfa_states: v.key().0.clone(),
|
||||
eof_valid,
|
||||
});
|
||||
v.insert(state_id);
|
||||
(state_id, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>) {
|
||||
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
|
||||
self.cursor.force_reset(nfa_states);
|
||||
|
||||
// The EOF state is represented as an empty list of NFA states.
|
||||
let mut completion = None;
|
||||
for (id, prec) in self.cursor.completions() {
|
||||
if let Some((prev_id, prev_precedence)) = completion {
|
||||
|
|
@ -121,7 +148,24 @@ impl<'a> LexTableBuilder<'a> {
|
|||
completion = Some((id, prec));
|
||||
}
|
||||
|
||||
for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() {
|
||||
info!("raw successors: {:?}", self.cursor.successors().collect::<Vec<_>>());
|
||||
let successors = self.cursor.grouped_successors();
|
||||
|
||||
// If EOF is a valid lookahead token, add a transition predicated on the null
|
||||
// character that leads to the empty set of NFA states.
|
||||
if eof_valid {
|
||||
let (next_state_id, _) = self.add_state(Vec::new(), false);
|
||||
info!("populate state: {}, character: EOF", state_id);
|
||||
self.table.states[state_id].advance_actions.push((
|
||||
CharacterSet::empty().add_char('\0'),
|
||||
AdvanceAction {
|
||||
state: next_state_id,
|
||||
in_main_token: true,
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
for (chars, advance_precedence, next_states, is_sep) in successors {
|
||||
info!(
|
||||
"populate state: {}, characters: {:?}, precedence: {:?}",
|
||||
state_id, chars, advance_precedence
|
||||
|
|
@ -131,7 +175,7 @@ impl<'a> LexTableBuilder<'a> {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
let (next_state_id, _) = self.add_state(next_states);
|
||||
let (next_state_id, _) = self.add_state(next_states, eof_valid && is_sep);
|
||||
self.table.states[state_id].advance_actions.push((
|
||||
chars,
|
||||
AdvanceAction {
|
||||
|
|
@ -141,8 +185,10 @@ impl<'a> LexTableBuilder<'a> {
|
|||
));
|
||||
}
|
||||
|
||||
if let Some((completion_index, _)) = completion {
|
||||
self.table.states[state_id].accept_action = Some(completion_index);
|
||||
if let Some((complete_id, _)) = completion {
|
||||
self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
|
||||
} else if self.cursor.state_ids.is_empty() {
|
||||
self.table.states[state_id].accept_action = Some(Symbol::end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -179,11 +225,20 @@ fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
|
|||
}
|
||||
}
|
||||
|
||||
let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| {
|
||||
let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id);
|
||||
let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count();
|
||||
replacement - prior_removed
|
||||
}).collect::<Vec<_>>();
|
||||
let final_state_replacements = (0..table.states.len())
|
||||
.into_iter()
|
||||
.map(|state_id| {
|
||||
let replacement = state_replacements
|
||||
.get(&state_id)
|
||||
.cloned()
|
||||
.unwrap_or(state_id);
|
||||
let prior_removed = state_replacements
|
||||
.iter()
|
||||
.take_while(|i| *i.0 < replacement)
|
||||
.count();
|
||||
replacement - prior_removed
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for state in parse_table.states.iter_mut() {
|
||||
state.lex_state_id = final_state_replacements[state.lex_state_id];
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
use crate::grammars::LexicalGrammar;
|
||||
use crate::rules::Symbol;
|
||||
use crate::tables::{ParseStateId, ParseTable};
|
||||
use std::collections::HashSet;
|
||||
|
||||
pub(crate) struct CoincidentTokenIndex {
|
||||
entries: Vec<HashSet<ParseStateId>>,
|
||||
entries: Vec<Vec<ParseStateId>>,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
|
|
@ -13,20 +12,22 @@ impl CoincidentTokenIndex {
|
|||
let n = lexical_grammar.variables.len();
|
||||
let mut result = Self {
|
||||
n,
|
||||
entries: vec![HashSet::new(); n * n],
|
||||
entries: vec![Vec::new(); n * n],
|
||||
};
|
||||
for (i, state) in table.states.iter().enumerate() {
|
||||
for symbol in state.terminal_entries.keys() {
|
||||
for other_symbol in state.terminal_entries.keys() {
|
||||
let index = result.index(*symbol, *other_symbol);
|
||||
result.entries[index].insert(i);
|
||||
if result.entries[index].last().cloned() != Some(i) {
|
||||
result.entries[index].push(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet<ParseStateId> {
|
||||
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
|
||||
&self.entries[self.index(a, b)]
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ impl Generator {
|
|||
.symbols
|
||||
.iter()
|
||||
.filter(|symbol| {
|
||||
if symbol.is_terminal() {
|
||||
if symbol.is_terminal() || symbol.is_eof() {
|
||||
true
|
||||
} else if symbol.is_external() {
|
||||
self.syntax_grammar.external_tokens[symbol.index]
|
||||
|
|
@ -359,7 +359,7 @@ impl Generator {
|
|||
add_line!(
|
||||
self,
|
||||
"ACCEPT_TOKEN({})",
|
||||
self.symbol_ids[&Symbol::terminal(accept_action)]
|
||||
self.symbol_ids[&accept_action]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -462,18 +462,16 @@ impl Generator {
|
|||
let mut prev_range: Option<Range<char>> = None;
|
||||
chars
|
||||
.iter()
|
||||
.cloned()
|
||||
.chain(Some('\0'))
|
||||
.filter_map(move |c| {
|
||||
.map(|c| (*c, false))
|
||||
.chain(Some(('\0', true)))
|
||||
.filter_map(move |(c, done)| {
|
||||
if done {
|
||||
return prev_range.clone();
|
||||
}
|
||||
if ruled_out_characters.contains(&(c as u32)) {
|
||||
return None;
|
||||
}
|
||||
if let Some(range) = prev_range.clone() {
|
||||
if c == '\0' {
|
||||
prev_range = Some(c..c);
|
||||
return Some(range);
|
||||
}
|
||||
|
||||
let mut prev_range_successor = range.end as u32 + 1;
|
||||
while prev_range_successor < c as u32 {
|
||||
if !ruled_out_characters.contains(&prev_range_successor) {
|
||||
|
|
@ -948,6 +946,7 @@ impl Generator {
|
|||
fn add_character(&mut self, c: char) {
|
||||
if c.is_ascii() {
|
||||
match c {
|
||||
'\0' => add!(self, "'\\0'"),
|
||||
'\'' => add!(self, "'\\''"),
|
||||
'\\' => add!(self, "'\\\\'"),
|
||||
'\t' => add!(self, "'\\t'"),
|
||||
|
|
|
|||
|
|
@ -162,6 +162,10 @@ impl Symbol {
|
|||
self.kind == SymbolType::External
|
||||
}
|
||||
|
||||
pub fn is_eof(&self) -> bool {
|
||||
self.kind == SymbolType::End
|
||||
}
|
||||
|
||||
pub fn non_terminal(index: usize) -> Self {
|
||||
Symbol {
|
||||
kind: SymbolType::NonTerminal,
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ pub(crate) struct AdvanceAction {
|
|||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub(crate) struct LexState {
|
||||
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
|
||||
pub accept_action: Option<usize>,
|
||||
pub accept_action: Option<Symbol>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue