Fix computation of following tokens
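
Previously, following tokens were accumulated in ParseTableBuilder::add_parse_state from pairs of adjacent preceding symbols, and ParseItemSetBuilder::last_set mistakenly returned an entry from first_sets. Compute the following tokens up front instead, in a standalone populate_following_tokens function that walks every production (including inlined productions) and, for each pair of adjacent steps, records the terminals that can begin the right step as followers of the terminals that can end the left step.

Also:
* fix last_set to read from last_sets
* split TokenSet::insert_all into insert_all_terminals and insert_all_externals, so that only terminal followers are recorded
* store the following tokens on TokenConflictMap and include them, along with the first/last sets, in the Debug output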

Max Brunsfeld 2019-01-18 15:13:13 -08:00
parent 31bdf5eb97
commit ff41f05a20
5 changed files with 125 additions and 36 deletions

View file

@@ -41,12 +41,11 @@ struct ParseTableBuilder<'a> {
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
parse_table: ParseTable,
following_tokens: Vec<TokenSet>,
state_ids_to_log: Vec<ParseStateId>,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<(ParseTable, Vec<TokenSet>)> {
fn build(mut self) -> Result<ParseTable> {
// Ensure that the empty alias sequence has index 0.
self.parse_table.alias_sequences.push(Vec::new());
@@ -99,7 +98,7 @@ impl<'a> ParseTableBuilder<'a> {
self.remove_precedences();
Ok((self.parse_table, self.following_tokens))
Ok(self.parse_table)
}
fn add_parse_state(
@@ -108,20 +107,6 @@ impl<'a> ParseTableBuilder<'a> {
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
if preceding_symbols.len() > 1 {
let left_tokens = self
.item_set_builder
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
let right_tokens = self
.item_set_builder
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
self.following_tokens[left_token.index].insert_all(right_tokens);
}
}
}
let mut hasher = DefaultHasher::new();
item_set.hash_unfinished_items(&mut hasher);
let unfinished_item_signature = hasher.finish();
@@ -705,17 +690,50 @@ impl<'a> ParseTableBuilder<'a> {
}
}
fn populate_following_tokens(
result: &mut Vec<TokenSet>,
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
}
pub(crate) fn build_parse_table(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, Vec<TokenSet>)> {
ParseTableBuilder {
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let table = ParseTableBuilder {
syntax_grammar,
lexical_grammar,
state_ids_to_log,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
item_set_builder,
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_queue: VecDeque::new(),
@@ -725,7 +743,8 @@ pub(crate) fn build_parse_table(
alias_sequences: Vec::new(),
max_aliased_production_length: 0,
},
following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()],
}
.build()
.build()?;
Ok((table, following_tokens))
}
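
For intuition, here is a minimal standalone sketch of the relation that populate_following_tokens computes, using hypothetical simplified types (string symbols and HashSets in place of the real TokenSet bit-vectors and the ParseItemSetBuilder first/last sets):

    use std::collections::{HashMap, HashSet};

    // Hypothetical stand-ins: a production is a sequence of symbol names, and
    // `first` / `last` map each symbol to the terminals that can begin / end it.
    fn following_tokens(
        productions: &[Vec<&str>],
        first: &HashMap<&str, HashSet<&str>>,
        last: &HashMap<&str, HashSet<&str>>,
        terminals: &HashSet<&str>,
    ) -> HashMap<String, HashSet<String>> {
        let mut result: HashMap<String, HashSet<String>> = HashMap::new();
        for production in productions {
            // For each adjacent pair of steps, every terminal that can end the
            // left step may be directly followed by every terminal that can
            // begin the right step.
            for pair in production.windows(2) {
                for left_token in &last[pair[0]] {
                    if terminals.contains(left_token) {
                        result
                            .entry(left_token.to_string())
                            .or_default()
                            .extend(first[pair[1]].iter().map(|t| t.to_string()));
                    }
                }
            }
        }
        result
    }

The real code performs the same walk over grammar.variables chained with inlines.productions, restricting the left side to terminals via insert_all_terminals.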

View file

@@ -48,7 +48,11 @@ pub(crate) struct ParseItemDisplay<'a>(
pub &'a LexicalGrammar,
);
pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar);
pub(crate) struct TokenSetDisplay<'a>(
pub &'a TokenSet,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
#[allow(dead_code)]
pub(crate) struct ParseItemSetDisplay<'a>(
@@ -134,30 +138,42 @@ impl TokenSet {
vec.set(other.index, true);
}
pub fn insert_all(&mut self, other: &TokenSet) -> bool {
pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() {
self.terminal_bits.resize(other.terminal_bits.len(), false);
}
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.terminal_bits.iter().enumerate() {
if element {
result |= !self.terminal_bits[i];
self.terminal_bits.set(i, element);
}
}
result
}
fn insert_all_externals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.external_bits.iter().enumerate() {
if element {
result |= !self.external_bits[i];
self.external_bits.set(i, element);
}
}
result
}
pub fn insert_all(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.eof {
result |= !self.eof;
self.eof = true;
}
result |= self.insert_all_terminals(other);
result |= self.insert_all_externals(other);
result
}
}
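
The new helpers above share one bit-vector union pattern. A minimal sketch of it, with a plain Vec<bool> standing in for the bit-vector type that TokenSet actually uses:

    // Union `src` into `dest`, growing `dest` as needed, and report whether
    // any bit was newly set.
    fn union_into(dest: &mut Vec<bool>, src: &[bool]) -> bool {
        let mut result = false;
        if src.len() > dest.len() {
            dest.resize(src.len(), false);
        }
        for (i, &bit) in src.iter().enumerate() {
            if bit {
                result |= !dest[i]; // true only if this bit was not already set
                dest[i] = true;
            }
        }
        result
    }

Returning whether anything changed lets callers that repeatedly merge token sets stop once a fixed point is reached, and splitting the method means populate_following_tokens can merge only the terminal half of a set.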

View file

@@ -1,6 +1,6 @@
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet};
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet, TokenSetDisplay};
use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::Symbol;
use crate::generate::rules::{Symbol, SymbolType};
use hashbrown::{HashMap, HashSet};
use std::fmt;
@@ -268,7 +268,7 @@ impl<'a> ParseItemSetBuilder<'a> {
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.first_sets[symbol]
&self.last_sets[symbol]
}
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
@@ -300,6 +300,40 @@ impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "ParseItemSetBuilder {{\n")?;
write!(f, " first_sets: {{\n")?;
for (symbol, first_set) in &self.first_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End => "END",
};
write!(
f,
" first({:?}): {}\n",
name,
TokenSetDisplay(first_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " last_sets: {{\n")?;
for (symbol, last_set) in &self.last_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End => "END",
};
write!(
f,
" last({:?}): {}\n",
name,
TokenSetDisplay(last_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " additions: {{\n")?;
for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
write!(f, " {}: {{\n", variable.name)?;
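
Note the one-line fix to last_set above: it previously returned an entry from first_sets. A toy illustration of why the two sets differ (hypothetical sets, not output of this code):

    // Given a rule like:  expr → '(' expr ')'  |  NUMBER
    //
    //   first(expr) = { '(', NUMBER }   terminals that can begin an expr
    //   last(expr)  = { ')', NUMBER }   terminals that can end an expr
    //
    // Reading last(expr) out of first_sets would record the tokens that follow
    // '(' where the tokens that follow ')' belong, corrupting the follow sets
    // that populate_following_tokens builds.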

View file

@@ -1,5 +1,5 @@
use crate::generate::build_tables::item::TokenSet;
use crate::generate::grammars::LexicalGrammar;
use crate::generate::build_tables::item::{TokenSet, TokenSetDisplay};
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use hashbrown::HashSet;
use std::cmp::Ordering;
@@ -16,6 +16,7 @@ struct TokenConflictStatus {
pub(crate) struct TokenConflictMap<'a> {
n: usize,
status_matrix: Vec<TokenConflictStatus>,
following_tokens: Vec<TokenSet>,
starting_chars_by_index: Vec<CharacterSet>,
following_chars_by_index: Vec<CharacterSet>,
grammar: &'a LexicalGrammar,
@@ -25,7 +26,7 @@ impl<'a> TokenConflictMap<'a> {
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
let following_chars = get_following_chars(&starting_chars, following_tokens);
let following_chars = get_following_chars(&starting_chars, &following_tokens);
let n = grammar.variables.len();
let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
@@ -40,6 +41,7 @@ impl<'a> TokenConflictMap<'a> {
TokenConflictMap {
n,
status_matrix,
following_tokens,
starting_chars_by_index: starting_chars,
following_chars_by_index: following_chars,
grammar,
@@ -115,9 +117,27 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TokenConflictMap {{\n")?;
let syntax_grammar = SyntaxGrammar::default();
write!(f, " following_tokens: {{\n")?;
for (i, following_tokens) in self.following_tokens.iter().enumerate() {
write!(
f,
" follow({:?}): {},\n",
self.grammar.variables[i].name,
TokenSetDisplay(following_tokens, &syntax_grammar, &self.grammar)
)?;
}
write!(f, " }},\n")?;
write!(f, " starting_characters: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?;
write!(
f,
" {:?}: {:?},\n",
self.grammar.variables[i].name,
self.starting_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
@@ -169,10 +189,10 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
fn get_following_chars(
starting_chars: &Vec<CharacterSet>,
following_tokens: Vec<TokenSet>,
following_tokens: &Vec<TokenSet>,
) -> Vec<CharacterSet> {
following_tokens
.into_iter()
.iter()
.map(|following_tokens| {
let mut chars = CharacterSet::empty();
for token in following_tokens.iter() {
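
get_following_chars now borrows the token sets, since TokenConflictMap keeps ownership of following_tokens for its new Debug output. The computation it performs, sketched with hypothetical simplified types (token indices and char vectors in place of TokenSet and CharacterSet):

    // For each token, the characters that may appear immediately after it are
    // the starting characters of every token that can follow it.
    fn following_chars(
        starting_chars: &[Vec<char>],     // per-token possible first characters
        following_tokens: &[Vec<usize>],  // per-token follow set, as token indices
    ) -> Vec<Vec<char>> {
        following_tokens
            .iter()
            .map(|tokens| {
                let mut chars: Vec<char> = tokens
                    .iter()
                    .flat_map(|&t| starting_chars[t].iter().copied())
                    .collect();
                chars.sort_unstable();
                chars.dedup();
                chars
            })
            .collect()
    }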

View file

@@ -81,7 +81,7 @@ pub(crate) struct ExternalToken {
pub corresponding_internal_token: Option<Symbol>,
}
#[derive(Debug)]
#[derive(Debug, Default)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_tokens: Vec<Symbol>,
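
The new Default derive exists so that TokenConflictMap's Debug impl above can pass SyntaxGrammar::default() to TokenSetDisplay, which takes a SyntaxGrammar even though the follow sets printed there contain only terminal tokens.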