Fix computation of following tokens
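
Previously, following tokens were accumulated in ParseTableBuilder::add_parse_state from pairs of adjacent preceding symbols, and ParseItemSetBuilder::last_set mistakenly returned an entry from first_sets. Compute the following tokens up front instead, in a standalone populate_following_tokens function that walks every production (including inlined productions) and, for each pair of adjacent steps, records the terminals that can begin the right step as followers of the terminals that can end the left step.

Also:
* fix last_set to read from last_sets
* split TokenSet::insert_all into insert_all_terminals and insert_all_externals, so that only terminal followers are recorded
* store the following tokens on TokenConflictMap and include them, along with the first/last sets, in the Debug output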

Max Brunsfeld 2019-01-18 15:13:13 -08:00
parent 31bdf5eb97
commit ff41f05a20
5 changed files with 125 additions and 36 deletions

View file

@@ -41,12 +41,11 @@ struct ParseTableBuilder<'a> {
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
parse_table: ParseTable,
following_tokens: Vec<TokenSet>,
state_ids_to_log: Vec<ParseStateId>,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<(ParseTable, Vec<TokenSet>)> {
fn build(mut self) -> Result<ParseTable> {
// Ensure that the empty alias sequence has index 0.
self.parse_table.alias_sequences.push(Vec::new());
@@ -99,7 +98,7 @@ impl<'a> ParseTableBuilder<'a> {
self.remove_precedences();
Ok((self.parse_table, self.following_tokens))
Ok(self.parse_table)
}
fn add_parse_state(
@@ -108,20 +107,6 @@ impl<'a> ParseTableBuilder<'a> {
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
if preceding_symbols.len() > 1 {
let left_tokens = self
.item_set_builder
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
let right_tokens = self
.item_set_builder
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
self.following_tokens[left_token.index].insert_all(right_tokens);
}
}
}
let mut hasher = DefaultHasher::new();
item_set.hash_unfinished_items(&mut hasher);
let unfinished_item_signature = hasher.finish();
@@ -705,17 +690,50 @@ impl<'a> ParseTableBuilder<'a> {
}
}
fn populate_following_tokens(
result: &mut Vec<TokenSet>,
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
}
pub(crate) fn build_parse_table(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
state_ids_to_log: Vec<usize>,
) -> Result<(ParseTable, Vec<TokenSet>)> {
ParseTableBuilder {
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let table = ParseTableBuilder {
syntax_grammar,
lexical_grammar,
state_ids_to_log,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
item_set_builder,
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_queue: VecDeque::new(),
@@ -725,7 +743,8 @@ pub(crate) fn build_parse_table(
alias_sequences: Vec::new(),
max_aliased_production_length: 0,
},
following_tokens: vec![TokenSet::new(); lexical_grammar.variables.len()],
}
.build()
.build()?;
Ok((table, following_tokens))
}
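
For intuition, here is a minimal standalone sketch of the relation that populate_following_tokens computes, using hypothetical simplified types (string symbols and HashSets in place of the real TokenSet bit-vectors and the ParseItemSetBuilder first/last sets):

    use std::collections::{HashMap, HashSet};

    // Hypothetical stand-ins: a production is a sequence of symbol names, and
    // `first` / `last` map each symbol to the terminals that can begin / end it.
    fn following_tokens(
        productions: &[Vec<&str>],
        first: &HashMap<&str, HashSet<&str>>,
        last: &HashMap<&str, HashSet<&str>>,
        terminals: &HashSet<&str>,
    ) -> HashMap<String, HashSet<String>> {
        let mut result: HashMap<String, HashSet<String>> = HashMap::new();
        for production in productions {
            // For each adjacent pair of steps, every terminal that can end the
            // left step may be directly followed by every terminal that can
            // begin the right step.
            for pair in production.windows(2) {
                for left_token in &last[pair[0]] {
                    if terminals.contains(left_token) {
                        result
                            .entry(left_token.to_string())
                            .or_default()
                            .extend(first[pair[1]].iter().map(|t| t.to_string()));
                    }
                }
            }
        }
        result
    }

The real code performs the same walk over grammar.variables chained with inlines.productions, restricting the left side to terminals via insert_all_terminals.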

View file

@@ -48,7 +48,11 @@ pub(crate) struct ParseItemDisplay<'a>(
pub &'a LexicalGrammar,
);
pub(crate) struct TokenSetDisplay<'a>(&'a TokenSet, &'a SyntaxGrammar, &'a LexicalGrammar);
pub(crate) struct TokenSetDisplay<'a>(
pub &'a TokenSet,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
#[allow(dead_code)]
pub(crate) struct ParseItemSetDisplay<'a>(
@@ -134,30 +138,42 @@ impl TokenSet {
vec.set(other.index, true);
}
pub fn insert_all(&mut self, other: &TokenSet) -> bool {
pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() {
self.terminal_bits.resize(other.terminal_bits.len(), false);
}
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.terminal_bits.iter().enumerate() {
if element {
result |= !self.terminal_bits[i];
self.terminal_bits.set(i, element);
}
}
result
}
fn insert_all_externals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.external_bits.iter().enumerate() {
if element {
result |= !self.external_bits[i];
self.external_bits.set(i, element);
}
}
result
}
pub fn insert_all(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.eof {
result |= !self.eof;
self.eof = true;
}
result |= self.insert_all_terminals(other);
result |= self.insert_all_externals(other);
result
}
}
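
The new helpers above share one bit-vector union pattern. A minimal sketch of it, with a plain Vec<bool> standing in for the bit-vector type that TokenSet actually uses:

    // Union `src` into `dest`, growing `dest` as needed, and report whether
    // any bit was newly set.
    fn union_into(dest: &mut Vec<bool>, src: &[bool]) -> bool {
        let mut result = false;
        if src.len() > dest.len() {
            dest.resize(src.len(), false);
        }
        for (i, &bit) in src.iter().enumerate() {
            if bit {
                result |= !dest[i]; // true only if this bit was not already set
                dest[i] = true;
            }
        }
        result
    }

Returning whether anything changed lets callers that repeatedly merge token sets stop once a fixed point is reached, and splitting the method means populate_following_tokens can merge only the terminal half of a set.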

View file

@@ -1,6 +1,6 @@
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet};
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSet, TokenSetDisplay};
use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::Symbol;
use crate::generate::rules::{Symbol, SymbolType};
use hashbrown::{HashMap, HashSet};
use std::fmt;
@@ -268,7 +268,7 @@ impl<'a> ParseItemSetBuilder<'a> {
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.first_sets[symbol]
&self.last_sets[symbol]
}
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
@@ -300,6 +300,40 @@ impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "ParseItemSetBuilder {{\n")?;
write!(f, " first_sets: {{\n")?;
for (symbol, first_set) in &self.first_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End => "END",
};
write!(
f,
" first({:?}): {}\n",
name,
TokenSetDisplay(first_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " last_sets: {{\n")?;
for (symbol, last_set) in &self.last_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End => "END",
};
write!(
f,
" last({:?}): {}\n",
name,
TokenSetDisplay(last_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " additions: {{\n")?;
for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
write!(f, " {}: {{\n", variable.name)?;
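
Note the one-line fix to last_set above: it previously returned an entry from first_sets. A toy illustration of why the two sets differ (hypothetical sets, not output of this code):

    // Given a rule like:  expr → '(' expr ')'  |  NUMBER
    //
    //   first(expr) = { '(', NUMBER }   terminals that can begin an expr
    //   last(expr)  = { ')', NUMBER }   terminals that can end an expr
    //
    // Reading last(expr) out of first_sets would record the tokens that follow
    // '(' where the tokens that follow ')' belong, corrupting the follow sets
    // that populate_following_tokens builds.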

View file

@@ -1,5 +1,5 @@
use crate::generate::build_tables::item::TokenSet;
use crate::generate::grammars::LexicalGrammar;
use crate::generate::build_tables::item::{TokenSet, TokenSetDisplay};
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use hashbrown::HashSet;
use std::cmp::Ordering;
@@ -16,6 +16,7 @@ struct TokenConflictStatus {
pub(crate) struct TokenConflictMap<'a> {
n: usize,
status_matrix: Vec<TokenConflictStatus>,
following_tokens: Vec<TokenSet>,
starting_chars_by_index: Vec<CharacterSet>,
following_chars_by_index: Vec<CharacterSet>,
grammar: &'a LexicalGrammar,
@@ -25,7 +26,7 @@ impl<'a> TokenConflictMap<'a> {
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
let following_chars = get_following_chars(&starting_chars, following_tokens);
let following_chars = get_following_chars(&starting_chars, &following_tokens);
let n = grammar.variables.len();
let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
@@ -40,6 +41,7 @@ impl<'a> TokenConflictMap<'a> {
TokenConflictMap {
n,
status_matrix,
following_tokens,
starting_chars_by_index: starting_chars,
following_chars_by_index: following_chars,
grammar,
@@ -115,9 +117,27 @@ impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TokenConflictMap {{\n")?;
let syntax_grammar = SyntaxGrammar::default();
write!(f, " following_tokens: {{\n")?;
for (i, following_tokens) in self.following_tokens.iter().enumerate() {
write!(
f,
" follow({:?}): {},\n",
self.grammar.variables[i].name,
TokenSetDisplay(following_tokens, &syntax_grammar, &self.grammar)
)?;
}
write!(f, " }},\n")?;
write!(f, " starting_characters: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {:?},\n", i, self.starting_chars_by_index[i])?;
write!(
f,
" {:?}: {:?},\n",
self.grammar.variables[i].name,
self.starting_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
@@ -169,10 +189,10 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
fn get_following_chars(
starting_chars: &Vec<CharacterSet>,
following_tokens: Vec<TokenSet>,
following_tokens: &Vec<TokenSet>,
) -> Vec<CharacterSet> {
following_tokens
.into_iter()
.iter()
.map(|following_tokens| {
let mut chars = CharacterSet::empty();
for token in following_tokens.iter() {
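
get_following_chars now borrows the token sets, since TokenConflictMap keeps ownership of following_tokens for its new Debug output. The computation it performs, sketched with hypothetical simplified types (token indices and char vectors in place of TokenSet and CharacterSet):

    // For each token, the characters that may appear immediately after it are
    // the starting characters of every token that can follow it.
    fn following_chars(
        starting_chars: &[Vec<char>],     // per-token possible first characters
        following_tokens: &[Vec<usize>],  // per-token follow set, as token indices
    ) -> Vec<Vec<char>> {
        following_tokens
            .iter()
            .map(|tokens| {
                let mut chars: Vec<char> = tokens
                    .iter()
                    .flat_map(|&t| starting_chars[t].iter().copied())
                    .collect();
                chars.sort_unstable();
                chars.dedup();
                chars
            })
            .collect()
    }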

View file

@@ -81,7 +81,7 @@ pub(crate) struct ExternalToken {
pub corresponding_internal_token: Option<Symbol>,
}
#[derive(Debug)]
#[derive(Debug, Default)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_tokens: Vec<Symbol>,
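
The new Default derive exists so that TokenConflictMap's Debug impl above can pass SyntaxGrammar::default() to TokenSetDisplay, which takes a SyntaxGrammar even though the follow sets printed there contain only terminal tokens.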