Implement parse state merging

Max Brunsfeld 2019-01-01 13:47:29 -08:00
parent c6b9e97c58
commit a46b8fcb46
9 changed files with 364 additions and 40 deletions
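Parse states now record a 64-bit hash of their unfinished items when they are created; shrink_parse_table uses that signature to bucket candidate states and merge the compatible ones. The error state is populated with Recover entries for tokens that cannot be lexically confused with their neighbors, and negated character classes are handled correctly in the NFA builder. In outline (a sketch of the new build_tables body, assembled from the mod.rs hunk below, error handling elided):

let (mut parse_table, following_tokens) =
    build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
populate_error_state(&mut parse_table, syntax_grammar, lexical_grammar,
                     &coincident_token_index, &token_conflict_map);
shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases, &token_conflict_map);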

View file

@@ -7,7 +7,8 @@ use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::collections::hash_map::Entry;
use std::hash::Hasher;
use std::collections::hash_map::{Entry, DefaultHasher};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
@@ -44,14 +45,13 @@ impl<'a> ParseTableBuilder<'a> {
self.parse_table.alias_sequences.push(Vec::new());
// Ensure that the error state has index 0.
let error_state_id =
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(
[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]
[(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
.iter()
.cloned(),
),
@@ -78,6 +78,10 @@ impl<'a> ParseTableBuilder<'a> {
}
}
let mut hasher = DefaultHasher::new();
item_set.hash_unfinished_items(&mut hasher);
let unfinished_item_signature = hasher.finish();
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
@@ -87,6 +91,7 @@ impl<'a> ParseTableBuilder<'a> {
lex_state_id: 0,
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
unfinished_item_signature,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
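
Each state created here now carries unfinished_item_signature, computed once from its item set. The point is to make merge candidates cheap to find later: shrink_parse_table only compares states whose signatures collide, instead of all state pairs. A minimal self-contained model of that bucketing (names illustrative):

use std::collections::HashMap;

// Group state ids by their precomputed signature; only states sharing a
// bucket are ever considered for merging.
fn bucket_by_signature(signatures: &[u64]) -> HashMap<u64, Vec<usize>> {
    let mut buckets: HashMap<u64, Vec<usize>> = HashMap::new();
    for (state_id, &signature) in signatures.iter().enumerate() {
        buckets.entry(signature).or_insert_with(Vec::new).push(state_id);
    }
    buckets
}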

View file

@@ -0,0 +1,36 @@
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::collections::{HashMap, HashSet};
pub(crate) struct CoincidentTokenIndex {
entries: HashMap<(Symbol, Symbol), HashSet<ParseStateId>>,
empty: HashSet<ParseStateId>,
}
impl CoincidentTokenIndex {
pub fn new(table: &ParseTable) -> Self {
let mut entries = HashMap::new();
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
for other_symbol in state.terminal_entries.keys() {
entries
.entry((*symbol, *other_symbol))
.or_insert(HashSet::new())
.insert(i);
}
}
}
Self {
entries,
empty: HashSet::new(),
}
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet<ParseStateId> {
self.entries.get(&(a, b)).unwrap_or(&self.empty)
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
self.entries.contains_key(&(a, b))
}
}
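
CoincidentTokenIndex answers one question: do two terminals ever both appear as valid lookaheads in the same parse state? If so, the lexer already has to distinguish them there, so a lexical conflict between them can be tolerated during merging and error-state construction. A usage sketch (assumes a ParseTable value named table):

let index = CoincidentTokenIndex::new(&table);
let a = Symbol::terminal(0);
let b = Symbol::terminal(1);
if index.contains(a, b) {
    // states_with lists every state in which both tokens have entries.
    println!("tokens co-occur in {} states", index.states_with(a, b).len());
}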

View file

@@ -2,11 +2,11 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar}
use crate::rules::Associativity;
use crate::rules::{Symbol, SymbolType};
use smallbitvec::SmallBitVec;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::u32;
use std::cmp::Ordering;
lazy_static! {
static ref START_PRODUCTION: Production = Production {
@@ -85,10 +85,10 @@ impl LookaheadSet {
.chain(if self.eof { Some(Symbol::end()) } else { None })
}
pub fn with<'a>(symbols: impl IntoIterator<Item = &'a Symbol>) -> Self {
pub fn with(symbols: impl IntoIterator<Item = Symbol>) -> Self {
let mut result = Self::new();
for symbol in symbols {
result.insert(*symbol);
result.insert(symbol);
}
result
}
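
LookaheadSet::with now takes owned Symbols instead of references, which is why call sites throughout this commit change from slices to .iter().cloned(). For example (sketch):

// Before: LookaheadSet::with(&[Symbol::end()])
// After:
let lookaheads = LookaheadSet::with([Symbol::end()].iter().cloned());
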
@@ -219,6 +219,21 @@ impl<'a> ParseItemSet<'a> {
result
}
pub fn hash_unfinished_items(&self, h: &mut impl Hasher) {
let mut previous_variable_index = u32::MAX;
let mut previous_step_index = u32::MAX;
for item in self.entries.keys() {
if item.step().is_some()
&& (item.variable_index != previous_variable_index
|| item.step_index != previous_step_index)
{
h.write_u32(item.variable_index);
h.write_u32(item.step_index);
previous_variable_index = item.variable_index;
previous_step_index = item.step_index;
}
}
}
pub fn display_with(
&'a self,
syntax_grammar: &'a SyntaxGrammar,
@@ -369,11 +384,18 @@ impl<'a> Ord for ParseItem<'a> {
if o != Ordering::Equal {
return o;
}
let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence);
let o = self
.production
.dynamic_precedence
.cmp(&other.production.dynamic_precedence);
if o != Ordering::Equal {
return o;
}
let o = self.production.steps.len().cmp(&other.production.steps.len());
let o = self
.production
.steps
.len()
.cmp(&other.production.steps.len());
if o != Ordering::Equal {
return o;
}
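
Only unfinished items (those with a step remaining) contribute to the signature: finished items produce reductions, which merging is allowed to extend with extra lookaheads, while unfinished items determine the shift and goto structure that must match exactly. A self-contained model of the hash loop above, assuming entries are sorted so equal (variable_index, step_index) pairs are adjacent:

use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;

// Each tuple is (variable_index, step_index, has_remaining_step).
fn unfinished_signature(items: &[(u32, u32, bool)]) -> u64 {
    let mut h = DefaultHasher::new();
    let (mut prev_var, mut prev_step) = (u32::MAX, u32::MAX);
    for &(var, step, unfinished) in items {
        if unfinished && (var != prev_var || step != prev_step) {
            h.write_u32(var);
            h.write_u32(step);
            prev_var = var;
            prev_step = step;
        }
    }
    h.finish()
}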

View file

@@ -1,18 +1,20 @@
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseTable};
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod lex_table_builder;
mod shrink_parse_table;
mod token_conflict_map;
mod token_conflicts;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
use self::shrink_parse_table::shrink_parse_table;
use self::token_conflict_map::TokenConflictMap;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
@@ -23,6 +25,76 @@ pub(crate) fn build_tables(
let (mut parse_table, following_tokens) =
build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
shrink_parse_table(&mut parse_table, syntax_grammar, simple_aliases);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
populate_error_state(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&coincident_token_index,
&token_conflict_map,
);
shrink_parse_table(
&mut parse_table,
syntax_grammar,
simple_aliases,
&token_conflict_map,
);
Ok((parse_table, LexTable::default(), LexTable::default(), None))
}
fn populate_error_state(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_conflict(i, j)
});
if conflicts_with_other_tokens {
None
} else {
Some(Symbol::terminal(i))
}
}));
let recover_entry = ParseTableEntry {
reusable: false,
actions: vec![ParseAction::Recover],
};
for i in 0..n {
let symbol = Symbol::terminal(i);
let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
|| conflict_free_tokens.iter().all(|t| {
coincident_token_index.contains(symbol, t)
|| !token_conflict_map.does_conflict(i, t.index)
});
if can_be_used_for_recovery {
eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
} else {
eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
}
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
if external_token.corresponding_internal_token.is_none() {
state
.terminal_entries
.entry(Symbol::external(i))
.or_insert_with(|| recover_entry.clone());
}
}
state.terminal_entries.insert(Symbol::end(), recover_entry);
}
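
The recovery entries are restricted to tokens that cannot be lexically confused with the tokens around them, plus purely external tokens and end-of-input. The conflict-free criterion can be read as a standalone predicate (illustrative signatures):

// Token i is conflict-free if every other token j either already co-occurs
// with it in some parse state (so the lexer must distinguish them anyway)
// or does not conflict with it at all.
fn is_conflict_free(
    i: usize,
    n: usize,
    coincident: impl Fn(usize, usize) -> bool,
    conflicts: impl Fn(usize, usize) -> bool,
) -> bool {
    (0..n).all(|j| j == i || coincident(i, j) || !conflicts(i, j))
}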

View file

@@ -1,14 +1,17 @@
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::AliasMap;
use crate::tables::{ParseAction, ParseTable};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
use std::collections::{HashMap, HashSet};
pub(crate) fn shrink_parse_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
) {
remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
remove_unused_states(parse_table);
}
@@ -86,6 +89,157 @@ fn remove_unit_reductions(
}
}
fn merge_compatible_states(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
}
}
if state_replacements.is_empty() {
break;
}
for state in parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
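
The merge loop runs to a fixed point: each pass may delete states and redirect references to them, and redirecting can make two previously distinct states identical, enabling further merges on the next pass. A toy model of the redirect step (types simplified):

use std::collections::HashMap;

// Rewrite every reference to a deleted state so it points at the state it
// was merged into.
fn redirect(state_refs: &mut [usize], replacements: &HashMap<usize, usize>) {
    for state_ref in state_refs.iter_mut() {
        if let Some(target) = replacements.get(state_ref) {
            *state_ref = *target;
        }
    }
}
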
fn merge_parse_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
parse_table: &mut ParseTable,
left: usize,
right: usize,
) -> bool {
let left_state = &parse_table.states[left];
let right_state = &parse_table.states[right];
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
right_state,
*symbol,
left_entry,
) {
return false;
}
}
eprintln!("maybe merge {} {}", left, right);
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(symbol) {
if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
left_state,
*symbol,
right_entry,
) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = parse_table.states[right].terminal_entries[&symbol].clone();
parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
fn can_add_entry_to_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if token_conflict_map.does_conflict(token.index, existing_token.index) {
return false;
}
}
}
true
}
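
Note the shape of the restriction: an entry may only be copied into a merged state if an identical action list already exists there and the entry ends in a Reduce. Giving an existing reduction extra lookahead tokens merely lets the parser reduce once more before detecting an error; admitting a new shift would change the language accepted. A sketch of the final-action check using the crate's types:

use crate::tables::{ParseAction, ParseTableEntry};

// Only "reduce" entries are candidates for being copied between states.
fn ends_in_reduce(entry: &ParseTableEntry) -> bool {
    match entry.actions.last() {
        Some(ParseAction::Reduce { .. }) => true,
        _ => false,
    }
}
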
fn remove_unused_states(parse_table: &mut ParseTable) {
let mut state_usage_map = vec![false; parse_table.states.len()];
for state in &parse_table.states {

View file

@@ -8,6 +8,7 @@ use std::fmt;
struct TokenConflictStatus {
does_overlap: bool,
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
}
@@ -46,8 +47,9 @@ impl TokenConflictMap {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
pub fn does_match_valid_continuation(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].does_match_valid_continuation
pub fn does_conflict(&self, i: usize, j: usize) -> bool {
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
entry.does_match_valid_continuation || entry.does_match_separators
}
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
@@ -207,10 +209,15 @@ fn compute_conflict_status(
if chars.does_intersect(&following_chars[j]) {
result.0.does_match_valid_continuation = true;
}
if cursor.in_separator() {
result.0.does_match_separators = true;
}
} else {
result.1.does_overlap = true;
if chars.does_intersect(&following_chars[i]) {
result.1.does_match_valid_continuation = true;
} else {
result.1.does_match_separators = true;
}
}
}
@@ -326,9 +333,9 @@ mod tests {
let token_map = TokenConflictMap::new(
&grammar,
vec![
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
LookaheadSet::with(&[Symbol::terminal(var("in"))]),
LookaheadSet::with(&[Symbol::terminal(var("identifier"))]),
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("in"))].iter().cloned()),
LookaheadSet::with([Symbol::terminal(var("identifier"))].iter().cloned()),
],
);
@@ -338,12 +345,12 @@
// Depending on what character follows, the string "in" may be treated as part of an
// `identifier` token.
assert!(token_map.does_match_valid_continuation(var("identifier"), var("in")));
assert!(token_map.does_conflict(var("identifier"), var("in")));
// Depending on what character follows, the string "instanceof" may be treated as part of
// an `identifier` token.
assert!(token_map.does_match_valid_continuation(var("identifier"), var("instanceof")));
assert!(token_map.does_match_valid_continuation(var("instanceof"), var("in")));
assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
assert!(token_map.does_conflict(var("instanceof"), var("in")));
}
fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {

View file

@@ -86,15 +86,34 @@ impl CharacterSet {
}
pub fn add(self, other: &CharacterSet) -> Self {
if let CharacterSet::Include(other_chars) = other {
if let CharacterSet::Include(mut chars) = self {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
return CharacterSet::Include(chars);
}
match self {
CharacterSet::Include(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.extend(other_chars);
chars.sort_unstable();
chars.dedup();
CharacterSet::Include(chars)
}
CharacterSet::Exclude(other_chars) => {
let excluded_chars = other_chars
.iter()
.cloned()
.filter(|c| !chars.contains(c))
.collect();
CharacterSet::Exclude(excluded_chars)
}
},
CharacterSet::Exclude(mut chars) => match other {
CharacterSet::Include(other_chars) => {
chars.retain(|c| !other_chars.contains(c));
CharacterSet::Exclude(chars)
}
CharacterSet::Exclude(other_chars) => {
chars.retain(|c| other_chars.contains(c));
CharacterSet::Exclude(chars)
},
},
}
panic!("Called add with a negated character set");
}
pub fn does_intersect(&self, other: &CharacterSet) -> bool {
@@ -458,6 +477,9 @@ mod tests {
(CharacterSet::empty().add_char('f'), 0, 4),
],
vec![
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
(
CharacterSet::empty()
.add_range('a', 'c')
@@ -467,9 +489,6 @@
0,
vec![1],
),
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
],
),
];
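
Worked examples of the new add cases, as a sketch (assumes CharacterSet derives Clone, Debug, and PartialEq, that add_char builds an Include set as in the tests above, and that negate turns Include into the corresponding Exclude):

let ab = CharacterSet::empty().add_char('a').add_char('b'); // Include{a,b}
let bc = CharacterSet::empty().add_char('b').add_char('c'); // Include{b,c}

// Include + Include: plain union.
assert_eq!(ab.clone().add(&bc), CharacterSet::Include(vec!['a', 'b', 'c']));

// Include{a,b} + Exclude{b,c}: 'b' becomes matchable, so only 'c' remains
// excluded.
assert_eq!(ab.add(&bc.negate()), CharacterSet::Exclude(vec!['c']));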

View file

@@ -164,12 +164,20 @@ impl NfaBuilder {
Err(Error::regex("Unicode character classes are not supported"))
}
Class::Perl(class) => {
self.push_advance(self.expand_perl_character_class(&class.kind), next_state_id);
let mut chars = self.expand_perl_character_class(&class.kind);
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
self.push_advance(self.expand_character_class(&item)?, next_state_id);
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
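
Previously the negated flag on a character class was ignored, so [^a] compiled like [a] and \S like \s. With the fix, the expanded set is complemented before the advance transition is pushed. Illustrative, reusing CharacterSet from nfa.rs:

let positive = CharacterSet::empty().add_char('a'); // what `[a]` matches
let negated = positive.negate();                    // what `[^a]` matches now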

View file

@@ -37,6 +37,7 @@ pub(crate) struct ParseState {
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
pub lex_state_id: usize,
pub unfinished_item_signature: u64,
}
#[derive(Debug, PartialEq, Eq)]