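//! Shrink a freshly-built parse table in place: collapse unit reductions,
//! merge compatible states, and remove states that are no longer referenced.
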
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry};
use hashbrown::{HashMap, HashSet};

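/// Shrink the given parse table by removing unit reductions, merging
/// compatible states, and then dropping any states that are no longer
/// referenced.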
pub(crate) fn shrink_parse_table(
    parse_table: &mut ParseTable,
    syntax_grammar: &SyntaxGrammar,
    simple_aliases: &AliasMap,
    token_conflict_map: &TokenConflictMap,
    keywords: &LookaheadSet,
) {
    let mut optimizer = Optimizer {
        parse_table,
        syntax_grammar,
        token_conflict_map,
        keywords,
        simple_aliases,
    };
    optimizer.remove_unit_reductions();
    optimizer.merge_compatible_states();
    optimizer.remove_unused_states();
}

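/// Bundles the parse table with the grammar metadata that the three
/// shrinking passes consult.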
struct Optimizer<'a> {
    parse_table: &'a mut ParseTable,
    syntax_grammar: &'a SyntaxGrammar,
    token_conflict_map: &'a TokenConflictMap<'a>,
    keywords: &'a LookaheadSet,
    simple_aliases: &'a AliasMap,
}

impl<'a> Optimizer<'a> {
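    /// Find states whose only terminal actions are unit reductions of a
    /// single hidden, unaliased symbol, and reroute all references to such
    /// states directly to their successors, so that the intermediate states
    /// become unreachable and can later be removed.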
    fn remove_unit_reductions(&mut self) {
        // Collect all symbols that are aliased anywhere in the grammar. Unit
        // reductions of these symbols must be preserved, because the alias
        // affects the resulting syntax tree.
        let mut aliased_symbols = HashSet::new();
        for variable in &self.syntax_grammar.variables {
            for production in &variable.productions {
                for step in &production.steps {
                    if step.alias.is_some() {
                        aliased_symbols.insert(step.symbol);
                    }
                }
            }
        }

        // Find the states whose only terminal actions are unit reductions of
        // a single hidden, unaliased symbol.
        let mut unit_reduction_symbols_by_state = HashMap::new();
        for (i, state) in self.parse_table.states.iter().enumerate() {
            let mut only_unit_reductions = true;
            let mut unit_reduction_symbol = None;
            for (_, entry) in &state.terminal_entries {
                for action in &entry.actions {
                    match action {
                        ParseAction::ShiftExtra => continue,
                        ParseAction::Reduce {
                            child_count: 1,
                            alias_sequence_id: 0,
                            symbol,
                            ..
                        } => {
                            if !self.simple_aliases.contains_key(symbol)
                                && !aliased_symbols.contains(symbol)
                                && self.syntax_grammar.variables[symbol.index].kind
                                    != VariableType::Named
                                && (unit_reduction_symbol.is_none()
                                    || unit_reduction_symbol == Some(symbol))
                            {
                                unit_reduction_symbol = Some(symbol);
                                continue;
                            }
                        }
                        _ => {}
                    }
                    only_unit_reductions = false;
                    break;
                }

                if !only_unit_reductions {
                    break;
                }
            }

            if let Some(symbol) = unit_reduction_symbol {
                if only_unit_reductions {
                    unit_reduction_symbols_by_state.insert(i, *symbol);
                }
            }
        }

        // Rewrite any reference to a unit-reduction state so that it points
        // directly at that state's successor. Chains of unit reductions
        // collapse one link per iteration, so repeat until a fixed point is
        // reached.
        for state in self.parse_table.states.iter_mut() {
            let mut done = false;
            while !done {
                done = true;
                state.update_referenced_states(|other_state_id, state| {
                    if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
                        done = false;
                        state.nonterminal_entries[symbol]
                    } else {
                        other_state_id
                    }
                })
            }
        }
    }

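    /// Merge pairs of states that share the same unfinished-item signature
    /// and whose entries are compatible, repeating until no further pair of
    /// states can be combined.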
    fn merge_compatible_states(&mut self) {
        // Group the states by their unfinished-item signature; only states
        // within the same group are candidates for merging.
        let mut state_ids_by_signature = HashMap::new();
        for (i, state) in self.parse_table.states.iter().enumerate() {
            state_ids_by_signature
                .entry(state.unfinished_item_signature)
                .or_insert(Vec::new())
                .push(i);
        }

        let mut deleted_states = HashSet::new();
        loop {
            let mut state_replacements = HashMap::new();
            for (_, state_ids) in &state_ids_by_signature {
                for i in state_ids {
                    // Only consider each unordered pair once, with `j` being
                    // the earlier (lower-numbered) state of the pair.
                    for j in state_ids {
                        if j == i {
                            break;
                        }
                        if deleted_states.contains(j) || deleted_states.contains(i) {
                            continue;
                        }
                        if self.merge_parse_state(*j, *i) {
                            deleted_states.insert(*i);
                            state_replacements.insert(*i, *j);
                        }
                    }
                }
            }

            if state_replacements.is_empty() {
                break;
            }

            // Redirect all references to the merged-away states.
            for state in self.parse_table.states.iter_mut() {
                state.update_referenced_states(|other_state_id, _| {
                    *state_replacements
                        .get(&other_state_id)
                        .unwrap_or(&other_state_id)
                });
            }
        }
    }

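    /// Try to merge the `right` state into the `left` state. If the two
    /// states are compatible, copy `right`'s extra terminal entries into
    /// `left` and return true; otherwise leave both states unchanged and
    /// return false.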
    fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
        let left_state = &self.parse_table.states[left];
        let right_state = &self.parse_table.states[right];

        if left_state.nonterminal_entries != right_state.nonterminal_entries {
            return false;
        }

        // Entries present in both states must be identical; entries present
        // only in `left` must be safe to add to `right`.
        for (symbol, left_entry) in &left_state.terminal_entries {
            if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
                if right_entry.actions != left_entry.actions {
                    return false;
                }
            } else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
                return false;
            }
        }

        // Entries present only in `right` must be safe to add to `left`.
        let mut symbols_to_add = Vec::new();
        for (symbol, right_entry) in &right_state.terminal_entries {
            if !left_state.terminal_entries.contains_key(symbol) {
                if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
                    return false;
                }
                symbols_to_add.push(*symbol);
            }
        }

        for symbol in symbols_to_add {
            let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
            self.parse_table.states[left]
                .terminal_entries
                .insert(symbol, entry);
        }

        true
    }

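    /// Check whether the given lookahead entry can safely be added to a
    /// state during a merge, without changing the language that the parser
    /// accepts.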
    fn can_add_entry_to_state(
        &self,
        state: &ParseState,
        token: Symbol,
        entry: &ParseTableEntry,
    ) -> bool {
        // Do not add external tokens; they could conflict lexically with any of the state's
        // existing lookahead tokens.
        if token.is_external() {
            return false;
        }

        // Only merge parse states by allowing existing reductions to happen
        // with additional lookahead tokens. Do not alter parse states in ways
        // that allow entirely new types of actions to happen.
        if state.terminal_entries.iter().all(|(_, e)| e != entry) {
            return false;
        }
        match entry.actions.last() {
            Some(ParseAction::Reduce { .. }) => {}
            _ => return false,
        }

        // Do not add tokens which are both internal and external. Their validity could
        // influence the behavior of the external scanner.
        if self
            .syntax_grammar
            .external_tokens
            .iter()
            .any(|t| t.corresponding_internal_token == Some(token))
        {
            return false;
        }

        let is_word_token = self.syntax_grammar.word_token == Some(token);
        let is_keyword = self.keywords.contains(&token);

        // Do not add a token if it conflicts with an existing token. The word
        // token and the keywords are exempt from conflicting with each other.
        if token.is_terminal() {
            for existing_token in state.terminal_entries.keys() {
                if (is_word_token && self.keywords.contains(existing_token))
                    || (is_keyword
                        && self.syntax_grammar.word_token.as_ref() == Some(existing_token))
                {
                    continue;
                }
                if self
                    .token_conflict_map
                    .does_conflict(token.index, existing_token.index)
                    || self
                        .token_conflict_map
                        .does_match_same_string(token.index, existing_token.index)
                {
                    return false;
                }
            }
        }

        true
    }

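    /// Remove the states that are no longer referenced by any other state,
    /// renumbering the remaining states so that their ids stay contiguous.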
    fn remove_unused_states(&mut self) {
        let mut state_usage_map = vec![false; self.parse_table.states.len()];

        // The first two states are always kept, whether or not any other
        // state refers to them.
        state_usage_map[0] = true;
        state_usage_map[1] = true;

        for state in &self.parse_table.states {
            for referenced_state in state.referenced_states() {
                state_usage_map[referenced_state] = true;
            }
        }

        // Compute each surviving state's new id: its old id minus the number
        // of unused states that precede it.
        let mut removed_predecessor_count = 0;
        let mut state_replacement_map = vec![0; self.parse_table.states.len()];
        for state_id in 0..self.parse_table.states.len() {
            state_replacement_map[state_id] = state_id - removed_predecessor_count;
            if !state_usage_map[state_id] {
                removed_predecessor_count += 1;
            }
        }

        // Remove the unused states and update the ids of all referenced
        // states in the survivors.
        let mut state_id = 0;
        let mut original_state_id = 0;
        while state_id < self.parse_table.states.len() {
            if state_usage_map[original_state_id] {
                self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
                    state_replacement_map[other_state_id]
                });
                state_id += 1;
            } else {
                self.parse_table.states.remove(state_id);
            }
            original_state_id += 1;
        }
    }
}