Fix logic for identifying error recovery tokens

Max Brunsfeld 2019-01-03 13:49:50 -08:00
parent 70f00d1a1e
commit 5a7d781aaa
3 changed files with 311 additions and 253 deletions


@@ -1,23 +1,26 @@
use crate::grammars::LexicalGrammar;
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::fmt;
pub(crate) struct CoincidentTokenIndex {
pub(crate) struct CoincidentTokenIndex<'a> {
entries: Vec<Vec<ParseStateId>>,
grammar: &'a LexicalGrammar,
n: usize,
}
impl CoincidentTokenIndex {
pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self {
impl<'a> CoincidentTokenIndex<'a> {
pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
let n = lexical_grammar.variables.len();
let mut result = Self {
n,
grammar: lexical_grammar,
entries: vec![Vec::new(); n * n],
};
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
for other_symbol in state.terminal_entries.keys() {
let index = result.index(*symbol, *other_symbol);
let index = result.index(symbol.index, other_symbol.index);
if result.entries[index].last().cloned() != Some(i) {
result.entries[index].push(i);
}
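
The `last().cloned() != Some(i)` check above is a compact deduplication: state ids arrive from `enumerate` in increasing order, so comparing against the most recently pushed id is enough to keep each entry list sorted and duplicate-free. A minimal sketch of the same pattern; the helper name `push_unique_ordered` is ours, not the codebase's:

fn push_unique_ordered(ids: &mut Vec<usize>, id: usize) {
    // With monotonically non-decreasing inputs, checking only the last
    // element yields a sorted Vec with no duplicates.
    if ids.last().cloned() != Some(id) {
        ids.push(id);
    }
}

fn main() {
    let mut ids = Vec::new();
    for id in [0usize, 0, 1, 1, 1, 4] {
        push_unique_ordered(&mut ids, id);
    }
    assert_eq!(ids, vec![0, 1, 4]);
}
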
@@ -28,18 +31,41 @@ impl CoincidentTokenIndex {
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
&self.entries[self.index(a, b)]
&self.entries[self.index(a.index, b.index)]
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
!self.entries[self.index(a, b)].is_empty()
!self.entries[self.index(a.index, b.index)].is_empty()
}
fn index(&self, a: Symbol, b: Symbol) -> usize {
if a.index < b.index {
a.index * self.n + b.index
fn index(&self, a: usize, b: usize) -> usize {
if a < b {
a * self.n + b
} else {
b.index * self.n + a.index
b * self.n + a
}
}
}
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "CoincidentTokenIndex {{\n")?;
write!(f, " entries: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[j].name,
self.entries[self.index(i, j)].len()
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
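
The `index` helper that the new Debug impl reuses maps each unordered pair of token indices to a single slot of the flat n * n vector by ordering the pair first, so lookups are symmetric. A standalone sketch of that invariant, assuming a small hypothetical n:

// Row-major offset for an unordered pair, mirroring CoincidentTokenIndex::index.
fn pair_index(n: usize, a: usize, b: usize) -> usize {
    if a < b { a * n + b } else { b * n + a }
}

fn main() {
    let n = 5;
    for a in 0..n {
        for b in 0..n {
            // The same slot is reached regardless of argument order.
            assert_eq!(pair_index(n, a, b), pair_index(n, b, a));
            assert!(pair_index(n, a, b) < n * n);
        }
    }
}

Note the design trade-off: only the upper-triangle slots are ever used, so roughly half the allocation is wasted in exchange for a simpler index computation than a packed triangular layout.
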


@@ -47,6 +47,7 @@ pub(crate) fn build_tables(
syntax_grammar,
simple_aliases,
&token_conflict_map,
&keywords,
);
let (main_lex_table, keyword_lex_table) =
build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
@@ -67,15 +68,22 @@ fn populate_error_state(
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
// First identify the *conflict-free tokens*: tokens that do not overlap with
// any other token in any way.
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().all(|j| {
j == i
|| coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
|| !token_conflict_map.does_conflict(i, j)
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_conflict(i, j)
});
if conflicts_with_other_tokens {
None
} else {
info!(
"error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
}
}));
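
The core of the fix is visible in this hunk: the old expression computed "i conflicts with no other token" via `all`, but stored the result in a variable named `conflicts_with_other_tokens` and branched on it as if it meant the opposite, so exactly the wrong tokens survived into `conflict_free_tokens`. By De Morgan, the new `any` form is precisely the negation of the old one, matching both the variable name and the branch. A small self-contained illustration, with a single `conflicts` closure standing in for both the coincidence and conflict-map checks:

fn main() {
    let n = 4;
    // Stand-in for the real checks: token 2 conflicts with token 3.
    let conflicts = |i: usize, j: usize| (i, j) == (2, 3) || (i, j) == (3, 2);

    for i in 0..n {
        // Old expression: true iff `i` conflicts with NO other token.
        let old_expr = (0..n).all(|j| j == i || !conflicts(i, j));
        // New expression: true iff `i` conflicts with SOME other token.
        let new_expr = (0..n).any(|j| j != i && conflicts(i, j));
        // By De Morgan, the fix is exactly a negation of the old predicate.
        assert_eq!(old_expr, !new_expr);
    }
}
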
@@ -85,19 +93,32 @@ fn populate_error_state(
actions: vec![ParseAction::Recover],
};
// Exclude from the error-recovery state any token that conflicts with one of
// the *conflict-free tokens* identified above.
for i in 0..n {
let symbol = Symbol::terminal(i);
let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
|| conflict_free_tokens.iter().all(|t| {
coincident_token_index.contains(symbol, t)
|| !token_conflict_map.does_conflict(i, t.index)
});
if can_be_used_for_recovery {
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
if !conflict_free_tokens.contains(&symbol) {
if syntax_grammar.word_token != Some(symbol) {
if let Some(t) = conflict_free_tokens.iter().find(|t| {
!coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index)
}) {
info!(
"error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
);
continue;
}
}
}
info!(
"error recovery - include token {}",
lexical_grammar.variables[i].name
);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
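
The rewritten inclusion loop above also replaces the old boolean `all` guard with a `find`, which yields the offending conflict-free token so it can be named in the log message; the word token is additionally exempted from the check. A reduced sketch of that control flow, with stand-in `coincident` and `conflicts` closures and without the word-token exemption:

fn main() {
    let conflict_free = vec![0usize, 1];
    let coincident = |a: usize, b: usize| a == b; // stand-in
    let conflicts = |a: usize, b: usize| a == 2 && b == 1; // stand-in

    for token in 0..4usize {
        if !conflict_free.contains(&token) {
            // `find` (unlike the old `all`) names the conflicting witness.
            if let Some(t) = conflict_free
                .iter()
                .find(|&&t| !coincident(token, t) && conflicts(token, t))
            {
                println!("exclude token {} because of conflict with {}", token, t);
                continue;
            }
        }
        println!("include token {}", token);
    }
}
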
@@ -134,7 +155,10 @@ fn identify_keywords(
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
{
info!("Keywords - add candidate {}", lexical_grammar.variables[i].name);
info!(
"Keywords - add candidate {}",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
} else {
None
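
For context on this last hunk: identify_keywords collects tokens that consist solely of alphabetical characters and that match a string the word token (typically an identifier rule) would also match; those become keyword candidates. The following toy approximation conveys the shape of the check; both helpers are illustrative stand-ins, not the real cursor-based implementations:

// Toy stand-in for the real all_chars_are_alphabetical, which walks an NFA cursor.
fn all_chars_are_alphabetical(token: &str) -> bool {
    !token.is_empty() && token.chars().all(|c| c.is_alphabetic())
}

// Very rough stand-in for does_match_same_string: would an identifier-shaped
// word token (letters and underscores) also match this literal?
fn word_token_matches(s: &str) -> bool {
    !s.is_empty() && s.chars().all(|c| c.is_alphabetic() || c == '_')
}

fn main() {
    for tok in ["if", "return", "<=", "foo_bar"] {
        let is_candidate = all_chars_are_alphabetical(tok) && word_token_matches(tok);
        println!("{}: keyword candidate = {}", tok, is_candidate);
    }
}
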


@@ -1,3 +1,4 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Symbol};
@@ -9,265 +10,272 @@ pub(crate) fn shrink_parse_table(
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
keywords: &LookaheadSet,
) {
remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
remove_unused_states(parse_table);
let mut optimizer = Optimizer {
parse_table,
syntax_grammar,
token_conflict_map,
keywords,
simple_aliases,
};
optimizer.remove_unit_reductions();
optimizer.merge_compatible_states();
optimizer.remove_unused_states();
}
fn remove_unit_reductions(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
) {
let mut aliased_symbols = HashSet::new();
for variable in &syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
struct Optimizer<'a> {
parse_table: &'a mut ParseTable,
syntax_grammar: &'a SyntaxGrammar,
token_conflict_map: &'a TokenConflictMap<'a>,
keywords: &'a LookaheadSet,
simple_aliases: &'a AliasMap,
}
impl<'a> Optimizer<'a> {
fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
}
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
alias_sequence_id: 0,
symbol,
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
continue;
}
}
_ => {}
}
only_unit_reductions = false;
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
}
}
}
for state in self.parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
} else {
other_state_id
}
})
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
alias_sequence_id: 0,
symbol,
..
} => {
if !simple_aliases.contains_key(&symbol)
&& !aliased_symbols.contains(&symbol)
&& syntax_grammar.variables[symbol.index].kind != VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
fn merge_compatible_states(&mut self) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if self.merge_parse_state(*j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
_ => {}
}
only_unit_reductions = false;
}
if state_replacements.is_empty() {
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
for state in self.parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
for state in parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
} else {
other_state_id
}
})
}
}
}
fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
let left_state = &self.parse_table.states[left];
let right_state = &self.parse_table.states[right];
fn merge_compatible_states(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
}
}
if state_replacements.is_empty() {
break;
}
for state in parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
fn merge_parse_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
parse_table: &mut ParseTable,
left: usize,
right: usize,
) -> bool {
let left_state = &parse_table.states[left];
let right_state = &parse_table.states[right];
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
right_state,
*symbol,
left_entry,
) {
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
}
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {
if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
left_state,
*symbol,
right_entry,
) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = parse_table.states[right].terminal_entries[&symbol].clone();
parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
fn can_add_entry_to_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if token_conflict_map.does_conflict(token.index, existing_token.index) {
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
return false;
}
}
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {
if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
self.parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
true
}
fn remove_unused_states(parse_table: &mut ParseTable) {
let mut state_usage_map = vec![false; parse_table.states.len()];
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
fn can_add_entry_to_state(
&self,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if self
.syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
let is_word_token = self.syntax_grammar.word_token == Some(token);
let is_keyword = self.keywords.contains(&token);
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if (is_word_token && self.keywords.contains(existing_token))
|| is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
{
continue;
}
if self
.token_conflict_map
.does_conflict(token.index, existing_token.index)
|| self
.token_conflict_map
.does_match_same_string(token.index, existing_token.index)
{
return false;
}
}
}
true
}
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; parse_table.states.len()];
for state_id in 0..parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
fn remove_unused_states(&mut self) {
let mut state_usage_map = vec![false; self.parse_table.states.len()];
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &self.parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
}
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < parse_table.states.len() {
if state_usage_map[original_state_id] {
parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
parse_table.states.remove(state_id);
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; self.parse_table.states.len()];
for state_id in 0..self.parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < self.parse_table.states.len() {
if state_usage_map[original_state_id] {
self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
self.parse_table.states.remove(state_id);
}
original_state_id += 1;
}
original_state_id += 1;
}
}
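
remove_unused_states, which appears unchanged in substance by this commit (only moved onto the Optimizer), compacts the state vector in two passes: first compute each state's new id by subtracting the number of unused states that precede it, then walk the table once, remapping references through that map and dropping the unused states. A standalone sketch of the renumbering, with plain booleans in place of parse states:

fn main() {
    let used = [true, true, false, true, false, true];

    // Pass 1: new id = old id minus the number of removed predecessors.
    let mut replacement = vec![0usize; used.len()];
    let mut removed = 0;
    for id in 0..used.len() {
        replacement[id] = id - removed;
        if !used[id] {
            removed += 1;
        }
    }
    assert_eq!(replacement, vec![0, 1, 2, 2, 3, 3]);

    // Pass 2: keep used states; a reference to old id k becomes replacement[k].
    for old_id in 0..used.len() {
        if used[old_id] {
            println!("state {} -> {}", old_id, replacement[old_id]);
        }
    }
}
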