Fix logic for identifying error recovery tokens

Max Brunsfeld 2019-01-03 13:49:50 -08:00
parent 70f00d1a1e
commit 5a7d781aaa
3 changed files with 311 additions and 253 deletions


@@ -1,23 +1,26 @@
use crate::grammars::LexicalGrammar;
use crate::rules::Symbol;
use crate::tables::{ParseStateId, ParseTable};
use std::fmt;
pub(crate) struct CoincidentTokenIndex {
pub(crate) struct CoincidentTokenIndex<'a> {
entries: Vec<Vec<ParseStateId>>,
grammar: &'a LexicalGrammar,
n: usize,
}
impl CoincidentTokenIndex {
pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self {
impl<'a> CoincidentTokenIndex<'a> {
pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
let n = lexical_grammar.variables.len();
let mut result = Self {
n,
grammar: lexical_grammar,
entries: vec![Vec::new(); n * n],
};
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
for other_symbol in state.terminal_entries.keys() {
let index = result.index(*symbol, *other_symbol);
let index = result.index(symbol.index, other_symbol.index);
if result.entries[index].last().cloned() != Some(i) {
result.entries[index].push(i);
}
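
The `last().cloned() != Some(i)` check above is a compact deduplication: state ids arrive from `enumerate` in increasing order, so comparing against the most recently pushed id is enough to keep each entry list sorted and duplicate-free. A minimal sketch of the same pattern; the helper name `push_unique_ordered` is ours, not the codebase's:

fn push_unique_ordered(ids: &mut Vec<usize>, id: usize) {
    // With monotonically non-decreasing inputs, checking only the last
    // element yields a sorted Vec with no duplicates.
    if ids.last().cloned() != Some(id) {
        ids.push(id);
    }
}

fn main() {
    let mut ids = Vec::new();
    for id in [0usize, 0, 1, 1, 1, 4] {
        push_unique_ordered(&mut ids, id);
    }
    assert_eq!(ids, vec![0, 1, 4]);
}
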
@@ -28,18 +31,41 @@ impl CoincidentTokenIndex {
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
&self.entries[self.index(a, b)]
&self.entries[self.index(a.index, b.index)]
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
!self.entries[self.index(a, b)].is_empty()
!self.entries[self.index(a.index, b.index)].is_empty()
}
fn index(&self, a: Symbol, b: Symbol) -> usize {
if a.index < b.index {
a.index * self.n + b.index
fn index(&self, a: usize, b: usize) -> usize {
if a < b {
a * self.n + b
} else {
b.index * self.n + a.index
b * self.n + a
}
}
}
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "CoincidentTokenIndex {{\n")?;
write!(f, " entries: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[j].name,
self.entries[self.index(i, j)].len()
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
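
The `index` helper that the new Debug impl reuses maps each unordered pair of token indices to a single slot of the flat n * n vector by ordering the pair first, so lookups are symmetric. A standalone sketch of that invariant, assuming a small hypothetical n:

// Row-major offset for an unordered pair, mirroring CoincidentTokenIndex::index.
fn pair_index(n: usize, a: usize, b: usize) -> usize {
    if a < b { a * n + b } else { b * n + a }
}

fn main() {
    let n = 5;
    for a in 0..n {
        for b in 0..n {
            // The same slot is reached regardless of argument order.
            assert_eq!(pair_index(n, a, b), pair_index(n, b, a));
            assert!(pair_index(n, a, b) < n * n);
        }
    }
}

Note the design trade-off: only the upper-triangle slots are ever used, so roughly half the allocation is wasted in exchange for a simpler index computation than a packed triangular layout.
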


@@ -47,6 +47,7 @@ pub(crate) fn build_tables(
syntax_grammar,
simple_aliases,
&token_conflict_map,
&keywords,
);
let (main_lex_table, keyword_lex_table) =
build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
@@ -67,15 +68,22 @@ fn populate_error_state(
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
// First identify the *conflict-free tokens*: tokens that do not overlap with
// any other token in any way.
let conflict_free_tokens = LookaheadSet::with((0..n).into_iter().filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().all(|j| {
j == i
|| coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
|| !token_conflict_map.does_conflict(i, j)
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_conflict(i, j)
});
if conflicts_with_other_tokens {
None
} else {
info!(
"error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
}
}));
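
The core of the fix is visible in this hunk: the old expression computed "i conflicts with no other token" via `all`, but stored the result in a variable named `conflicts_with_other_tokens` and branched on it as if it meant the opposite, so exactly the wrong tokens survived into `conflict_free_tokens`. By De Morgan, the new `any` form is precisely the negation of the old one, matching both the variable name and the branch. A small self-contained illustration, with a single `conflicts` closure standing in for both the coincidence and conflict-map checks:

fn main() {
    let n = 4;
    // Stand-in for the real checks: token 2 conflicts with token 3.
    let conflicts = |i: usize, j: usize| (i, j) == (2, 3) || (i, j) == (3, 2);

    for i in 0..n {
        // Old expression: true iff `i` conflicts with NO other token.
        let old_expr = (0..n).all(|j| j == i || !conflicts(i, j));
        // New expression: true iff `i` conflicts with SOME other token.
        let new_expr = (0..n).any(|j| j != i && conflicts(i, j));
        // By De Morgan, the fix is exactly a negation of the old predicate.
        assert_eq!(old_expr, !new_expr);
    }
}
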
@@ -85,19 +93,32 @@ fn populate_error_state(
actions: vec![ParseAction::Recover],
};
// Exclude from the error-recovery state any token that conflicts with one of
// the *conflict-free tokens* identified above.
for i in 0..n {
let symbol = Symbol::terminal(i);
let can_be_used_for_recovery = conflict_free_tokens.contains(&symbol)
|| conflict_free_tokens.iter().all(|t| {
coincident_token_index.contains(symbol, t)
|| !token_conflict_map.does_conflict(i, t.index)
});
if can_be_used_for_recovery {
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
if !conflict_free_tokens.contains(&symbol) {
if syntax_grammar.word_token != Some(symbol) {
if let Some(t) = conflict_free_tokens.iter().find(|t| {
!coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index)
}) {
info!(
"error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
);
continue;
}
}
}
info!(
"error recovery - include token {}",
lexical_grammar.variables[i].name
);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
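
The rewritten inclusion loop above also replaces the old boolean `all` guard with a `find`, which yields the offending conflict-free token so it can be named in the log message; the word token is additionally exempted from the check. A reduced sketch of that control flow, with stand-in `coincident` and `conflicts` closures and without the word-token exemption:

fn main() {
    let conflict_free = vec![0usize, 1];
    let coincident = |a: usize, b: usize| a == b; // stand-in
    let conflicts = |a: usize, b: usize| a == 2 && b == 1; // stand-in

    for token in 0..4usize {
        if !conflict_free.contains(&token) {
            // `find` (unlike the old `all`) names the conflicting witness.
            if let Some(t) = conflict_free
                .iter()
                .find(|&&t| !coincident(token, t) && conflicts(token, t))
            {
                println!("exclude token {} because of conflict with {}", token, t);
                continue;
            }
        }
        println!("include token {}", token);
    }
}
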
@@ -134,7 +155,10 @@ fn identify_keywords(
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
{
info!("Keywords - add candidate {}", lexical_grammar.variables[i].name);
info!(
"Keywords - add candidate {}",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
} else {
None
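
For context on this last hunk: identify_keywords collects tokens that consist solely of alphabetical characters and that match a string the word token (typically an identifier rule) would also match; those become keyword candidates. The following toy approximation conveys the shape of the check; both helpers are illustrative stand-ins, not the real cursor-based implementations:

// Toy stand-in for the real all_chars_are_alphabetical, which walks an NFA cursor.
fn all_chars_are_alphabetical(token: &str) -> bool {
    !token.is_empty() && token.chars().all(|c| c.is_alphabetic())
}

// Very rough stand-in for does_match_same_string: would an identifier-shaped
// word token (letters and underscores) also match this literal?
fn word_token_matches(s: &str) -> bool {
    !s.is_empty() && s.chars().all(|c| c.is_alphabetic() || c == '_')
}

fn main() {
    for tok in ["if", "return", "<=", "foo_bar"] {
        let is_candidate = all_chars_are_alphabetical(tok) && word_token_matches(tok);
        println!("{}: keyword candidate = {}", tok, is_candidate);
    }
}
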


@@ -1,3 +1,4 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Symbol};
@@ -9,265 +10,272 @@ pub(crate) fn shrink_parse_table(
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
keywords: &LookaheadSet,
) {
remove_unit_reductions(parse_table, syntax_grammar, simple_aliases);
merge_compatible_states(parse_table, syntax_grammar, token_conflict_map);
remove_unused_states(parse_table);
let mut optimizer = Optimizer {
parse_table,
syntax_grammar,
token_conflict_map,
keywords,
simple_aliases,
};
optimizer.remove_unit_reductions();
optimizer.merge_compatible_states();
optimizer.remove_unused_states();
}
fn remove_unit_reductions(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
simple_aliases: &AliasMap,
) {
let mut aliased_symbols = HashSet::new();
for variable in &syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
struct Optimizer<'a> {
parse_table: &'a mut ParseTable,
syntax_grammar: &'a SyntaxGrammar,
token_conflict_map: &'a TokenConflictMap<'a>,
keywords: &'a LookaheadSet,
simple_aliases: &'a AliasMap,
}
impl<'a> Optimizer<'a> {
fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
}
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
alias_sequence_id: 0,
symbol,
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
continue;
}
}
_ => {}
}
only_unit_reductions = false;
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
}
}
}
for state in self.parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
} else {
other_state_id
}
})
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
alias_sequence_id: 0,
symbol,
..
} => {
if !simple_aliases.contains_key(&symbol)
&& !aliased_symbols.contains(&symbol)
&& syntax_grammar.variables[symbol.index].kind != VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
fn merge_compatible_states(&mut self) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if self.merge_parse_state(*j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
_ => {}
}
only_unit_reductions = false;
}
if state_replacements.is_empty() {
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
for state in self.parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
for state in parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
} else {
other_state_id
}
})
}
}
}
fn merge_parse_state(&mut self, left: usize, right: usize) -> bool {
let left_state = &self.parse_table.states[left];
let right_state = &self.parse_table.states[right];
fn merge_compatible_states(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
) {
let mut state_ids_by_signature = HashMap::new();
for (i, state) in parse_table.states.iter().enumerate() {
state_ids_by_signature
.entry(state.unfinished_item_signature)
.or_insert(Vec::new())
.push(i);
}
let mut deleted_states = HashSet::new();
loop {
let mut state_replacements = HashMap::new();
for (_, state_ids) in &state_ids_by_signature {
for i in state_ids {
for j in state_ids {
if j == i {
break;
}
if deleted_states.contains(j) || deleted_states.contains(i) {
continue;
}
if merge_parse_state(syntax_grammar, token_conflict_map, parse_table, *j, *i) {
deleted_states.insert(*i);
state_replacements.insert(*i, *j);
}
}
}
}
if state_replacements.is_empty() {
break;
}
for state in parse_table.states.iter_mut() {
state.update_referenced_states(|other_state_id, _| {
*state_replacements
.get(&other_state_id)
.unwrap_or(&other_state_id)
});
}
}
}
fn merge_parse_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
parse_table: &mut ParseTable,
left: usize,
right: usize,
) -> bool {
let left_state = &parse_table.states[left];
let right_state = &parse_table.states[right];
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
right_state,
*symbol,
left_entry,
) {
if left_state.nonterminal_entries != right_state.nonterminal_entries {
return false;
}
}
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {
if !can_add_entry_to_state(
syntax_grammar,
token_conflict_map,
left_state,
*symbol,
right_entry,
) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = parse_table.states[right].terminal_entries[&symbol].clone();
parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
fn can_add_entry_to_state(
syntax_grammar: &SyntaxGrammar,
token_conflict_map: &TokenConflictMap,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if token_conflict_map.does_conflict(token.index, existing_token.index) {
for (symbol, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(symbol) {
if right_entry.actions != left_entry.actions {
return false;
}
} else if !self.can_add_entry_to_state(right_state, *symbol, left_entry) {
return false;
}
}
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {
if !self.can_add_entry_to_state(left_state, *symbol, right_entry) {
return false;
}
symbols_to_add.push(*symbol);
}
}
for symbol in symbols_to_add {
let entry = self.parse_table.states[right].terminal_entries[&symbol].clone();
self.parse_table.states[left]
.terminal_entries
.insert(symbol, entry);
}
true
}
true
}
fn remove_unused_states(parse_table: &mut ParseTable) {
let mut state_usage_map = vec![false; parse_table.states.len()];
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
fn can_add_entry_to_state(
&self,
state: &ParseState,
token: Symbol,
entry: &ParseTableEntry,
) -> bool {
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if token.is_external() {
return false;
}
// Only merge parse states by allowing existing reductions to happen
// with additional lookahead tokens. Do not alter parse states in ways
// that allow entirely new types of actions to happen.
if state.terminal_entries.iter().all(|(_, e)| e != entry) {
return false;
}
match entry.actions.last() {
Some(ParseAction::Reduce { .. }) => {}
_ => return false,
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if self
.syntax_grammar
.external_tokens
.iter()
.any(|t| t.corresponding_internal_token == Some(token))
{
return false;
}
let is_word_token = self.syntax_grammar.word_token == Some(token);
let is_keyword = self.keywords.contains(&token);
// Do not add a token if it conflicts with an existing token.
if token.is_terminal() {
for existing_token in state.terminal_entries.keys() {
if (is_word_token && self.keywords.contains(existing_token))
|| is_keyword && self.syntax_grammar.word_token.as_ref() == Some(existing_token)
{
continue;
}
if self
.token_conflict_map
.does_conflict(token.index, existing_token.index)
|| self
.token_conflict_map
.does_match_same_string(token.index, existing_token.index)
{
return false;
}
}
}
true
}
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; parse_table.states.len()];
for state_id in 0..parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
fn remove_unused_states(&mut self) {
let mut state_usage_map = vec![false; self.parse_table.states.len()];
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &self.parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
}
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < parse_table.states.len() {
if state_usage_map[original_state_id] {
parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
parse_table.states.remove(state_id);
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; self.parse_table.states.len()];
for state_id in 0..self.parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < self.parse_table.states.len() {
if state_usage_map[original_state_id] {
self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
self.parse_table.states.remove(state_id);
}
original_state_id += 1;
}
original_state_id += 1;
}
}
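
remove_unused_states, which appears unchanged in substance by this commit (only moved onto the Optimizer), compacts the state vector in two passes: first compute each state's new id by subtracting the number of unused states that precede it, then walk the table once, remapping references through that map and dropping the unused states. A standalone sketch of the renumbering, with plain booleans in place of parse states:

fn main() {
    let used = [true, true, false, true, false, true];

    // Pass 1: new id = old id minus the number of removed predecessors.
    let mut replacement = vec![0usize; used.len()];
    let mut removed = 0;
    for id in 0..used.len() {
        replacement[id] = id - removed;
        if !used[id] {
            removed += 1;
        }
    }
    assert_eq!(replacement, vec![0, 1, 2, 2, 3, 3]);

    // Pass 2: keep used states; a reference to old id k becomes replacement[k].
    for old_id in 0..used.len() {
        if used[old_id] {
            println!("state {} -> {}", old_id, replacement[old_id]);
        }
    }
}
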