feat: add 'reserved word' construct

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
Max Brunsfeld 2024-12-23 00:06:32 -08:00 committed by GitHub
parent 2a63077cac
commit 201b41cf11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 2367 additions and 1628 deletions

View file

@ -43,15 +43,17 @@ pub fn build_lex_table(
let tokens = state
.terminal_entries
.keys()
.copied()
.chain(state.reserved_words.iter())
.filter_map(|token| {
if token.is_terminal() {
if keywords.contains(token) {
if keywords.contains(&token) {
syntax_grammar.word_token
} else {
Some(*token)
Some(token)
}
} else if token.is_eof() {
Some(*token)
Some(token)
} else {
None
}

View file

@ -10,13 +10,11 @@ use indexmap::{map::Entry, IndexMap};
use rustc_hash::FxHasher;
use super::{
item::{ParseItem, ParseItemSet, ParseItemSetCore},
item::{ParseItem, ParseItemSet, ParseItemSetCore, ParseItemSetEntry},
item_set_builder::ParseItemSetBuilder,
};
use crate::{
grammars::{
InlinedProductionMap, LexicalGrammar, PrecedenceEntry, SyntaxGrammar, VariableType,
},
grammars::{LexicalGrammar, PrecedenceEntry, ReservedWordSetId, SyntaxGrammar, VariableType},
node_types::VariableInfo,
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
tables::{
@ -67,6 +65,33 @@ struct ParseTableBuilder<'a> {
}
impl<'a> ParseTableBuilder<'a> {
fn new(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
item_set_builder: ParseItemSetBuilder<'a>,
variable_info: &'a [VariableInfo],
) -> Self {
Self {
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
actual_conflicts: syntax_grammar.expected_conflicts.iter().cloned().collect(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
}
fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
// Ensure that the empty alias sequence has index 0.
self.parse_table
@ -80,10 +105,13 @@ impl<'a> ParseTableBuilder<'a> {
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(std::iter::once((
ParseItem::start(),
std::iter::once(Symbol::end()).collect(),
))),
ParseItemSet {
entries: vec![ParseItemSetEntry {
item: ParseItem::start(),
lookaheads: std::iter::once(Symbol::end()).collect(),
following_reserved_word_set: ReservedWordSetId::default(),
}],
},
);
// Compute the possible item sets for non-terminal extras.
@ -99,15 +127,14 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap())
.or_insert_with(ParseItemSet::default)
.insert(
ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
has_preceding_inherited_fields: false,
},
&std::iter::once(Symbol::end_of_nonterminal_extra()).collect(),
);
.insert(ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
has_preceding_inherited_fields: false,
})
.lookaheads
.insert(Symbol::end_of_nonterminal_extra());
}
}
@ -176,6 +203,7 @@ impl<'a> ParseTableBuilder<'a> {
external_lex_state_id: 0,
terminal_entries: IndexMap::default(),
nonterminal_entries: IndexMap::default(),
reserved_words: TokenSet::default(),
core_id,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
@ -202,13 +230,18 @@ impl<'a> ParseTableBuilder<'a> {
// Each item in the item set contributes to either a Shift action or a Reduce
// action in this state.
for (item, lookaheads) in &item_set.entries {
for ParseItemSetEntry {
item,
lookaheads,
following_reserved_word_set: reserved_lookaheads,
} in &item_set.entries
{
// If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let mut successor = item.successor();
if next_symbol.is_non_terminal() {
let successor_set = if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are
@ -237,13 +270,16 @@ impl<'a> ParseTableBuilder<'a> {
non_terminal_successors
.entry(next_symbol)
.or_insert_with(ParseItemSet::default)
.insert(successor, lookaheads);
} else {
terminal_successors
.entry(next_symbol)
.or_insert_with(ParseItemSet::default)
.insert(successor, lookaheads);
}
};
let successor_entry = successor_set.insert(successor);
successor_entry.lookaheads.insert_all(lookaheads);
successor_entry.following_reserved_word_set = successor_entry
.following_reserved_word_set
.max(*reserved_lookaheads);
}
// If the item is finished, then add a Reduce action to this state based
// on this item.
@ -370,7 +406,7 @@ impl<'a> ParseTableBuilder<'a> {
)?;
}
// Finally, add actions for the grammar's `extra` symbols.
// Add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id];
let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra();
@ -382,7 +418,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set
.entries
.iter()
.filter_map(|(item, _)| {
.filter_map(|ParseItemSetEntry { item, .. }| {
if !item.is_augmented() && item.step_index > 0 {
Some(item.variable_index)
} else {
@ -436,6 +472,30 @@ impl<'a> ParseTableBuilder<'a> {
}
}
if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
let reserved_word_set_id = item_set
.entries
.iter()
.filter_map(|entry| {
if let Some(next_step) = entry.item.step() {
if next_step.symbol == keyword_capture_token {
Some(next_step.reserved_word_set_id)
} else {
None
}
} else if entry.lookaheads.contains(&keyword_capture_token) {
Some(entry.following_reserved_word_set)
} else {
None
}
})
.max();
if let Some(reserved_word_set_id) = reserved_word_set_id {
state.reserved_words =
self.syntax_grammar.reserved_word_sets[reserved_word_set_id.0].clone();
}
}
Ok(())
}
@ -462,7 +522,10 @@ impl<'a> ParseTableBuilder<'a> {
let mut considered_associativity = false;
let mut shift_precedence = Vec::<(&Precedence, Symbol)>::new();
let mut conflicting_items = HashSet::new();
for (item, lookaheads) in &item_set.entries {
for ParseItemSetEntry {
item, lookaheads, ..
} in &item_set.entries
{
if let Some(step) = item.step() {
if item.step_index > 0
&& self
@ -836,7 +899,7 @@ impl<'a> ParseTableBuilder<'a> {
let parent_symbols = item_set
.entries
.iter()
.filter_map(|(item, _)| {
.filter_map(|ParseItemSetEntry { item, .. }| {
let variable_index = item.variable_index as usize;
if item.symbol() == Some(symbol)
&& !self.syntax_grammar.variables[variable_index].is_auxiliary()
@ -931,77 +994,17 @@ impl<'a> ParseTableBuilder<'a> {
}
}
fn populate_following_tokens(
result: &mut [TokenSet],
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
}
result[extra.index].clone_from(&all_tokens);
}
}
}
pub fn build_parse_table<'a>(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
item_set_builder: ParseItemSetBuilder<'a>,
variable_info: &'a [VariableInfo],
) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
let actual_conflicts = syntax_grammar.expected_conflicts.iter().cloned().collect();
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let (table, item_sets) = ParseTableBuilder {
) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
ParseTableBuilder::new(
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
actual_conflicts,
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
.build()?;
Ok((table, following_tokens, item_sets))
)
.build()
}

View file

@ -7,7 +7,10 @@ use std::{
use lazy_static::lazy_static;
use crate::{
grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
grammars::{
LexicalGrammar, Production, ProductionStep, ReservedWordSetId, SyntaxGrammar,
NO_RESERVED_WORDS,
},
rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
};
@ -23,6 +26,7 @@ lazy_static! {
associativity: None,
alias: None,
field_name: None,
reserved_word_set_id: NO_RESERVED_WORDS,
}],
};
}
@ -58,7 +62,14 @@ pub struct ParseItem<'a> {
/// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq, Default)]
pub struct ParseItemSet<'a> {
pub entries: Vec<(ParseItem<'a>, TokenSet)>,
pub entries: Vec<ParseItemSetEntry<'a>>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ParseItemSetEntry<'a> {
pub item: ParseItem<'a>,
pub lookaheads: TokenSet,
pub following_reserved_word_set: ReservedWordSetId,
}
/// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead
@ -152,30 +163,26 @@ impl<'a> ParseItem<'a> {
}
impl<'a> ParseItemSet<'a> {
pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, TokenSet)>) -> Self {
let mut result = Self::default();
for (item, lookaheads) in elements {
result.insert(item, &lookaheads);
}
result
}
pub fn insert(&mut self, item: ParseItem<'a>, lookaheads: &TokenSet) -> &mut TokenSet {
match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) {
pub fn insert(&mut self, item: ParseItem<'a>) -> &mut ParseItemSetEntry<'a> {
match self.entries.binary_search_by(|e| e.item.cmp(&item)) {
Err(i) => {
self.entries.insert(i, (item, lookaheads.clone()));
&mut self.entries[i].1
}
Ok(i) => {
self.entries[i].1.insert_all(lookaheads);
&mut self.entries[i].1
self.entries.insert(
i,
ParseItemSetEntry {
item,
lookaheads: TokenSet::new(),
following_reserved_word_set: ReservedWordSetId::default(),
},
);
&mut self.entries[i]
}
Ok(i) => &mut self.entries[i],
}
}
pub fn core(&self) -> ParseItemSetCore<'a> {
ParseItemSetCore {
entries: self.entries.iter().map(|e| e.0).collect(),
entries: self.entries.iter().map(|e| e.item).collect(),
}
}
}
@ -195,14 +202,21 @@ impl fmt::Display for ParseItemDisplay<'_> {
for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize {
write!(f, "")?;
if let Some(associativity) = step.associativity {
if !step.precedence.is_none()
|| step.associativity.is_some()
|| step.reserved_word_set_id != ReservedWordSetId::default()
{
write!(f, " (")?;
if step.precedence.is_none() {
write!(f, " ({associativity:?})")?;
} else {
write!(f, " ({} {associativity:?})", step.precedence)?;
write!(f, " {}", step.precedence)?;
}
} else if !step.precedence.is_none() {
write!(f, " ({})", step.precedence)?;
if let Some(associativity) = step.associativity {
write!(f, " {associativity:?}")?;
}
if step.reserved_word_set_id != ReservedWordSetId::default() {
write!(f, "reserved: {}", step.reserved_word_set_id)?;
}
write!(f, " )")?;
}
}
@ -270,13 +284,21 @@ impl fmt::Display for TokenSetDisplay<'_> {
impl fmt::Display for ParseItemSetDisplay<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for (item, lookaheads) in &self.0.entries {
writeln!(
for entry in &self.0.entries {
write!(
f,
"{}\t{}",
ParseItemDisplay(item, self.1, self.2),
TokenSetDisplay(lookaheads, self.1, self.2)
ParseItemDisplay(&entry.item, self.1, self.2),
TokenSetDisplay(&entry.lookaheads, self.1, self.2),
)?;
if entry.following_reserved_word_set != ReservedWordSetId::default() {
write!(
f,
"\treserved word set: {}",
entry.following_reserved_word_set
)?;
}
writeln!(f)?;
}
Ok(())
}
@ -296,7 +318,7 @@ impl Hash for ParseItem<'_> {
// this item, unless any of the following are true:
// * the children have fields
// * the children have aliases
// * the children are hidden and
// * the children are hidden and represent rules that have fields.
// See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher);
@ -399,9 +421,10 @@ impl Eq for ParseItem<'_> {}
impl Hash for ParseItemSet<'_> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len());
for (item, lookaheads) in &self.entries {
item.hash(hasher);
lookaheads.hash(hasher);
for entry in &self.entries {
entry.item.hash(hasher);
entry.lookaheads.hash(hasher);
entry.following_reserved_word_set.hash(hasher);
}
}
}

View file

@ -3,9 +3,9 @@ use std::{
fmt,
};
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay};
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, ParseItemSetEntry, TokenSetDisplay};
use crate::{
grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
grammars::{InlinedProductionMap, LexicalGrammar, ReservedWordSetId, SyntaxGrammar},
rules::{Symbol, SymbolType, TokenSet},
};
@ -15,9 +15,10 @@ struct TransitiveClosureAddition<'a> {
info: FollowSetInfo,
}
#[derive(Clone, Debug, PartialEq, Eq)]
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct FollowSetInfo {
lookaheads: TokenSet,
reserved_lookaheads: ReservedWordSetId,
propagates_lookaheads: bool,
}
@ -25,6 +26,7 @@ pub struct ParseItemSetBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
first_sets: HashMap<Symbol, TokenSet>,
reserved_first_sets: HashMap<Symbol, ReservedWordSetId>,
last_sets: HashMap<Symbol, TokenSet>,
inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
@ -46,6 +48,7 @@ impl<'a> ParseItemSetBuilder<'a> {
syntax_grammar,
lexical_grammar,
first_sets: HashMap::new(),
reserved_first_sets: HashMap::new(),
last_sets: HashMap::new(),
inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
@ -54,8 +57,7 @@ impl<'a> ParseItemSetBuilder<'a> {
// For each grammar symbol, populate the FIRST and LAST sets: the set of
// terminals that appear at the beginning and end that symbol's productions,
// respectively.
//
// For a terminal symbol, the FIRST and LAST set just consists of the
// For a terminal symbol, the FIRST and LAST sets just consist of the
// terminal itself.
for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i);
@ -63,6 +65,9 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
}
for i in 0..syntax_grammar.external_tokens.len() {
@ -71,12 +76,15 @@ impl<'a> ParseItemSetBuilder<'a> {
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
result
.reserved_first_sets
.insert(symbol, ReservedWordSetId::default());
}
// The FIRST set of a non-terminal `i` is the union of the following sets:
// * the set of all terminals that appear at the beginnings of i's productions
// * the FIRST sets of all the non-terminals that appear at the beginnings of i's
// productions
// The FIRST set of a non-terminal `i` is the union of the FIRST sets
// of all the symbols that appear at the beginnings of i's productions. Some
// of these symbols may themselves be non-terminals, so this is a recursive
// definition.
//
// Rather than computing these sets using recursion, we use an explicit stack
// called `symbols_to_process`.
@ -84,37 +92,36 @@ impl<'a> ParseItemSetBuilder<'a> {
let mut processed_non_terminals = HashSet::new();
for i in 0..syntax_grammar.variables.len() {
let symbol = Symbol::non_terminal(i);
let first_set = result.first_sets.entry(symbol).or_default();
let reserved_first_set = result.reserved_first_sets.entry(symbol).or_default();
let first_set = result
.first_sets
.entry(symbol)
.or_insert_with(TokenSet::new);
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
first_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in &syntax_grammar.variables[current_symbol.index].productions {
if let Some(step) = production.steps.first() {
while let Some(sym) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions {
if let Some(step) = production.steps.first() {
if step.symbol.is_terminal() || step.symbol.is_external() {
first_set.insert(step.symbol);
} else if processed_non_terminals.insert(step.symbol) {
symbols_to_process.push(step.symbol);
}
*reserved_first_set = (*reserved_first_set).max(step.reserved_word_set_id);
}
}
}
// The LAST set is defined in a similar way to the FIRST set.
let last_set = result.last_sets.entry(symbol).or_insert_with(TokenSet::new);
let last_set = result.last_sets.entry(symbol).or_default();
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
last_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in &syntax_grammar.variables[current_symbol.index].productions {
if let Some(step) = production.steps.last() {
while let Some(sym) = symbols_to_process.pop() {
for production in &syntax_grammar.variables[sym.index].productions {
if let Some(step) = production.steps.last() {
if step.symbol.is_terminal() || step.symbol.is_external() {
last_set.insert(step.symbol);
} else if processed_non_terminals.insert(step.symbol) {
symbols_to_process.push(step.symbol);
}
}
@ -124,67 +131,75 @@ impl<'a> ParseItemSetBuilder<'a> {
// To compute an item set's transitive closure, we find each item in the set
// whose next symbol is a non-terminal, and we add new items to the set for
// each of that symbols' productions. These productions might themselves begin
// each of that symbol's productions. These productions might themselves begin
// with non-terminals, so the process continues recursively. In this process,
// the total set of entries that get added depends only on two things:
// * the set of non-terminal symbols that occur at each item's current position
// * the set of terminals that occurs after each of these non-terminal symbols
//
// * the non-terminal symbol that occurs next in each item
//
// * the set of terminals that can follow that non-terminal symbol in the item
//
// So we can avoid a lot of duplicated recursive work by precomputing, for each
// non-terminal symbol `i`, a final list of *additions* that must be made to an
// item set when `i` occurs as the next symbol in one of its core items. The
// structure of an *addition* is as follows:
// * `item` - the new item that must be added as part of the expansion of `i`
// * `lookaheads` - lookahead tokens that can always come after that item in the expansion
// of `i`
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
// *end* of the expansion of `i`, so that i's own current lookahead tokens can occur
// after `item`.
// item set when symbol `i` occurs as the next symbol in one of its core items.
// The structure of a precomputed *addition* is as follows:
//
// Again, rather than computing these additions recursively, we use an explicit
// stack called `entries_to_process`.
// * `item` - the new item that must be added as part of the expansion of the symbol `i`.
//
// * `lookaheads` - the set of possible lookahead tokens that can always come after `item`
// in an expansion of symbol `i`.
//
// * `reserved_lookaheads` - the set of reserved lookahead tokens that can
// always come after `item` in the expansion of symbol `i`.
//
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
// *end* of the expansion of symbol `i`, so that i's own current lookahead tokens can
// occur after `item`.
//
// Rather than computing these additions recursively, we use an explicit stack.
let empty_lookaheads = TokenSet::new();
let mut stack = Vec::new();
let mut follow_set_info_by_non_terminal = HashMap::<usize, FollowSetInfo>::new();
for i in 0..syntax_grammar.variables.len() {
let empty_lookaheads = TokenSet::new();
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
// First, build up a map whose keys are all of the non-terminals that can
// appear at the beginning of non-terminal `i`, and whose values store
// information about the tokens that can follow each non-terminal.
let mut follow_set_info_by_non_terminal = HashMap::new();
while let Some(entry) = entries_to_process.pop() {
let (variable_index, lookaheads, propagates_lookaheads) = entry;
let existing_info = follow_set_info_by_non_terminal
.entry(variable_index)
.or_insert_with(|| FollowSetInfo {
lookaheads: TokenSet::new(),
propagates_lookaheads: false,
});
let did_add_follow_set_info;
if propagates_lookaheads {
did_add_follow_set_info = !existing_info.propagates_lookaheads;
existing_info.propagates_lookaheads = true;
} else {
did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
// information about the tokens that can follow those non-terminals.
stack.clear();
stack.push((i, &empty_lookaheads, ReservedWordSetId::default(), true));
follow_set_info_by_non_terminal.clear();
while let Some((sym_ix, lookaheads, reserved_word_set_id, propagates_lookaheads)) =
stack.pop()
{
let mut did_add = false;
let info = follow_set_info_by_non_terminal.entry(sym_ix).or_default();
did_add |= info.lookaheads.insert_all(lookaheads);
if reserved_word_set_id > info.reserved_lookaheads {
info.reserved_lookaheads = reserved_word_set_id;
did_add = true;
}
did_add |= propagates_lookaheads && !info.propagates_lookaheads;
info.propagates_lookaheads |= propagates_lookaheads;
if !did_add {
continue;
}
if did_add_follow_set_info {
for production in &syntax_grammar.variables[variable_index].productions {
if let Some(symbol) = production.first_symbol() {
if symbol.is_non_terminal() {
if production.steps.len() == 1 {
entries_to_process.push((
symbol.index,
lookaheads,
propagates_lookaheads,
));
} else {
entries_to_process.push((
symbol.index,
&result.first_sets[&production.steps[1].symbol],
false,
));
}
for production in &syntax_grammar.variables[sym_ix].productions {
if let Some(symbol) = production.first_symbol() {
if symbol.is_non_terminal() {
if let Some(next_step) = production.steps.get(1) {
stack.push((
symbol.index,
&result.first_sets[&next_step.symbol],
result.reserved_first_sets[&next_step.symbol],
false,
));
} else {
stack.push((
symbol.index,
lookaheads,
reserved_word_set_id,
propagates_lookaheads,
));
}
}
}
@ -194,7 +209,7 @@ impl<'a> ParseItemSetBuilder<'a> {
// Store all of those non-terminals' productions, along with their associated
// lookahead info, as *additions* associated with non-terminal `i`.
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
for (&variable_index, follow_set_info) in &follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index);
let variable_index = variable_index as u32;
@ -239,20 +254,23 @@ impl<'a> ParseItemSetBuilder<'a> {
pub fn transitive_closure(&self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default();
for (item, lookaheads) in &item_set.entries {
for entry in &item_set.entries {
if let Some(productions) = self
.inlines
.inlined_productions(item.production, item.step_index)
.inlined_productions(entry.item.production, entry.item.step_index)
{
for production in productions {
self.add_item(
&mut result,
item.substitute_production(production),
lookaheads,
&ParseItemSetEntry {
item: entry.item.substitute_production(production),
lookaheads: entry.lookaheads.clone(),
following_reserved_word_set: entry.following_reserved_word_set,
},
);
}
} else {
self.add_item(&mut result, *item, lookaheads);
self.add_item(&mut result, entry);
}
}
result
@ -262,30 +280,64 @@ impl<'a> ParseItemSetBuilder<'a> {
&self.first_sets[symbol]
}
pub fn reserved_first_set(&self, symbol: &Symbol) -> Option<&TokenSet> {
let id = *self.reserved_first_sets.get(symbol)?;
Some(&self.syntax_grammar.reserved_word_sets[id.0])
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.last_sets[symbol]
}
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
if let Some(step) = item.step() {
fn add_item(&self, set: &mut ParseItemSet<'a>, entry: &ParseItemSetEntry<'a>) {
if let Some(step) = entry.item.step() {
if step.symbol.is_non_terminal() {
let next_step = item.successor().step();
let next_step = entry.item.successor().step();
// Determine which tokens can follow this non-terminal.
let following_tokens = next_step.map_or(lookaheads, |next_step| {
self.first_sets.get(&next_step.symbol).unwrap()
});
let (following_tokens, following_reserved_tokens) =
if let Some(next_step) = next_step {
(
self.first_sets.get(&next_step.symbol).unwrap(),
*self.reserved_first_sets.get(&next_step.symbol).unwrap(),
)
} else {
(&entry.lookaheads, entry.following_reserved_word_set)
};
// Use the pre-computed *additions* to expand the non-terminal.
for addition in &self.transitive_closure_additions[step.symbol.index] {
let lookaheads = set.insert(addition.item, &addition.info.lookaheads);
let entry = set.insert(addition.item);
entry.lookaheads.insert_all(&addition.info.lookaheads);
if let Some(word_token) = self.syntax_grammar.word_token {
if addition.info.lookaheads.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(addition.info.reserved_lookaheads);
}
}
if addition.info.propagates_lookaheads {
lookaheads.insert_all(following_tokens);
entry.lookaheads.insert_all(following_tokens);
if let Some(word_token) = self.syntax_grammar.word_token {
if following_tokens.contains(&word_token) {
entry.following_reserved_word_set = entry
.following_reserved_word_set
.max(following_reserved_tokens);
}
}
}
}
}
}
set.insert(item, lookaheads);
let e = set.insert(entry.item);
e.lookaheads.insert_all(&entry.lookaheads);
e.following_reserved_word_set = e
.following_reserved_word_set
.max(entry.following_reserved_word_set);
}
}

View file

@ -170,17 +170,12 @@ impl Minimizer<'_> {
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id {
// Initialize the new state based on the first old state in the group.
let mut parse_state = ParseState::default();
mem::swap(&mut parse_state, &mut self.parse_table.states[state_ids[0]]);
let mut parse_state = mem::take(&mut self.parse_table.states[state_ids[0]]);
// Extend the new state with all of the actions from the other old states
// in the group.
for state_id in &state_ids[1..] {
let mut other_parse_state = ParseState::default();
mem::swap(
&mut other_parse_state,
&mut self.parse_table.states[*state_id],
);
let other_parse_state = mem::take(&mut self.parse_table.states[*state_id]);
parse_state
.terminal_entries
@ -188,6 +183,12 @@ impl Minimizer<'_> {
parse_state
.nonterminal_entries
.extend(other_parse_state.nonterminal_entries);
parse_state
.reserved_words
.insert_all(&other_parse_state.reserved_words);
for symbol in parse_state.terminal_entries.keys() {
parse_state.reserved_words.remove(symbol);
}
}
// Update the new state's outgoing references using the new grouping.
@ -216,24 +217,14 @@ impl Minimizer<'_> {
) {
return true;
}
} else if self.token_conflicts(
left_state.id,
right_state.id,
right_state.terminal_entries.keys(),
*token,
) {
} else if self.token_conflicts(left_state.id, right_state.id, right_state, *token) {
return true;
}
}
for token in right_state.terminal_entries.keys() {
if !left_state.terminal_entries.contains_key(token)
&& self.token_conflicts(
left_state.id,
right_state.id,
left_state.terminal_entries.keys(),
*token,
)
&& self.token_conflicts(left_state.id, right_state.id, left_state, *token)
{
return true;
}
@ -350,11 +341,11 @@ impl Minimizer<'_> {
false
}
fn token_conflicts<'b>(
fn token_conflicts(
&self,
left_id: ParseStateId,
right_id: ParseStateId,
existing_tokens: impl Iterator<Item = &'b Symbol>,
right_state: &ParseState,
new_token: Symbol,
) -> bool {
if new_token == Symbol::end_of_nonterminal_extra() {
@ -372,6 +363,10 @@ impl Minimizer<'_> {
return true;
}
if right_state.reserved_words.contains(&new_token) {
return false;
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if self
@ -388,23 +383,30 @@ impl Minimizer<'_> {
}
// Do not add a token if it conflicts with an existing token.
for token in existing_tokens {
if token.is_terminal()
&& !(self.syntax_grammar.word_token == Some(*token)
&& self.keywords.contains(&new_token))
&& !(self.syntax_grammar.word_token == Some(new_token)
&& self.keywords.contains(token))
&& (self
for token in right_state.terminal_entries.keys().copied() {
if !token.is_terminal() {
continue;
}
if self.syntax_grammar.word_token == Some(token) && self.keywords.contains(&new_token) {
continue;
}
if self.syntax_grammar.word_token == Some(new_token) && self.keywords.contains(&token) {
continue;
}
if self
.token_conflict_map
.does_conflict(new_token.index, token.index)
|| self
.token_conflict_map
.does_conflict(new_token.index, token.index)
|| self
.token_conflict_map
.does_match_same_string(new_token.index, token.index))
.does_match_same_string(new_token.index, token.index)
{
info!(
"split states {left_id} {right_id} - token {} conflicts with {}",
"split states {} {} - token {} conflicts with {}",
left_id,
right_id,
self.symbol_name(&new_token),
self.symbol_name(token),
self.symbol_name(&token),
);
return true;
}

View file

@ -16,6 +16,7 @@ use self::{
build_lex_table::build_lex_table,
build_parse_table::{build_parse_table, ParseStateInfo},
coincident_tokens::CoincidentTokenIndex,
item_set_builder::ParseItemSetBuilder,
minimize_parse_table::minimize_parse_table,
token_conflicts::TokenConflictMap,
};
@ -31,7 +32,6 @@ pub struct Tables {
pub parse_table: ParseTable,
pub main_lex_table: LexTable,
pub keyword_lex_table: LexTable,
pub word_token: Option<Symbol>,
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
}
@ -43,8 +43,15 @@ pub fn build_tables(
inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>,
) -> Result<Tables> {
let (mut parse_table, following_tokens, parse_state_info) =
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let following_tokens =
get_following_tokens(syntax_grammar, lexical_grammar, inlines, &item_set_builder);
let (mut parse_table, parse_state_info) = build_parse_table(
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
let keywords = identify_keywords(
@ -97,10 +104,50 @@ pub fn build_tables(
main_lex_table: lex_tables.main_lex_table,
keyword_lex_table: lex_tables.keyword_lex_table,
large_character_sets: lex_tables.large_character_sets,
word_token: syntax_grammar.word_token,
})
}
fn get_following_tokens(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) -> Vec<TokenSet> {
let mut result = vec![TokenSet::new(); lexical_grammar.variables.len()];
let productions = syntax_grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
let right_reserved_tokens = builder.reserved_first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
if let Some(reserved_tokens) = right_reserved_tokens {
result[left_token.index].insert_all_terminals(reserved_tokens);
}
}
}
}
}
for extra in &syntax_grammar.extra_symbols {
if extra.is_terminal() {
for entry in &mut result {
entry.insert(*extra);
}
result[extra.index] = all_tokens.clone();
}
}
result
}
fn populate_error_state(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
@ -414,9 +461,9 @@ fn report_state_info<'a>(
for (i, state) in parse_table.states.iter().enumerate() {
all_state_indices.insert(i);
let item_set = &parse_state_info[state.id];
for (item, _) in &item_set.1.entries {
if !item.is_augmented() {
symbols_with_state_indices[item.variable_index as usize]
for entry in &item_set.1.entries {
if !entry.item.is_augmented() {
symbols_with_state_indices[entry.item.variable_index as usize]
.1
.insert(i);
}