feat: add 'reserved word' construct
Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
parent 2a63077cac
commit 201b41cf11
31 changed files with 2367 additions and 1628 deletions
|
|
@@ -43,15 +43,17 @@ pub fn build_lex_table(
         let tokens = state
             .terminal_entries
             .keys()
+            .copied()
+            .chain(state.reserved_words.iter())
             .filter_map(|token| {
                 if token.is_terminal() {
-                    if keywords.contains(token) {
+                    if keywords.contains(&token) {
                         syntax_grammar.word_token
                     } else {
-                        Some(*token)
+                        Some(token)
                     }
                 } else if token.is_eof() {
-                    Some(*token)
+                    Some(token)
                 } else {
                     None
                 }
|
|
|
|||
|
|
@@ -10,13 +10,11 @@ use indexmap::{map::Entry, IndexMap};
 use rustc_hash::FxHasher;
 
 use super::{
-    item::{ParseItem, ParseItemSet, ParseItemSetCore},
+    item::{ParseItem, ParseItemSet, ParseItemSetCore, ParseItemSetEntry},
     item_set_builder::ParseItemSetBuilder,
 };
 use crate::{
-    grammars::{
-        InlinedProductionMap, LexicalGrammar, PrecedenceEntry, SyntaxGrammar, VariableType,
-    },
+    grammars::{LexicalGrammar, PrecedenceEntry, ReservedWordSetId, SyntaxGrammar, VariableType},
     node_types::VariableInfo,
     rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
     tables::{
|
|
@@ -67,6 +65,33 @@ struct ParseTableBuilder<'a> {
 }
 
 impl<'a> ParseTableBuilder<'a> {
+    fn new(
+        syntax_grammar: &'a SyntaxGrammar,
+        lexical_grammar: &'a LexicalGrammar,
+        item_set_builder: ParseItemSetBuilder<'a>,
+        variable_info: &'a [VariableInfo],
+    ) -> Self {
+        Self {
+            syntax_grammar,
+            lexical_grammar,
+            item_set_builder,
+            variable_info,
+            non_terminal_extra_states: Vec::new(),
+            state_ids_by_item_set: IndexMap::default(),
+            core_ids_by_core: HashMap::new(),
+            parse_state_info_by_id: Vec::new(),
+            parse_state_queue: VecDeque::new(),
+            actual_conflicts: syntax_grammar.expected_conflicts.iter().cloned().collect(),
+            parse_table: ParseTable {
+                states: Vec::new(),
+                symbols: Vec::new(),
+                external_lex_states: Vec::new(),
+                production_infos: Vec::new(),
+                max_aliased_production_length: 1,
+            },
+        }
+    }
+
     fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
         // Ensure that the empty alias sequence has index 0.
         self.parse_table
|
|
@@ -80,10 +105,13 @@ impl<'a> ParseTableBuilder<'a> {
         self.add_parse_state(
             &Vec::new(),
             &Vec::new(),
-            ParseItemSet::with(std::iter::once((
-                ParseItem::start(),
-                std::iter::once(Symbol::end()).collect(),
-            ))),
+            ParseItemSet {
+                entries: vec![ParseItemSetEntry {
+                    item: ParseItem::start(),
+                    lookaheads: std::iter::once(Symbol::end()).collect(),
+                    following_reserved_word_set: ReservedWordSetId::default(),
+                }],
+            },
         );
 
         // Compute the possible item sets for non-terminal extras.
|
|
@ -99,15 +127,14 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
non_terminal_extra_item_sets_by_first_terminal
|
||||
.entry(production.first_symbol().unwrap())
|
||||
.or_insert_with(ParseItemSet::default)
|
||||
.insert(
|
||||
ParseItem {
|
||||
variable_index: extra_non_terminal.index as u32,
|
||||
production,
|
||||
step_index: 1,
|
||||
has_preceding_inherited_fields: false,
|
||||
},
|
||||
&std::iter::once(Symbol::end_of_nonterminal_extra()).collect(),
|
||||
);
|
||||
.insert(ParseItem {
|
||||
variable_index: extra_non_terminal.index as u32,
|
||||
production,
|
||||
step_index: 1,
|
||||
has_preceding_inherited_fields: false,
|
||||
})
|
||||
.lookaheads
|
||||
.insert(Symbol::end_of_nonterminal_extra());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -176,6 +203,7 @@ impl<'a> ParseTableBuilder<'a> {
             external_lex_state_id: 0,
             terminal_entries: IndexMap::default(),
             nonterminal_entries: IndexMap::default(),
+            reserved_words: TokenSet::default(),
             core_id,
         });
         self.parse_state_queue.push_back(ParseStateQueueEntry {
|
|
@@ -202,13 +230,18 @@ impl<'a> ParseTableBuilder<'a> {
 
         // Each item in the item set contributes either a Shift action or a Reduce
         // action in this state.
-        for (item, lookaheads) in &item_set.entries {
+        for ParseItemSetEntry {
+            item,
+            lookaheads,
+            following_reserved_word_set: reserved_lookaheads,
+        } in &item_set.entries
+        {
            // If the item is unfinished, then this state has a transition for the item's
            // next symbol. Advance the item to its next step and insert the resulting
            // item into the successor item set.
            if let Some(next_symbol) = item.symbol() {
                let mut successor = item.successor();
-                if next_symbol.is_non_terminal() {
+                let successor_set = if next_symbol.is_non_terminal() {
                    let variable = &self.syntax_grammar.variables[next_symbol.index];
 
                    // Keep track of where auxiliary non-terminals (repeat symbols) are
|
|
@@ -237,13 +270,16 @@ impl<'a> ParseTableBuilder<'a> {
                    non_terminal_successors
                        .entry(next_symbol)
                        .or_insert_with(ParseItemSet::default)
-                        .insert(successor, lookaheads);
                } else {
                    terminal_successors
                        .entry(next_symbol)
                        .or_insert_with(ParseItemSet::default)
-                        .insert(successor, lookaheads);
-                }
+                };
+                let successor_entry = successor_set.insert(successor);
+                successor_entry.lookaheads.insert_all(lookaheads);
+                successor_entry.following_reserved_word_set = successor_entry
+                    .following_reserved_word_set
+                    .max(*reserved_lookaheads);
            }
            // If the item is finished, then add a Reduce action to this state based
            // on this item.
|
|
@ -370,7 +406,7 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
)?;
|
||||
}
|
||||
|
||||
// Finally, add actions for the grammar's `extra` symbols.
|
||||
// Add actions for the grammar's `extra` symbols.
|
||||
let state = &mut self.parse_table.states[state_id];
|
||||
let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra();
|
||||
|
||||
|
|
@ -382,7 +418,7 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
let parent_symbols = item_set
|
||||
.entries
|
||||
.iter()
|
||||
.filter_map(|(item, _)| {
|
||||
.filter_map(|ParseItemSetEntry { item, .. }| {
|
||||
if !item.is_augmented() && item.step_index > 0 {
|
||||
Some(item.variable_index)
|
||||
} else {
|
||||
|
|
@@ -436,6 +472,30 @@ impl<'a> ParseTableBuilder<'a> {
            }
        }
 
+        if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
+            let reserved_word_set_id = item_set
+                .entries
+                .iter()
+                .filter_map(|entry| {
+                    if let Some(next_step) = entry.item.step() {
+                        if next_step.symbol == keyword_capture_token {
+                            Some(next_step.reserved_word_set_id)
+                        } else {
+                            None
+                        }
+                    } else if entry.lookaheads.contains(&keyword_capture_token) {
+                        Some(entry.following_reserved_word_set)
+                    } else {
+                        None
+                    }
+                })
+                .max();
+            if let Some(reserved_word_set_id) = reserved_word_set_id {
+                state.reserved_words =
+                    self.syntax_grammar.reserved_word_sets[reserved_word_set_id.0].clone();
+            }
+        }
+
        Ok(())
    }
|
||||
|
|
@ -462,7 +522,10 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
let mut considered_associativity = false;
|
||||
let mut shift_precedence = Vec::<(&Precedence, Symbol)>::new();
|
||||
let mut conflicting_items = HashSet::new();
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
for ParseItemSetEntry {
|
||||
item, lookaheads, ..
|
||||
} in &item_set.entries
|
||||
{
|
||||
if let Some(step) = item.step() {
|
||||
if item.step_index > 0
|
||||
&& self
|
||||
|
|
@ -836,7 +899,7 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
let parent_symbols = item_set
|
||||
.entries
|
||||
.iter()
|
||||
.filter_map(|(item, _)| {
|
||||
.filter_map(|ParseItemSetEntry { item, .. }| {
|
||||
let variable_index = item.variable_index as usize;
|
||||
if item.symbol() == Some(symbol)
|
||||
&& !self.syntax_grammar.variables[variable_index].is_auxiliary()
|
||||
|
|
@ -931,77 +994,17 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
fn populate_following_tokens(
|
||||
result: &mut [TokenSet],
|
||||
grammar: &SyntaxGrammar,
|
||||
inlines: &InlinedProductionMap,
|
||||
builder: &ParseItemSetBuilder,
|
||||
) {
|
||||
let productions = grammar
|
||||
.variables
|
||||
.iter()
|
||||
.flat_map(|v| &v.productions)
|
||||
.chain(&inlines.productions);
|
||||
let all_tokens = (0..result.len())
|
||||
.map(Symbol::terminal)
|
||||
.collect::<TokenSet>();
|
||||
for production in productions {
|
||||
for i in 1..production.steps.len() {
|
||||
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
|
||||
let right_tokens = builder.first_set(&production.steps[i].symbol);
|
||||
for left_token in left_tokens.iter() {
|
||||
if left_token.is_terminal() {
|
||||
result[left_token.index].insert_all_terminals(right_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for extra in &grammar.extra_symbols {
|
||||
if extra.is_terminal() {
|
||||
for entry in result.iter_mut() {
|
||||
entry.insert(*extra);
|
||||
}
|
||||
result[extra.index].clone_from(&all_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_parse_table<'a>(
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
item_set_builder: ParseItemSetBuilder<'a>,
|
||||
variable_info: &'a [VariableInfo],
|
||||
) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
|
||||
let actual_conflicts = syntax_grammar.expected_conflicts.iter().cloned().collect();
|
||||
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
|
||||
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
|
||||
populate_following_tokens(
|
||||
&mut following_tokens,
|
||||
syntax_grammar,
|
||||
inlines,
|
||||
&item_set_builder,
|
||||
);
|
||||
|
||||
let (table, item_sets) = ParseTableBuilder {
|
||||
) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
|
||||
ParseTableBuilder::new(
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
item_set_builder,
|
||||
variable_info,
|
||||
non_terminal_extra_states: Vec::new(),
|
||||
actual_conflicts,
|
||||
state_ids_by_item_set: IndexMap::default(),
|
||||
core_ids_by_core: HashMap::new(),
|
||||
parse_state_info_by_id: Vec::new(),
|
||||
parse_state_queue: VecDeque::new(),
|
||||
parse_table: ParseTable {
|
||||
states: Vec::new(),
|
||||
symbols: Vec::new(),
|
||||
external_lex_states: Vec::new(),
|
||||
production_infos: Vec::new(),
|
||||
max_aliased_production_length: 1,
|
||||
},
|
||||
}
|
||||
.build()?;
|
||||
|
||||
Ok((table, following_tokens, item_sets))
|
||||
)
|
||||
.build()
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -7,7 +7,10 @@ use std::{
 use lazy_static::lazy_static;
 
 use crate::{
-    grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
+    grammars::{
+        LexicalGrammar, Production, ProductionStep, ReservedWordSetId, SyntaxGrammar,
+        NO_RESERVED_WORDS,
+    },
     rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet},
 };
 
|
||||
|
|
@@ -23,6 +26,7 @@ lazy_static! {
            associativity: None,
            alias: None,
            field_name: None,
+            reserved_word_set_id: NO_RESERVED_WORDS,
        }],
    };
 }
|
|
@@ -58,7 +62,14 @@ pub struct ParseItem<'a> {
 /// to a state in the final parse table.
 #[derive(Clone, Debug, PartialEq, Eq, Default)]
 pub struct ParseItemSet<'a> {
-    pub entries: Vec<(ParseItem<'a>, TokenSet)>,
+    pub entries: Vec<ParseItemSetEntry<'a>>,
 }
 
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct ParseItemSetEntry<'a> {
+    pub item: ParseItem<'a>,
+    pub lookaheads: TokenSet,
+    pub following_reserved_word_set: ReservedWordSetId,
+}
+
 /// A [`ParseItemSetCore`] is like a [`ParseItemSet`], but without the lookahead
|
|
@@ -152,30 +163,26 @@ impl<'a> ParseItem<'a> {
 }
 
 impl<'a> ParseItemSet<'a> {
-    pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, TokenSet)>) -> Self {
-        let mut result = Self::default();
-        for (item, lookaheads) in elements {
-            result.insert(item, &lookaheads);
-        }
-        result
-    }
-
-    pub fn insert(&mut self, item: ParseItem<'a>, lookaheads: &TokenSet) -> &mut TokenSet {
-        match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) {
+    pub fn insert(&mut self, item: ParseItem<'a>) -> &mut ParseItemSetEntry<'a> {
+        match self.entries.binary_search_by(|e| e.item.cmp(&item)) {
            Err(i) => {
-                self.entries.insert(i, (item, lookaheads.clone()));
-                &mut self.entries[i].1
-            }
-            Ok(i) => {
-                self.entries[i].1.insert_all(lookaheads);
-                &mut self.entries[i].1
+                self.entries.insert(
+                    i,
+                    ParseItemSetEntry {
+                        item,
+                        lookaheads: TokenSet::new(),
+                        following_reserved_word_set: ReservedWordSetId::default(),
+                    },
+                );
+                &mut self.entries[i]
            }
+            Ok(i) => &mut self.entries[i],
        }
    }
 
    pub fn core(&self) -> ParseItemSetCore<'a> {
        ParseItemSetCore {
-            entries: self.entries.iter().map(|e| e.0).collect(),
+            entries: self.entries.iter().map(|e| e.item).collect(),
        }
    }
 }
|
|
@ -195,14 +202,21 @@ impl fmt::Display for ParseItemDisplay<'_> {
|
|||
for (i, step) in self.0.production.steps.iter().enumerate() {
|
||||
if i == self.0.step_index as usize {
|
||||
write!(f, " •")?;
|
||||
if let Some(associativity) = step.associativity {
|
||||
if !step.precedence.is_none()
|
||||
|| step.associativity.is_some()
|
||||
|| step.reserved_word_set_id != ReservedWordSetId::default()
|
||||
{
|
||||
write!(f, " (")?;
|
||||
if step.precedence.is_none() {
|
||||
write!(f, " ({associativity:?})")?;
|
||||
} else {
|
||||
write!(f, " ({} {associativity:?})", step.precedence)?;
|
||||
write!(f, " {}", step.precedence)?;
|
||||
}
|
||||
} else if !step.precedence.is_none() {
|
||||
write!(f, " ({})", step.precedence)?;
|
||||
if let Some(associativity) = step.associativity {
|
||||
write!(f, " {associativity:?}")?;
|
||||
}
|
||||
if step.reserved_word_set_id != ReservedWordSetId::default() {
|
||||
write!(f, "reserved: {}", step.reserved_word_set_id)?;
|
||||
}
|
||||
write!(f, " )")?;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -270,13 +284,21 @@ impl fmt::Display for TokenSetDisplay<'_> {
|
|||
|
||||
impl fmt::Display for ParseItemSetDisplay<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
for (item, lookaheads) in &self.0.entries {
|
||||
writeln!(
|
||||
for entry in &self.0.entries {
|
||||
write!(
|
||||
f,
|
||||
"{}\t{}",
|
||||
ParseItemDisplay(item, self.1, self.2),
|
||||
TokenSetDisplay(lookaheads, self.1, self.2)
|
||||
ParseItemDisplay(&entry.item, self.1, self.2),
|
||||
TokenSetDisplay(&entry.lookaheads, self.1, self.2),
|
||||
)?;
|
||||
if entry.following_reserved_word_set != ReservedWordSetId::default() {
|
||||
write!(
|
||||
f,
|
||||
"\treserved word set: {}",
|
||||
entry.following_reserved_word_set
|
||||
)?;
|
||||
}
|
||||
writeln!(f)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -296,7 +318,7 @@ impl Hash for ParseItem<'_> {
|
|||
// this item, unless any of the following are true:
|
||||
// * the children have fields
|
||||
// * the children have aliases
|
||||
// * the children are hidden and
|
||||
// * the children are hidden and represent rules that have fields.
|
||||
// See the docs for `has_preceding_inherited_fields`.
|
||||
for step in &self.production.steps[0..self.step_index as usize] {
|
||||
step.alias.hash(hasher);
|
||||
|
|
@ -399,9 +421,10 @@ impl Eq for ParseItem<'_> {}
|
|||
impl Hash for ParseItemSet<'_> {
|
||||
fn hash<H: Hasher>(&self, hasher: &mut H) {
|
||||
hasher.write_usize(self.entries.len());
|
||||
for (item, lookaheads) in &self.entries {
|
||||
item.hash(hasher);
|
||||
lookaheads.hash(hasher);
|
||||
for entry in &self.entries {
|
||||
entry.item.hash(hasher);
|
||||
entry.lookaheads.hash(hasher);
|
||||
entry.following_reserved_word_set.hash(hasher);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@ use std::{
|
|||
fmt,
|
||||
};
|
||||
|
||||
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay};
|
||||
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, ParseItemSetEntry, TokenSetDisplay};
|
||||
use crate::{
|
||||
grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar},
|
||||
grammars::{InlinedProductionMap, LexicalGrammar, ReservedWordSetId, SyntaxGrammar},
|
||||
rules::{Symbol, SymbolType, TokenSet},
|
||||
};
|
||||
|
||||
|
|
@@ -15,9 +15,10 @@ struct TransitiveClosureAddition<'a> {
    info: FollowSetInfo,
 }
 
-#[derive(Clone, Debug, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 struct FollowSetInfo {
    lookaheads: TokenSet,
+    reserved_lookaheads: ReservedWordSetId,
    propagates_lookaheads: bool,
 }
|
||||
|
|
@ -25,6 +26,7 @@ pub struct ParseItemSetBuilder<'a> {
|
|||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
first_sets: HashMap<Symbol, TokenSet>,
|
||||
reserved_first_sets: HashMap<Symbol, ReservedWordSetId>,
|
||||
last_sets: HashMap<Symbol, TokenSet>,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
|
||||
|
|
@ -46,6 +48,7 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
first_sets: HashMap::new(),
|
||||
reserved_first_sets: HashMap::new(),
|
||||
last_sets: HashMap::new(),
|
||||
inlines,
|
||||
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
|
||||
|
|
@@ -54,8 +57,7 @@ impl<'a> ParseItemSetBuilder<'a> {
        // For each grammar symbol, populate the FIRST and LAST sets: the set of
        // terminals that appear at the beginning and end of that symbol's productions,
        // respectively.
-        //
-        // For a terminal symbol, the FIRST and LAST set just consists of the
+        // For a terminal symbol, the FIRST and LAST sets just consist of the
        // terminal itself.
        for i in 0..lexical_grammar.variables.len() {
            let symbol = Symbol::terminal(i);
|
|
@ -63,6 +65,9 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
set.insert(symbol);
|
||||
result.first_sets.insert(symbol, set.clone());
|
||||
result.last_sets.insert(symbol, set);
|
||||
result
|
||||
.reserved_first_sets
|
||||
.insert(symbol, ReservedWordSetId::default());
|
||||
}
|
||||
|
||||
for i in 0..syntax_grammar.external_tokens.len() {
|
||||
|
|
@@ -71,12 +76,15 @@ impl<'a> ParseItemSetBuilder<'a> {
            set.insert(symbol);
            result.first_sets.insert(symbol, set.clone());
            result.last_sets.insert(symbol, set);
+            result
+                .reserved_first_sets
+                .insert(symbol, ReservedWordSetId::default());
        }
 
-        // The FIRST set of a non-terminal `i` is the union of the following sets:
-        // * the set of all terminals that appear at the beginnings of i's productions
-        // * the FIRST sets of all the non-terminals that appear at the beginnings of i's
-        //   productions
+        // The FIRST set of a non-terminal `i` is the union of the FIRST sets
+        // of all the symbols that appear at the beginnings of i's productions. Some
+        // of these symbols may themselves be non-terminals, so this is a recursive
+        // definition.
        //
        // Rather than computing these sets using recursion, we use an explicit stack
        // called `symbols_to_process`.
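To make the idea in that comment concrete, here is a toy illustration (not part of this diff, and not tree-sitter's Rust code) of computing a FIRST set with an explicit worklist instead of recursion; the grammar shape and names are made up:

```js
// FIRST(start): every terminal that can begin an expansion of `start`.
const productions = {
  expression: [['term', '+', 'expression'], ['term']],
  term: [['number'], ['(', 'expression', ')']],
};
const isTerminal = (symbol) => !(symbol in productions);

function firstSet(start) {
  const first = new Set();
  const processed = new Set([start]);
  const stack = [start];
  while (stack.length > 0) {
    const symbol = stack.pop();
    for (const steps of productions[symbol]) {
      const head = steps[0];
      if (head === undefined) continue;
      if (isTerminal(head)) {
        first.add(head);
      } else if (!processed.has(head)) {
        // Only visit each non-terminal once; the stack replaces recursion.
        processed.add(head);
        stack.push(head);
      }
    }
  }
  return first;
}

firstSet('expression'); // Set { 'number', '(' }
```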
|
||||
|
|
@ -84,37 +92,36 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
let mut processed_non_terminals = HashSet::new();
|
||||
for i in 0..syntax_grammar.variables.len() {
|
||||
let symbol = Symbol::non_terminal(i);
|
||||
let first_set = result.first_sets.entry(symbol).or_default();
|
||||
let reserved_first_set = result.reserved_first_sets.entry(symbol).or_default();
|
||||
|
||||
let first_set = result
|
||||
.first_sets
|
||||
.entry(symbol)
|
||||
.or_insert_with(TokenSet::new);
|
||||
processed_non_terminals.clear();
|
||||
symbols_to_process.clear();
|
||||
symbols_to_process.push(symbol);
|
||||
while let Some(current_symbol) = symbols_to_process.pop() {
|
||||
if current_symbol.is_terminal() || current_symbol.is_external() {
|
||||
first_set.insert(current_symbol);
|
||||
} else if processed_non_terminals.insert(current_symbol) {
|
||||
for production in &syntax_grammar.variables[current_symbol.index].productions {
|
||||
if let Some(step) = production.steps.first() {
|
||||
while let Some(sym) = symbols_to_process.pop() {
|
||||
for production in &syntax_grammar.variables[sym.index].productions {
|
||||
if let Some(step) = production.steps.first() {
|
||||
if step.symbol.is_terminal() || step.symbol.is_external() {
|
||||
first_set.insert(step.symbol);
|
||||
} else if processed_non_terminals.insert(step.symbol) {
|
||||
symbols_to_process.push(step.symbol);
|
||||
}
|
||||
*reserved_first_set = (*reserved_first_set).max(step.reserved_word_set_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The LAST set is defined in a similar way to the FIRST set.
|
||||
let last_set = result.last_sets.entry(symbol).or_insert_with(TokenSet::new);
|
||||
let last_set = result.last_sets.entry(symbol).or_default();
|
||||
processed_non_terminals.clear();
|
||||
symbols_to_process.clear();
|
||||
symbols_to_process.push(symbol);
|
||||
while let Some(current_symbol) = symbols_to_process.pop() {
|
||||
if current_symbol.is_terminal() || current_symbol.is_external() {
|
||||
last_set.insert(current_symbol);
|
||||
} else if processed_non_terminals.insert(current_symbol) {
|
||||
for production in &syntax_grammar.variables[current_symbol.index].productions {
|
||||
if let Some(step) = production.steps.last() {
|
||||
while let Some(sym) = symbols_to_process.pop() {
|
||||
for production in &syntax_grammar.variables[sym.index].productions {
|
||||
if let Some(step) = production.steps.last() {
|
||||
if step.symbol.is_terminal() || step.symbol.is_external() {
|
||||
last_set.insert(step.symbol);
|
||||
} else if processed_non_terminals.insert(step.symbol) {
|
||||
symbols_to_process.push(step.symbol);
|
||||
}
|
||||
}
|
||||
|
|
@ -124,67 +131,75 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
|
||||
// To compute an item set's transitive closure, we find each item in the set
|
||||
// whose next symbol is a non-terminal, and we add new items to the set for
|
||||
// each of that symbols' productions. These productions might themselves begin
|
||||
// each of that symbol's productions. These productions might themselves begin
|
||||
// with non-terminals, so the process continues recursively. In this process,
|
||||
// the total set of entries that get added depends only on two things:
|
||||
// * the set of non-terminal symbols that occur at each item's current position
|
||||
// * the set of terminals that occurs after each of these non-terminal symbols
|
||||
//
|
||||
// * the non-terminal symbol that occurs next in each item
|
||||
//
|
||||
// * the set of terminals that can follow that non-terminal symbol in the item
|
||||
//
|
||||
// So we can avoid a lot of duplicated recursive work by precomputing, for each
|
||||
// non-terminal symbol `i`, a final list of *additions* that must be made to an
|
||||
// item set when `i` occurs as the next symbol in one of its core items. The
|
||||
// structure of an *addition* is as follows:
|
||||
// * `item` - the new item that must be added as part of the expansion of `i`
|
||||
// * `lookaheads` - lookahead tokens that can always come after that item in the expansion
|
||||
// of `i`
|
||||
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
|
||||
// *end* of the expansion of `i`, so that i's own current lookahead tokens can occur
|
||||
// after `item`.
|
||||
// item set when symbol `i` occurs as the next symbol in one of its core items.
|
||||
// The structure of a precomputed *addition* is as follows:
|
||||
//
|
||||
// Again, rather than computing these additions recursively, we use an explicit
|
||||
// stack called `entries_to_process`.
|
||||
// * `item` - the new item that must be added as part of the expansion of the symbol `i`.
|
||||
//
|
||||
// * `lookaheads` - the set of possible lookahead tokens that can always come after `item`
|
||||
// in an expansion of symbol `i`.
|
||||
//
|
||||
// * `reserved_lookaheads` - the set of reserved lookahead tokens that can
|
||||
// always come after `item` in the expansion of symbol `i`.
|
||||
//
|
||||
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can occur at the
|
||||
// *end* of the expansion of symbol `i`, so that i's own current lookahead tokens can
|
||||
// occur after `item`.
|
||||
//
|
||||
// Rather than computing these additions recursively, we use an explicit stack.
|
||||
let empty_lookaheads = TokenSet::new();
|
||||
let mut stack = Vec::new();
|
||||
let mut follow_set_info_by_non_terminal = HashMap::<usize, FollowSetInfo>::new();
|
||||
for i in 0..syntax_grammar.variables.len() {
|
||||
let empty_lookaheads = TokenSet::new();
|
||||
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
|
||||
|
||||
// First, build up a map whose keys are all of the non-terminals that can
|
||||
// appear at the beginning of non-terminal `i`, and whose values store
|
||||
// information about the tokens that can follow each non-terminal.
|
||||
let mut follow_set_info_by_non_terminal = HashMap::new();
|
||||
while let Some(entry) = entries_to_process.pop() {
|
||||
let (variable_index, lookaheads, propagates_lookaheads) = entry;
|
||||
let existing_info = follow_set_info_by_non_terminal
|
||||
.entry(variable_index)
|
||||
.or_insert_with(|| FollowSetInfo {
|
||||
lookaheads: TokenSet::new(),
|
||||
propagates_lookaheads: false,
|
||||
});
|
||||
|
||||
let did_add_follow_set_info;
|
||||
if propagates_lookaheads {
|
||||
did_add_follow_set_info = !existing_info.propagates_lookaheads;
|
||||
existing_info.propagates_lookaheads = true;
|
||||
} else {
|
||||
did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
|
||||
// information about the tokens that can follow those non-terminals.
|
||||
stack.clear();
|
||||
stack.push((i, &empty_lookaheads, ReservedWordSetId::default(), true));
|
||||
follow_set_info_by_non_terminal.clear();
|
||||
while let Some((sym_ix, lookaheads, reserved_word_set_id, propagates_lookaheads)) =
|
||||
stack.pop()
|
||||
{
|
||||
let mut did_add = false;
|
||||
let info = follow_set_info_by_non_terminal.entry(sym_ix).or_default();
|
||||
did_add |= info.lookaheads.insert_all(lookaheads);
|
||||
if reserved_word_set_id > info.reserved_lookaheads {
|
||||
info.reserved_lookaheads = reserved_word_set_id;
|
||||
did_add = true;
|
||||
}
|
||||
did_add |= propagates_lookaheads && !info.propagates_lookaheads;
|
||||
info.propagates_lookaheads |= propagates_lookaheads;
|
||||
if !did_add {
|
||||
continue;
|
||||
}
|
||||
|
||||
if did_add_follow_set_info {
|
||||
for production in &syntax_grammar.variables[variable_index].productions {
|
||||
if let Some(symbol) = production.first_symbol() {
|
||||
if symbol.is_non_terminal() {
|
||||
if production.steps.len() == 1 {
|
||||
entries_to_process.push((
|
||||
symbol.index,
|
||||
lookaheads,
|
||||
propagates_lookaheads,
|
||||
));
|
||||
} else {
|
||||
entries_to_process.push((
|
||||
symbol.index,
|
||||
&result.first_sets[&production.steps[1].symbol],
|
||||
false,
|
||||
));
|
||||
}
|
||||
for production in &syntax_grammar.variables[sym_ix].productions {
|
||||
if let Some(symbol) = production.first_symbol() {
|
||||
if symbol.is_non_terminal() {
|
||||
if let Some(next_step) = production.steps.get(1) {
|
||||
stack.push((
|
||||
symbol.index,
|
||||
&result.first_sets[&next_step.symbol],
|
||||
result.reserved_first_sets[&next_step.symbol],
|
||||
false,
|
||||
));
|
||||
} else {
|
||||
stack.push((
|
||||
symbol.index,
|
||||
lookaheads,
|
||||
reserved_word_set_id,
|
||||
propagates_lookaheads,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -194,7 +209,7 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
// Store all of those non-terminals' productions, along with their associated
|
||||
// lookahead info, as *additions* associated with non-terminal `i`.
|
||||
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
|
||||
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
|
||||
for (&variable_index, follow_set_info) in &follow_set_info_by_non_terminal {
|
||||
let variable = &syntax_grammar.variables[variable_index];
|
||||
let non_terminal = Symbol::non_terminal(variable_index);
|
||||
let variable_index = variable_index as u32;
|
||||
|
|
@ -239,20 +254,23 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
|
||||
pub fn transitive_closure(&self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
|
||||
let mut result = ParseItemSet::default();
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
for entry in &item_set.entries {
|
||||
if let Some(productions) = self
|
||||
.inlines
|
||||
.inlined_productions(item.production, item.step_index)
|
||||
.inlined_productions(entry.item.production, entry.item.step_index)
|
||||
{
|
||||
for production in productions {
|
||||
self.add_item(
|
||||
&mut result,
|
||||
item.substitute_production(production),
|
||||
lookaheads,
|
||||
&ParseItemSetEntry {
|
||||
item: entry.item.substitute_production(production),
|
||||
lookaheads: entry.lookaheads.clone(),
|
||||
following_reserved_word_set: entry.following_reserved_word_set,
|
||||
},
|
||||
);
|
||||
}
|
||||
} else {
|
||||
self.add_item(&mut result, *item, lookaheads);
|
||||
self.add_item(&mut result, entry);
|
||||
}
|
||||
}
|
||||
result
|
||||
|
|
@ -262,30 +280,64 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
&self.first_sets[symbol]
|
||||
}
|
||||
|
||||
pub fn reserved_first_set(&self, symbol: &Symbol) -> Option<&TokenSet> {
|
||||
let id = *self.reserved_first_sets.get(symbol)?;
|
||||
Some(&self.syntax_grammar.reserved_word_sets[id.0])
|
||||
}
|
||||
|
||||
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
|
||||
&self.last_sets[symbol]
|
||||
}
|
||||
|
||||
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
|
||||
if let Some(step) = item.step() {
|
||||
fn add_item(&self, set: &mut ParseItemSet<'a>, entry: &ParseItemSetEntry<'a>) {
|
||||
if let Some(step) = entry.item.step() {
|
||||
if step.symbol.is_non_terminal() {
|
||||
let next_step = item.successor().step();
|
||||
let next_step = entry.item.successor().step();
|
||||
|
||||
// Determine which tokens can follow this non-terminal.
|
||||
let following_tokens = next_step.map_or(lookaheads, |next_step| {
|
||||
self.first_sets.get(&next_step.symbol).unwrap()
|
||||
});
|
||||
let (following_tokens, following_reserved_tokens) =
|
||||
if let Some(next_step) = next_step {
|
||||
(
|
||||
self.first_sets.get(&next_step.symbol).unwrap(),
|
||||
*self.reserved_first_sets.get(&next_step.symbol).unwrap(),
|
||||
)
|
||||
} else {
|
||||
(&entry.lookaheads, entry.following_reserved_word_set)
|
||||
};
|
||||
|
||||
// Use the pre-computed *additions* to expand the non-terminal.
|
||||
for addition in &self.transitive_closure_additions[step.symbol.index] {
|
||||
let lookaheads = set.insert(addition.item, &addition.info.lookaheads);
|
||||
let entry = set.insert(addition.item);
|
||||
entry.lookaheads.insert_all(&addition.info.lookaheads);
|
||||
|
||||
if let Some(word_token) = self.syntax_grammar.word_token {
|
||||
if addition.info.lookaheads.contains(&word_token) {
|
||||
entry.following_reserved_word_set = entry
|
||||
.following_reserved_word_set
|
||||
.max(addition.info.reserved_lookaheads);
|
||||
}
|
||||
}
|
||||
|
||||
if addition.info.propagates_lookaheads {
|
||||
lookaheads.insert_all(following_tokens);
|
||||
entry.lookaheads.insert_all(following_tokens);
|
||||
|
||||
if let Some(word_token) = self.syntax_grammar.word_token {
|
||||
if following_tokens.contains(&word_token) {
|
||||
entry.following_reserved_word_set = entry
|
||||
.following_reserved_word_set
|
||||
.max(following_reserved_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
set.insert(item, lookaheads);
|
||||
|
||||
let e = set.insert(entry.item);
|
||||
e.lookaheads.insert_all(&entry.lookaheads);
|
||||
e.following_reserved_word_set = e
|
||||
.following_reserved_word_set
|
||||
.max(entry.following_reserved_word_set);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -170,17 +170,12 @@ impl Minimizer<'_> {
|
|||
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
|
||||
for state_ids in &state_ids_by_group_id {
|
||||
// Initialize the new state based on the first old state in the group.
|
||||
let mut parse_state = ParseState::default();
|
||||
mem::swap(&mut parse_state, &mut self.parse_table.states[state_ids[0]]);
|
||||
let mut parse_state = mem::take(&mut self.parse_table.states[state_ids[0]]);
|
||||
|
||||
// Extend the new state with all of the actions from the other old states
|
||||
// in the group.
|
||||
for state_id in &state_ids[1..] {
|
||||
let mut other_parse_state = ParseState::default();
|
||||
mem::swap(
|
||||
&mut other_parse_state,
|
||||
&mut self.parse_table.states[*state_id],
|
||||
);
|
||||
let other_parse_state = mem::take(&mut self.parse_table.states[*state_id]);
|
||||
|
||||
parse_state
|
||||
.terminal_entries
|
||||
|
|
@ -188,6 +183,12 @@ impl Minimizer<'_> {
|
|||
parse_state
|
||||
.nonterminal_entries
|
||||
.extend(other_parse_state.nonterminal_entries);
|
||||
parse_state
|
||||
.reserved_words
|
||||
.insert_all(&other_parse_state.reserved_words);
|
||||
for symbol in parse_state.terminal_entries.keys() {
|
||||
parse_state.reserved_words.remove(symbol);
|
||||
}
|
||||
}
|
||||
|
||||
// Update the new state's outgoing references using the new grouping.
|
||||
|
|
@ -216,24 +217,14 @@ impl Minimizer<'_> {
|
|||
) {
|
||||
return true;
|
||||
}
|
||||
} else if self.token_conflicts(
|
||||
left_state.id,
|
||||
right_state.id,
|
||||
right_state.terminal_entries.keys(),
|
||||
*token,
|
||||
) {
|
||||
} else if self.token_conflicts(left_state.id, right_state.id, right_state, *token) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
for token in right_state.terminal_entries.keys() {
|
||||
if !left_state.terminal_entries.contains_key(token)
|
||||
&& self.token_conflicts(
|
||||
left_state.id,
|
||||
right_state.id,
|
||||
left_state.terminal_entries.keys(),
|
||||
*token,
|
||||
)
|
||||
&& self.token_conflicts(left_state.id, right_state.id, left_state, *token)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
|
@ -350,11 +341,11 @@ impl Minimizer<'_> {
|
|||
false
|
||||
}
|
||||
|
||||
fn token_conflicts<'b>(
|
||||
fn token_conflicts(
|
||||
&self,
|
||||
left_id: ParseStateId,
|
||||
right_id: ParseStateId,
|
||||
existing_tokens: impl Iterator<Item = &'b Symbol>,
|
||||
right_state: &ParseState,
|
||||
new_token: Symbol,
|
||||
) -> bool {
|
||||
if new_token == Symbol::end_of_nonterminal_extra() {
|
||||
|
|
@ -372,6 +363,10 @@ impl Minimizer<'_> {
|
|||
return true;
|
||||
}
|
||||
|
||||
if right_state.reserved_words.contains(&new_token) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Do not add tokens which are both internal and external. Their validity could
|
||||
// influence the behavior of the external scanner.
|
||||
if self
|
||||
|
|
@ -388,23 +383,30 @@ impl Minimizer<'_> {
|
|||
}
|
||||
|
||||
// Do not add a token if it conflicts with an existing token.
|
||||
for token in existing_tokens {
|
||||
if token.is_terminal()
|
||||
&& !(self.syntax_grammar.word_token == Some(*token)
|
||||
&& self.keywords.contains(&new_token))
|
||||
&& !(self.syntax_grammar.word_token == Some(new_token)
|
||||
&& self.keywords.contains(token))
|
||||
&& (self
|
||||
for token in right_state.terminal_entries.keys().copied() {
|
||||
if !token.is_terminal() {
|
||||
continue;
|
||||
}
|
||||
if self.syntax_grammar.word_token == Some(token) && self.keywords.contains(&new_token) {
|
||||
continue;
|
||||
}
|
||||
if self.syntax_grammar.word_token == Some(new_token) && self.keywords.contains(&token) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if self
|
||||
.token_conflict_map
|
||||
.does_conflict(new_token.index, token.index)
|
||||
|| self
|
||||
.token_conflict_map
|
||||
.does_conflict(new_token.index, token.index)
|
||||
|| self
|
||||
.token_conflict_map
|
||||
.does_match_same_string(new_token.index, token.index))
|
||||
.does_match_same_string(new_token.index, token.index)
|
||||
{
|
||||
info!(
|
||||
"split states {left_id} {right_id} - token {} conflicts with {}",
|
||||
"split states {} {} - token {} conflicts with {}",
|
||||
left_id,
|
||||
right_id,
|
||||
self.symbol_name(&new_token),
|
||||
self.symbol_name(token),
|
||||
self.symbol_name(&token),
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ use self::{
|
|||
build_lex_table::build_lex_table,
|
||||
build_parse_table::{build_parse_table, ParseStateInfo},
|
||||
coincident_tokens::CoincidentTokenIndex,
|
||||
item_set_builder::ParseItemSetBuilder,
|
||||
minimize_parse_table::minimize_parse_table,
|
||||
token_conflicts::TokenConflictMap,
|
||||
};
|
||||
|
|
@ -31,7 +32,6 @@ pub struct Tables {
|
|||
pub parse_table: ParseTable,
|
||||
pub main_lex_table: LexTable,
|
||||
pub keyword_lex_table: LexTable,
|
||||
pub word_token: Option<Symbol>,
|
||||
pub large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
|
||||
}
|
||||
|
||||
|
|
@ -43,8 +43,15 @@ pub fn build_tables(
|
|||
inlines: &InlinedProductionMap,
|
||||
report_symbol_name: Option<&str>,
|
||||
) -> Result<Tables> {
|
||||
let (mut parse_table, following_tokens, parse_state_info) =
|
||||
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
|
||||
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
|
||||
let following_tokens =
|
||||
get_following_tokens(syntax_grammar, lexical_grammar, inlines, &item_set_builder);
|
||||
let (mut parse_table, parse_state_info) = build_parse_table(
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
item_set_builder,
|
||||
variable_info,
|
||||
)?;
|
||||
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
|
||||
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
|
||||
let keywords = identify_keywords(
|
||||
|
|
@ -97,10 +104,50 @@ pub fn build_tables(
|
|||
main_lex_table: lex_tables.main_lex_table,
|
||||
keyword_lex_table: lex_tables.keyword_lex_table,
|
||||
large_character_sets: lex_tables.large_character_sets,
|
||||
word_token: syntax_grammar.word_token,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_following_tokens(
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
inlines: &InlinedProductionMap,
|
||||
builder: &ParseItemSetBuilder,
|
||||
) -> Vec<TokenSet> {
|
||||
let mut result = vec![TokenSet::new(); lexical_grammar.variables.len()];
|
||||
let productions = syntax_grammar
|
||||
.variables
|
||||
.iter()
|
||||
.flat_map(|v| &v.productions)
|
||||
.chain(&inlines.productions);
|
||||
let all_tokens = (0..result.len())
|
||||
.map(Symbol::terminal)
|
||||
.collect::<TokenSet>();
|
||||
for production in productions {
|
||||
for i in 1..production.steps.len() {
|
||||
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
|
||||
let right_tokens = builder.first_set(&production.steps[i].symbol);
|
||||
let right_reserved_tokens = builder.reserved_first_set(&production.steps[i].symbol);
|
||||
for left_token in left_tokens.iter() {
|
||||
if left_token.is_terminal() {
|
||||
result[left_token.index].insert_all_terminals(right_tokens);
|
||||
if let Some(reserved_tokens) = right_reserved_tokens {
|
||||
result[left_token.index].insert_all_terminals(reserved_tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for extra in &syntax_grammar.extra_symbols {
|
||||
if extra.is_terminal() {
|
||||
for entry in &mut result {
|
||||
entry.insert(*extra);
|
||||
}
|
||||
result[extra.index] = all_tokens.clone();
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn populate_error_state(
|
||||
parse_table: &mut ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
|
|
@ -414,9 +461,9 @@ fn report_state_info<'a>(
|
|||
for (i, state) in parse_table.states.iter().enumerate() {
|
||||
all_state_indices.insert(i);
|
||||
let item_set = &parse_state_info[state.id];
|
||||
for (item, _) in &item_set.1.entries {
|
||||
if !item.is_augmented() {
|
||||
symbols_with_state_indices[item.variable_index as usize]
|
||||
for entry in &item_set.1.entries {
|
||||
if !entry.item.is_augmented() {
|
||||
symbols_with_state_indices[entry.item.variable_index as usize]
|
||||
.1
|
||||
.insert(i);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ pub fn split_state_id_groups<S>(
|
|||
state_ids_by_group_id: &mut Vec<Vec<usize>>,
|
||||
group_ids_by_state_id: &mut [usize],
|
||||
start_group_id: usize,
|
||||
mut f: impl FnMut(&S, &S, &[usize]) -> bool,
|
||||
mut should_split: impl FnMut(&S, &S, &[usize]) -> bool,
|
||||
) -> bool {
|
||||
let mut result = false;
|
||||
|
||||
|
|
@ -33,7 +33,7 @@ pub fn split_state_id_groups<S>(
|
|||
}
|
||||
let right_state = &states[right_state_id];
|
||||
|
||||
if f(left_state, right_state, group_ids_by_state_id) {
|
||||
if should_split(left_state, right_state, group_ids_by_state_id) {
|
||||
split_state_ids.push(right_state_id);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ function alias(rule, value) {
|
|||
result.value = value.symbol.name;
|
||||
return result;
|
||||
case Object:
|
||||
case GrammarSymbol:
|
||||
if (typeof value.type === 'string' && value.type === 'SYMBOL') {
|
||||
result.named = true;
|
||||
result.value = value.name;
|
||||
|
|
@@ -153,11 +154,26 @@ function seq(...elements) {
   };
 }
 
-function sym(name) {
+class GrammarSymbol {
+  constructor(name) {
+    this.type = "SYMBOL";
+    this.name = name;
+  }
+}
+
+function reserved(wordset, rule) {
+  if (typeof wordset !== 'string') {
+    throw new Error('Invalid reserved word set name: ' + wordset)
+  }
   return {
-    type: "SYMBOL",
-    name
-  };
+    type: "RESERVED",
+    content: normalize(rule),
+    context_name: wordset,
+  }
 }
+
+function sym(name) {
+  return new GrammarSymbol(name);
+}
 
 function token(value) {
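For reference, the object the new `reserved` helper returns (which is later serialized into grammar.json) has roughly this shape; the rule passed in here is a hypothetical example:

```js
reserved('properties', $.identifier)
// => {
//      type: 'RESERVED',
//      content: { type: 'SYMBOL', name: 'identifier' },
//      context_name: 'properties',
//    }
```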
|
||||
|
|
@ -236,6 +252,7 @@ function grammar(baseGrammar, options) {
|
|||
inline: [],
|
||||
supertypes: [],
|
||||
precedences: [],
|
||||
reserved: {},
|
||||
};
|
||||
} else {
|
||||
baseGrammar = baseGrammar.grammar;
|
||||
|
|
@@ -309,6 +326,28 @@ function grammar(baseGrammar, options) {
     }
   }
 
+  let reserved = baseGrammar.reserved;
+  if (options.reserved) {
+    if (typeof options.reserved !== "object") {
+      throw new Error("Grammar's 'reserved' property must be an object.");
+    }
+
+    for (const reservedWordSetName of Object.keys(options.reserved)) {
+      const reservedWordSetFn = options.reserved[reservedWordSetName]
+      if (typeof reservedWordSetFn !== "function") {
+        throw new Error(`Grammar reserved word sets must all be functions. '${reservedWordSetName}' is not.`);
+      }
+
+      const reservedTokens = reservedWordSetFn.call(ruleBuilder, ruleBuilder, baseGrammar.reserved[reservedWordSetName]);
+
+      if (!Array.isArray(reservedTokens)) {
+        throw new Error(`Grammar's reserved word set functions must all return arrays of rules. '${reservedWordSetName}' does not.`);
+      }
+
+      reserved[reservedWordSetName] = reservedTokens.map(normalize);
+    }
+  }
+
   let extras = baseGrammar.extras.slice();
   if (options.extras) {
     if (typeof options.extras !== "function") {
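Putting the pieces above together, a grammar might declare reserved word sets and switch to a different set for a particular position roughly like this. The grammar name and rules are hypothetical, but the shapes match what the code above expects: each `reserved` entry is a function returning an array of token rules, and `reserved(name, rule)` tags a rule with one of those sets.

```js
module.exports = grammar({
  name: 'example_lang',

  word: $ => $.identifier,

  // Assumed in this sketch: the first set listed acts as the grammar-wide default.
  reserved: {
    global: $ => ['if', 'while', 'return'],
    properties: $ => [],
  },

  rules: {
    source_file: $ => repeat($._statement),
    _statement: $ => choice($.return_statement, $._expression),
    return_statement: $ => seq('return', $._expression),
    _expression: $ => choice($.identifier, $.member_expression),
    // After '.', keywords are acceptable property names, so the empty
    // `properties` set is used instead of the default one.
    member_expression: $ => seq($._expression, '.', reserved('properties', $.identifier)),
    identifier: $ => /[a-z]+/,
  },
});
```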
|
||||
|
|
@ -439,6 +478,7 @@ function grammar(baseGrammar, options) {
|
|||
externals,
|
||||
inline,
|
||||
supertypes,
|
||||
reserved,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@@ -478,6 +518,7 @@ globalThis.optional = optional;
 globalThis.prec = prec;
 globalThis.repeat = repeat;
 globalThis.repeat1 = repeat1;
+globalThis.reserved = reserved;
 globalThis.seq = seq;
 globalThis.sym = sym;
 globalThis.token = token;
|
|
|
|||
|
|
@ -2,7 +2,7 @@ use std::{collections::HashMap, fmt};
|
|||
|
||||
use super::{
|
||||
nfa::Nfa,
|
||||
rules::{Alias, Associativity, Precedence, Rule, Symbol},
|
||||
rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet},
|
||||
};
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
|
|
@@ -39,6 +39,13 @@ pub struct InputGrammar {
    pub variables_to_inline: Vec<String>,
    pub supertype_symbols: Vec<String>,
    pub word_token: Option<String>,
+    pub reserved_words: Vec<ReservedWordContext<Rule>>,
 }
 
+#[derive(Debug, Default, PartialEq, Eq)]
+pub struct ReservedWordContext<T> {
+    pub name: String,
+    pub reserved_words: Vec<T>,
+}
+
 // Extracted lexical grammar
|
|
@@ -66,8 +73,20 @@ pub struct ProductionStep {
    pub associativity: Option<Associativity>,
    pub alias: Option<Alias>,
    pub field_name: Option<String>,
+    pub reserved_word_set_id: ReservedWordSetId,
 }
 
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct ReservedWordSetId(pub usize);
+
+impl fmt::Display for ReservedWordSetId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+pub const NO_RESERVED_WORDS: ReservedWordSetId = ReservedWordSetId(usize::MAX);
+
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct Production {
    pub steps: Vec<ProductionStep>,
|
|
@ -104,51 +123,44 @@ pub struct SyntaxGrammar {
|
|||
pub variables_to_inline: Vec<Symbol>,
|
||||
pub word_token: Option<Symbol>,
|
||||
pub precedence_orderings: Vec<Vec<PrecedenceEntry>>,
|
||||
pub reserved_word_sets: Vec<TokenSet>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl ProductionStep {
|
||||
#[must_use]
|
||||
pub const fn new(symbol: Symbol) -> Self {
|
||||
pub fn new(symbol: Symbol) -> Self {
|
||||
Self {
|
||||
symbol,
|
||||
precedence: Precedence::None,
|
||||
associativity: None,
|
||||
alias: None,
|
||||
field_name: None,
|
||||
reserved_word_set_id: ReservedWordSetId::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_prec(self, precedence: Precedence, associativity: Option<Associativity>) -> Self {
|
||||
Self {
|
||||
symbol: self.symbol,
|
||||
precedence,
|
||||
associativity,
|
||||
alias: self.alias,
|
||||
field_name: self.field_name,
|
||||
}
|
||||
pub fn with_prec(
|
||||
mut self,
|
||||
precedence: Precedence,
|
||||
associativity: Option<Associativity>,
|
||||
) -> Self {
|
||||
self.precedence = precedence;
|
||||
self.associativity = associativity;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_alias(self, value: &str, is_named: bool) -> Self {
|
||||
Self {
|
||||
symbol: self.symbol,
|
||||
precedence: self.precedence,
|
||||
associativity: self.associativity,
|
||||
alias: Some(Alias {
|
||||
value: value.to_string(),
|
||||
is_named,
|
||||
}),
|
||||
field_name: self.field_name,
|
||||
}
|
||||
pub fn with_alias(mut self, value: &str, is_named: bool) -> Self {
|
||||
self.alias = Some(Alias {
|
||||
value: value.to_string(),
|
||||
is_named,
|
||||
});
|
||||
self
|
||||
}
|
||||
pub fn with_field_name(self, name: &str) -> Self {
|
||||
Self {
|
||||
symbol: self.symbol,
|
||||
precedence: self.precedence,
|
||||
associativity: self.associativity,
|
||||
alias: self.alias,
|
||||
field_name: Some(name.to_string()),
|
||||
}
|
||||
|
||||
pub fn with_field_name(mut self, name: &str) -> Self {
|
||||
self.field_name = Some(name.to_string());
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
|
|
@ -8,6 +8,7 @@ use super::{
|
|||
grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
|
||||
rules::{Precedence, Rule},
|
||||
};
|
||||
use crate::grammars::ReservedWordContext;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
|
|
@@ -68,6 +69,10 @@ enum RuleJSON {
    IMMEDIATE_TOKEN {
        content: Box<RuleJSON>,
    },
+    RESERVED {
+        context_name: String,
+        content: Box<RuleJSON>,
+    },
 }
 
 #[derive(Deserialize)]
|
|
@ -93,7 +98,10 @@ pub struct GrammarJSON {
|
|||
inline: Vec<String>,
|
||||
#[serde(default)]
|
||||
supertypes: Vec<String>,
|
||||
#[serde(default)]
|
||||
word: Option<String>,
|
||||
#[serde(default)]
|
||||
reserved: Map<String, Value>,
|
||||
}
|
||||
|
||||
fn rule_is_referenced(rule: &Rule, target: &str) -> bool {
|
||||
|
|
@ -102,7 +110,9 @@ fn rule_is_referenced(rule: &Rule, target: &str) -> bool {
|
|||
Rule::Choice(rules) | Rule::Seq(rules) => {
|
||||
rules.iter().any(|r| rule_is_referenced(r, target))
|
||||
}
|
||||
Rule::Metadata { rule, .. } => rule_is_referenced(rule, target),
|
||||
Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
|
||||
rule_is_referenced(rule, target)
|
||||
}
|
||||
Rule::Repeat(inner) => rule_is_referenced(inner, target),
|
||||
Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
|
||||
}
|
||||
|
|
@@ -226,6 +236,27 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
        });
    }
 
+    let reserved_words = grammar_json
+        .reserved
+        .into_iter()
+        .map(|(name, rule_values)| {
+            let mut reserved_words = Vec::new();
+
+            let Value::Array(rule_values) = rule_values else {
+                bail!("reserved word sets must be arrays");
+            };
+
+            for value in rule_values {
+                let rule_json: RuleJSON = serde_json::from_value(value)?;
+                reserved_words.push(parse_rule(rule_json));
+            }
+            Ok(ReservedWordContext {
+                name,
+                reserved_words,
+            })
+        })
+        .collect::<Result<Vec<_>>>()?;
+
    Ok(InputGrammar {
        name: grammar_json.name,
        word_token: grammar_json.word,
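For illustration, the `reserved` field this code reads out of a generated grammar.json maps each set name to an array of rule objects; for a hypothetical grammar it could look like the following (shown as a JavaScript literal, with names invented for the example):

```js
const grammarJson = {
  name: 'example_lang',
  reserved: {
    global: [
      { type: 'STRING', value: 'if' },
      { type: 'STRING', value: 'while' },
      { type: 'STRING', value: 'return' },
    ],
    properties: [],
  },
};
```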
|
||||
|
|
@ -236,6 +267,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
|
|||
variables,
|
||||
extra_symbols,
|
||||
external_tokens,
|
||||
reserved_words,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -283,6 +315,13 @@ fn parse_rule(json: RuleJSON) -> Rule {
|
|||
RuleJSON::PREC_DYNAMIC { value, content } => {
|
||||
Rule::prec_dynamic(value, parse_rule(*content))
|
||||
}
|
||||
RuleJSON::RESERVED {
|
||||
content,
|
||||
context_name,
|
||||
} => Rule::Reserved {
|
||||
rule: Box::new(parse_rule(*content)),
|
||||
context_name,
|
||||
},
|
||||
RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
|
||||
RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
use std::{collections::HashMap, mem};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
|
||||
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
|
||||
use crate::{
|
||||
grammars::{ExternalToken, Variable, VariableType},
|
||||
grammars::{ExternalToken, ReservedWordContext, Variable, VariableType},
|
||||
rules::{MetadataParams, Rule, Symbol, SymbolType},
|
||||
};
|
||||
|
||||
|
|
@ -148,6 +148,27 @@ pub(super) fn extract_tokens(
|
|||
word_token = Some(token);
|
||||
}
|
||||
|
||||
let mut reserved_word_contexts = Vec::new();
|
||||
for reserved_word_context in grammar.reserved_word_sets {
|
||||
let mut reserved_words = Vec::new();
|
||||
for reserved_rule in reserved_word_context.reserved_words {
|
||||
if let Rule::Symbol(symbol) = reserved_rule {
|
||||
reserved_words.push(symbol_replacer.replace_symbol(symbol));
|
||||
} else if let Some(index) = lexical_variables
|
||||
.iter()
|
||||
.position(|v| v.rule == reserved_rule)
|
||||
{
|
||||
reserved_words.push(Symbol::terminal(index));
|
||||
} else {
|
||||
return Err(anyhow!("Reserved words must be tokens"));
|
||||
}
|
||||
}
|
||||
reserved_word_contexts.push(ReservedWordContext {
|
||||
name: reserved_word_context.name,
|
||||
reserved_words,
|
||||
});
|
||||
}
|
||||
|
||||
Ok((
|
||||
ExtractedSyntaxGrammar {
|
||||
variables,
|
||||
|
|
@ -158,6 +179,7 @@ pub(super) fn extract_tokens(
|
|||
external_tokens,
|
||||
word_token,
|
||||
precedence_orderings: grammar.precedence_orderings,
|
||||
reserved_word_sets: reserved_word_contexts,
|
||||
},
|
||||
ExtractedLexicalGrammar {
|
||||
variables: lexical_variables,
|
||||
|
|
@ -188,9 +210,7 @@ impl TokenExtractor {
|
|||
self.current_variable_name.push_str(&variable.name);
|
||||
self.current_variable_token_count = 0;
|
||||
self.is_first_rule = is_first;
|
||||
let mut rule = Rule::Blank;
|
||||
mem::swap(&mut rule, &mut variable.rule);
|
||||
variable.rule = self.extract_tokens_in_rule(&rule)?;
|
||||
variable.rule = self.extract_tokens_in_rule(&variable.rule)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -237,6 +257,10 @@ impl TokenExtractor {
|
|||
.map(|e| self.extract_tokens_in_rule(e))
|
||||
.collect::<Result<Vec<_>>>()?,
|
||||
)),
|
||||
Rule::Reserved { rule, context_name } => Ok(Rule::Reserved {
|
||||
rule: Box::new(self.extract_tokens_in_rule(rule)?),
|
||||
context_name: context_name.clone(),
|
||||
}),
|
||||
_ => Ok(input.clone()),
|
||||
}
|
||||
}
|
||||
|
|
@ -305,6 +329,10 @@ impl SymbolReplacer {
|
|||
params: params.clone(),
|
||||
rule: Box::new(self.replace_symbols_in_rule(rule)),
|
||||
},
|
||||
Rule::Reserved { rule, context_name } => Rule::Reserved {
|
||||
rule: Box::new(self.replace_symbols_in_rule(rule)),
|
||||
context_name: context_name.clone(),
|
||||
},
|
||||
_ => rule.clone(),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -1,48 +1,77 @@
use std::collections::HashMap;

use anyhow::{anyhow, Result};
use indoc::indoc;

use super::ExtractedSyntaxGrammar;
use crate::{
    grammars::{Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable},
    rules::{Alias, Associativity, Precedence, Rule, Symbol},
    grammars::{
        Production, ProductionStep, ReservedWordSetId, SyntaxGrammar, SyntaxVariable, Variable,
    },
    rules::{Alias, Associativity, Precedence, Rule, Symbol, TokenSet},
};

struct RuleFlattener {
    production: Production,
    reserved_word_set_ids: HashMap<String, ReservedWordSetId>,
    precedence_stack: Vec<Precedence>,
    associativity_stack: Vec<Associativity>,
    reserved_word_stack: Vec<ReservedWordSetId>,
    alias_stack: Vec<Alias>,
    field_name_stack: Vec<String>,
}

impl RuleFlattener {
    const fn new() -> Self {
    const fn new(reserved_word_set_ids: HashMap<String, ReservedWordSetId>) -> Self {
        Self {
            production: Production {
                steps: Vec::new(),
                dynamic_precedence: 0,
            },
            reserved_word_set_ids,
            precedence_stack: Vec::new(),
            associativity_stack: Vec::new(),
            reserved_word_stack: Vec::new(),
            alias_stack: Vec::new(),
            field_name_stack: Vec::new(),
        }
    }

    fn flatten(mut self, rule: Rule) -> Production {
        self.apply(rule, true);
        self.production
    fn flatten_variable(&mut self, variable: Variable) -> Result<SyntaxVariable> {
        let mut productions = Vec::new();
        for rule in extract_choices(variable.rule) {
            let production = self.flatten_rule(rule)?;
            if !productions.contains(&production) {
                productions.push(production);
            }
        }
        Ok(SyntaxVariable {
            name: variable.name,
            kind: variable.kind,
            productions,
        })
    }

    fn apply(&mut self, rule: Rule, at_end: bool) -> bool {
    fn flatten_rule(&mut self, rule: Rule) -> Result<Production> {
        self.production = Production::default();
        self.alias_stack.clear();
        self.reserved_word_stack.clear();
        self.precedence_stack.clear();
        self.associativity_stack.clear();
        self.field_name_stack.clear();
        self.apply(rule, true)?;
        Ok(self.production.clone())
    }

    fn apply(&mut self, rule: Rule, at_end: bool) -> Result<bool> {
        match rule {
            Rule::Seq(members) => {
                let mut result = false;
                let last_index = members.len() - 1;
                for (i, member) in members.into_iter().enumerate() {
                    result |= self.apply(member, i == last_index && at_end);
                    result |= self.apply(member, i == last_index && at_end)?;
                }
                result
                Ok(result)
            }
            Rule::Metadata { rule, params } => {
                let mut has_precedence = false;

@@ -73,7 +102,7 @@ impl RuleFlattener {
                    self.production.dynamic_precedence = params.dynamic_precedence;
                }

                let did_push = self.apply(*rule, at_end);
                let did_push = self.apply(*rule, at_end)?;

                if has_precedence {
                    self.precedence_stack.pop();

@@ -102,7 +131,18 @@ impl RuleFlattener {
                    self.field_name_stack.pop();
                }

                did_push
                Ok(did_push)
            }
            Rule::Reserved { rule, context_name } => {
                self.reserved_word_stack.push(
                    self.reserved_word_set_ids
                        .get(&context_name)
                        .copied()
                        .ok_or_else(|| anyhow!("no such reserved word set: {context_name}"))?,
                );
                let did_push = self.apply(*rule, at_end)?;
                self.reserved_word_stack.pop();
                Ok(did_push)
            }
            Rule::Symbol(symbol) => {
                self.production.steps.push(ProductionStep {

@@ -113,12 +153,17 @@ impl RuleFlattener {
                        .cloned()
                        .unwrap_or(Precedence::None),
                    associativity: self.associativity_stack.last().copied(),
                    reserved_word_set_id: self
                        .reserved_word_stack
                        .last()
                        .copied()
                        .unwrap_or(ReservedWordSetId::default()),
                    alias: self.alias_stack.last().cloned(),
                    field_name: self.field_name_stack.last().cloned(),
                });
                true
                Ok(true)
            }
            _ => false,
            _ => Ok(false),
        }
    }
}

@@ -155,25 +200,17 @@ fn extract_choices(rule: Rule) -> Vec<Rule> {
                params: params.clone(),
            })
            .collect(),
        Rule::Reserved { rule, context_name } => extract_choices(*rule)
            .into_iter()
            .map(|rule| Rule::Reserved {
                rule: Box::new(rule),
                context_name: context_name.clone(),
            })
            .collect(),
        _ => vec![rule],
    }
}

fn flatten_variable(variable: Variable) -> SyntaxVariable {
    let mut productions = Vec::new();
    for rule in extract_choices(variable.rule) {
        let production = RuleFlattener::new().flatten(rule);
        if !productions.contains(&production) {
            productions.push(production);
        }
    }
    SyntaxVariable {
        name: variable.name,
        kind: variable.kind,
        productions,
    }
}

fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
    for variable in variables {
        for production in &variable.productions {

@@ -188,10 +225,18 @@ fn symbol_is_used(variables: &[SyntaxVariable], symbol: Symbol) -> bool {
    }
}

pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
    let mut variables = Vec::new();
    for variable in grammar.variables {
        variables.push(flatten_variable(variable));
    let mut reserved_word_set_ids_by_name = HashMap::new();
    for (ix, set) in grammar.reserved_word_sets.iter().enumerate() {
        reserved_word_set_ids_by_name.insert(set.name.clone(), ReservedWordSetId(ix));
    }

    let mut flattener = RuleFlattener::new(reserved_word_set_ids_by_name);
    let variables = grammar
        .variables
        .into_iter()
        .map(|variable| flattener.flatten_variable(variable))
        .collect::<Result<Vec<_>>>()?;

    for (i, variable) in variables.iter().enumerate() {
        let symbol = Symbol::non_terminal(i);

@@ -218,6 +263,17 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
            }
        }
    }
    let mut reserved_word_sets = grammar
        .reserved_word_sets
        .into_iter()
        .map(|set| set.reserved_words.into_iter().collect())
        .collect::<Vec<_>>();

    // If no default reserved word set is specified, there are no reserved words.
    if reserved_word_sets.is_empty() {
        reserved_word_sets.push(TokenSet::default());
    }

    Ok(SyntaxGrammar {
        extra_symbols: grammar.extra_symbols,
        expected_conflicts: grammar.expected_conflicts,

@@ -226,6 +282,7 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
        external_tokens: grammar.external_tokens,
        supertype_symbols: grammar.supertype_symbols,
        word_token: grammar.word_token,
        reserved_word_sets,
        variables,
    })
}
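The rewritten flattener resolves each `Reserved` context name to a `ReservedWordSetId` and records, for every symbol step, whichever set is innermost at that point, using a push/pop stack. A minimal self-contained sketch of that stack discipline, assuming simplified stand-in types (`SetId`, `Node`) rather than the crate's `Rule` and `ProductionStep`:

// Sketch (not the crate's real types) of the stack discipline used above:
// entering a Reserved node pushes its set id, every symbol emitted while
// inside records the top of the stack, and leaving pops it again.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
struct SetId(usize);

enum Node {
    Symbol(&'static str),
    Seq(Vec<Node>),
    Reserved(SetId, Box<Node>),
}

fn flatten(node: &Node, stack: &mut Vec<SetId>, out: &mut Vec<(&'static str, SetId)>) {
    match node {
        Node::Symbol(name) => {
            // Each flattened step captures the innermost reserved-word set,
            // defaulting to set 0 when no Reserved wrapper is active.
            out.push((*name, stack.last().copied().unwrap_or_default()));
        }
        Node::Seq(children) => {
            for child in children {
                flatten(child, stack, out);
            }
        }
        Node::Reserved(id, inner) => {
            stack.push(*id);
            flatten(inner, stack, out);
            stack.pop();
        }
    }
}

fn main() {
    let rule = Node::Seq(vec![
        Node::Symbol("a"),
        Node::Reserved(SetId(1), Box::new(Node::Symbol("b"))),
        Node::Symbol("c"),
    ]);
    let mut steps = Vec::new();
    flatten(&rule, &mut Vec::new(), &mut steps);
    assert_eq!(steps, vec![("a", SetId(0)), ("b", SetId(1)), ("c", SetId(0))]);
    println!("{steps:?}");
}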
@@ -237,28 +294,31 @@ mod tests {

    #[test]
    fn test_flatten_grammar() {
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::seq(vec![
                Rule::non_terminal(1),
                Rule::prec_left(
                    Precedence::Integer(101),
                    Rule::seq(vec![
                        Rule::non_terminal(2),
                        Rule::choice(vec![
                            Rule::prec_right(
                                Precedence::Integer(102),
                                Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
                            ),
                            Rule::non_terminal(5),
        let mut flattener = RuleFlattener::new(HashMap::default());
        let result = flattener
            .flatten_variable(Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
                rule: Rule::seq(vec![
                    Rule::non_terminal(1),
                    Rule::prec_left(
                        Precedence::Integer(101),
                        Rule::seq(vec![
                            Rule::non_terminal(2),
                            Rule::choice(vec![
                                Rule::prec_right(
                                    Precedence::Integer(102),
                                    Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
                                ),
                                Rule::non_terminal(5),
                            ]),
                            Rule::non_terminal(6),
                        ]),
                    Rule::non_terminal(6),
                ]),
            ),
            Rule::non_terminal(7),
        ]),
    });
                    ),
                    Rule::non_terminal(7),
                ]),
            })
            .unwrap();

        assert_eq!(
            result.productions,

@@ -295,28 +355,31 @@ mod tests {

    #[test]
    fn test_flatten_grammar_with_maximum_dynamic_precedence() {
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::seq(vec![
                Rule::non_terminal(1),
                Rule::prec_dynamic(
                    101,
                    Rule::seq(vec![
                        Rule::non_terminal(2),
                        Rule::choice(vec![
                            Rule::prec_dynamic(
                                102,
                                Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
                            ),
                            Rule::non_terminal(5),
        let mut flattener = RuleFlattener::new(HashMap::default());
        let result = flattener
            .flatten_variable(Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
                rule: Rule::seq(vec![
                    Rule::non_terminal(1),
                    Rule::prec_dynamic(
                        101,
                        Rule::seq(vec![
                            Rule::non_terminal(2),
                            Rule::choice(vec![
                                Rule::prec_dynamic(
                                    102,
                                    Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
                                ),
                                Rule::non_terminal(5),
                            ]),
                            Rule::non_terminal(6),
                        ]),
                    Rule::non_terminal(6),
                ]),
            ),
            Rule::non_terminal(7),
        ]),
    });
                    ),
                    Rule::non_terminal(7),
                ]),
            })
            .unwrap();

        assert_eq!(
            result.productions,

@@ -348,14 +411,17 @@ mod tests {

    #[test]
    fn test_flatten_grammar_with_final_precedence() {
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::prec_left(
                Precedence::Integer(101),
                Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
            ),
        });
        let mut flattener = RuleFlattener::new(HashMap::default());
        let result = flattener
            .flatten_variable(Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
                rule: Rule::prec_left(
                    Precedence::Integer(101),
                    Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
                ),
            })
            .unwrap();

        assert_eq!(
            result.productions,

@@ -370,14 +436,16 @@ mod tests {
            }]
        );

        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::prec_left(
                Precedence::Integer(101),
                Rule::seq(vec![Rule::non_terminal(1)]),
            ),
        });
        let result = flattener
            .flatten_variable(Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
                rule: Rule::prec_left(
                    Precedence::Integer(101),
                    Rule::seq(vec![Rule::non_terminal(1)]),
                ),
            })
            .unwrap();

        assert_eq!(
            result.productions,

@@ -391,18 +459,21 @@ mod tests {

    #[test]
    fn test_flatten_grammar_with_field_names() {
        let result = flatten_variable(Variable {
            name: "test".to_string(),
            kind: VariableType::Named,
            rule: Rule::seq(vec![
                Rule::field("first-thing".to_string(), Rule::terminal(1)),
                Rule::terminal(2),
                Rule::choice(vec![
                    Rule::Blank,
                    Rule::field("second-thing".to_string(), Rule::terminal(3)),
        let mut flattener = RuleFlattener::new(HashMap::default());
        let result = flattener
            .flatten_variable(Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
                rule: Rule::seq(vec![
                    Rule::field("first-thing".to_string(), Rule::terminal(1)),
                    Rule::terminal(2),
                    Rule::choice(vec![
                        Rule::Blank,
                        Rule::field("second-thing".to_string(), Rule::terminal(3)),
                    ]),
                ]),
            ]),
        });
            })
            .unwrap();

        assert_eq!(
            result.productions,

@@ -436,6 +507,7 @@ mod tests {
            external_tokens: Vec::new(),
            supertype_symbols: Vec::new(),
            word_token: None,
            reserved_word_sets: Vec::new(),
            variables: vec![Variable {
                name: "test".to_string(),
                kind: VariableType::Named,
@@ -2,7 +2,7 @@ use anyhow::{anyhow, Result};

use super::InternedGrammar;
use crate::{
    grammars::{InputGrammar, Variable, VariableType},
    grammars::{InputGrammar, ReservedWordContext, Variable, VariableType},
    rules::{Rule, Symbol},
};

@@ -45,6 +45,18 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
        })?);
    }

    let mut reserved_words = Vec::with_capacity(grammar.reserved_words.len());
    for reserved_word_set in &grammar.reserved_words {
        let mut interned_set = Vec::new();
        for rule in &reserved_word_set.reserved_words {
            interned_set.push(interner.intern_rule(rule, None)?);
        }
        reserved_words.push(ReservedWordContext {
            name: reserved_word_set.name.clone(),
            reserved_words: interned_set,
        });
    }

    let mut expected_conflicts = Vec::new();
    for conflict in &grammar.expected_conflicts {
        let mut interned_conflict = Vec::with_capacity(conflict.len());

@@ -87,6 +99,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
        supertype_symbols,
        word_token,
        precedence_orderings: grammar.precedence_orderings.clone(),
        reserved_word_sets: reserved_words,
    })
}

@@ -118,6 +131,10 @@ impl Interner<'_> {
                rule: Box::new(self.intern_rule(rule, name)?),
                params: params.clone(),
            }),
            Rule::Reserved { rule, context_name } => Ok(Rule::Reserved {
                rule: Box::new(self.intern_rule(rule, name)?),
                context_name: context_name.clone(),
            }),
            Rule::NamedSymbol(name) => self.intern_name(name).map_or_else(
                || Err(anyhow!("Undefined symbol `{name}`")),
                |symbol| Ok(Rule::Symbol(symbol)),
@@ -27,6 +27,7 @@ use super::{
    },
    rules::{AliasMap, Precedence, Rule, Symbol},
};
use crate::grammars::ReservedWordContext;

pub struct IntermediateGrammar<T, U> {
    variables: Vec<Variable>,

@@ -37,6 +38,7 @@ pub struct IntermediateGrammar<T, U> {
    variables_to_inline: Vec<Symbol>,
    supertype_symbols: Vec<Symbol>,
    word_token: Option<Symbol>,
    reserved_word_sets: Vec<ReservedWordContext<T>>,
}

pub type InternedGrammar = IntermediateGrammar<Rule, Variable>;

@@ -60,6 +62,7 @@ impl<T, U> Default for IntermediateGrammar<T, U> {
            variables_to_inline: Vec::default(),
            supertype_symbols: Vec::default(),
            word_token: Option::default(),
            reserved_word_sets: Vec::default(),
        }
    }
}
@@ -9,7 +9,7 @@ use super::{
    build_tables::Tables,
    grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType},
    nfa::CharacterSet,
    rules::{Alias, AliasMap, Symbol, SymbolType},
    rules::{Alias, AliasMap, Symbol, SymbolType, TokenSet},
    tables::{
        AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
        ParseTableEntry,

@@ -19,7 +19,7 @@ use super::{
const SMALL_STATE_THRESHOLD: usize = 64;
const ABI_VERSION_MIN: usize = 14;
const ABI_VERSION_MAX: usize = tree_sitter::LANGUAGE_VERSION;
const ABI_VERSION_WITH_METADATA: usize = 15;
const ABI_VERSION_WITH_RESERVED_WORDS: usize = 15;
const BUILD_VERSION: &str = env!("CARGO_PKG_VERSION");
const BUILD_SHA: Option<&'static str> = option_env!("BUILD_SHA");

@@ -58,6 +58,7 @@ macro_rules! dedent {
    };
}

#[derive(Default)]
struct Generator {
    buffer: String,
    indent_level: usize,

@@ -68,7 +69,6 @@ struct Generator {
    large_character_sets: Vec<(Option<Symbol>, CharacterSet)>,
    large_character_set_info: Vec<LargeCharacterSetInfo>,
    large_state_count: usize,
    keyword_capture_token: Option<Symbol>,
    syntax_grammar: SyntaxGrammar,
    lexical_grammar: LexicalGrammar,
    default_aliases: AliasMap,

@@ -77,6 +77,8 @@ struct Generator {
    alias_ids: HashMap<Alias, String>,
    unique_aliases: Vec<Alias>,
    symbol_map: HashMap<Symbol, Symbol>,
    reserved_word_sets: Vec<TokenSet>,
    reserved_word_set_ids_by_parse_state: Vec<usize>,
    field_names: Vec<String>,

    #[allow(unused)]
@@ -119,7 +121,7 @@ impl Generator {
        swap(&mut main_lex_table, &mut self.main_lex_table);
        self.add_lex_function("ts_lex", main_lex_table);

        if self.keyword_capture_token.is_some() {
        if self.syntax_grammar.word_token.is_some() {
            let mut keyword_lex_table = LexTable::default();
            swap(&mut keyword_lex_table, &mut self.keyword_lex_table);
            self.add_lex_function("ts_lex_keywords", keyword_lex_table);

@@ -135,7 +137,13 @@ impl Generator {
        }
        self.buffer.push_str(&lex_functions);

        self.add_lex_modes_list();
        self.add_lex_modes();

        if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS && self.reserved_word_sets.len() > 1
        {
            self.add_reserved_word_sets();
        }

        self.add_parse_table();

        if !self.syntax_grammar.external_tokens.is_empty() {

@@ -266,6 +274,22 @@ impl Generator {
            });
        }

        // Assign an id to each unique reserved word set
        self.reserved_word_sets.push(TokenSet::new());
        for state in &self.parse_table.states {
            let id = if let Some(ix) = self
                .reserved_word_sets
                .iter()
                .position(|set| *set == state.reserved_words)
            {
                ix
            } else {
                self.reserved_word_sets.push(state.reserved_words.clone());
                self.reserved_word_sets.len() - 1
            };
            self.reserved_word_set_ids_by_parse_state.push(id);
        }

        // Determine which states should use the "small state" representation, and which should
        // use the normal array representation.
        let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2);

@@ -365,6 +389,16 @@ impl Generator {
            "#define MAX_ALIAS_SEQUENCE_LENGTH {}",
            self.parse_table.max_aliased_production_length
        );
        add_line!(
            self,
            "#define MAX_RESERVED_WORD_SET_SIZE {}",
            self.reserved_word_sets
                .iter()
                .map(TokenSet::len)
                .max()
                .unwrap()
        );

        add_line!(
            self,
            "#define PRODUCTION_ID_COUNT {}",
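The id-assignment loop above deduplicates the per-state reserved-word sets so that identical sets share one entry, with index 0 reserved for the empty set. A standalone sketch of the same assignment, assuming plain `Vec<u32>` sets in place of the crate's `TokenSet`:

// Sketch of the id-assignment step, with Vec<u32> standing in for TokenSet:
// identical reserved-word sets across parse states share one id, and id 0 is
// reserved for the empty set.
fn assign_set_ids(sets_per_state: &[Vec<u32>]) -> (Vec<Vec<u32>>, Vec<usize>) {
    let mut unique_sets: Vec<Vec<u32>> = vec![Vec::new()]; // id 0 == empty set
    let mut id_by_state = Vec::with_capacity(sets_per_state.len());
    for set in sets_per_state {
        let id = if let Some(ix) = unique_sets.iter().position(|s| s == set) {
            ix
        } else {
            unique_sets.push(set.clone());
            unique_sets.len() - 1
        };
        id_by_state.push(id);
    }
    (unique_sets, id_by_state)
}

fn main() {
    let per_state = vec![vec![], vec![3, 7], vec![3, 7], vec![9]];
    let (unique, ids) = assign_set_ids(&per_state);
    assert_eq!(ids, vec![0, 1, 1, 2]);
    assert_eq!(unique.len(), 3);
    println!("{unique:?} {ids:?}");
}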
@@ -1016,25 +1050,66 @@ impl Generator {
        }
    }

    fn add_lex_modes_list(&mut self) {
    fn add_lex_modes(&mut self) {
        add_line!(
            self,
            "static const TSLexMode ts_lex_modes[STATE_COUNT] = {{"
            "static const {} ts_lex_modes[STATE_COUNT] = {{",
            if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
                "TSLexerMode"
            } else {
                "TSLexMode"
            }
        );
        indent!(self);
        for (i, state) in self.parse_table.states.iter().enumerate() {
            add_whitespace!(self);
            add!(self, "[{}] = {{", i);
            if state.is_end_of_non_terminal_extra() {
                add_line!(self, "[{i}] = {{(TSStateId)(-1)}},");
            } else if state.external_lex_state_id > 0 {
                add_line!(
                    self,
                    "[{i}] = {{.lex_state = {}, .external_lex_state = {}}},",
                    state.lex_state_id,
                    state.external_lex_state_id
                );
                add!(self, "(TSStateId)(-1),");
            } else {
                add_line!(self, "[{i}] = {{.lex_state = {}}},", state.lex_state_id);
                add!(self, ".lex_state = {}", state.lex_state_id);

                if state.external_lex_state_id > 0 {
                    add!(
                        self,
                        ", .external_lex_state = {}",
                        state.external_lex_state_id
                    );
                }

                if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
                    let reserved_word_set_id = self.reserved_word_set_ids_by_parse_state[i];
                    if reserved_word_set_id != 0 {
                        add!(self, ", .reserved_word_set_id = {reserved_word_set_id}");
                    }
                }
            }

            add!(self, "}},\n");
        }
        dedent!(self);
        add_line!(self, "}};");
        add_line!(self, "");
    }

    fn add_reserved_word_sets(&mut self) {
        add_line!(
            self,
            "static const TSSymbol ts_reserved_words[{}][MAX_RESERVED_WORD_SET_SIZE] = {{",
            self.reserved_word_sets.len(),
        );
        indent!(self);
        for (id, set) in self.reserved_word_sets.iter().enumerate() {
            if id == 0 {
                continue;
            }
            add_line!(self, "[{}] = {{", id);
            indent!(self);
            for token in set.iter() {
                add_line!(self, "{},", self.symbol_ids[&token]);
            }
            dedent!(self);
            add_line!(self, "}},");
        }
        dedent!(self);
        add_line!(self, "}};");
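In the rewritten `add_lex_modes`, the `.external_lex_state` and `.reserved_word_set_id` designators are only emitted when they are non-zero, so most generated entries stay short. A hedged sketch of the strings this produces (the `lex_mode_entry` helper is hypothetical, not part of the generator):

// Hypothetical helper mirroring the entry-building logic above: optional
// designators are appended only when their value is non-zero.
fn lex_mode_entry(i: usize, lex_state: usize, external: usize, reserved_set: usize) -> String {
    let mut entry = format!("[{i}] = {{.lex_state = {lex_state}");
    if external > 0 {
        entry.push_str(&format!(", .external_lex_state = {external}"));
    }
    if reserved_set > 0 {
        entry.push_str(&format!(", .reserved_word_set_id = {reserved_set}"));
    }
    entry.push_str("},");
    entry
}

fn main() {
    assert_eq!(lex_mode_entry(3, 12, 0, 0), "[3] = {.lex_state = 12},");
    assert_eq!(
        lex_mode_entry(4, 7, 2, 1),
        "[4] = {.lex_state = 7, .external_lex_state = 2, .reserved_word_set_id = 1},"
    );
    println!("ok");
}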
@@ -1110,6 +1185,7 @@ impl Generator {
        let mut parse_table_entries = HashMap::new();
        let mut next_parse_action_list_index = 0;

        // Parse action lists zero is for the default value, when a symbol is not valid.
        self.get_parse_action_list_id(
            &ParseTableEntry {
                actions: Vec::new(),

@@ -1135,7 +1211,7 @@ impl Generator {
            .enumerate()
            .take(self.large_state_count)
        {
            add_line!(self, "[{i}] = {{");
            add_line!(self, "[STATE({i})] = {{");
            indent!(self);

            // Ensure the entries are in a deterministic order, since they are

@@ -1167,9 +1243,11 @@ impl Generator {
                );
                add_line!(self, "[{}] = ACTIONS({entry_id}),", self.symbol_ids[symbol]);
            }

            dedent!(self);
            add_line!(self, "}},");
        }

        dedent!(self);
        add_line!(self, "}};");
        add_line!(self, "");

@@ -1178,11 +1256,11 @@ impl Generator {
        add_line!(self, "static const uint16_t ts_small_parse_table[] = {{");
        indent!(self);

        let mut index = 0;
        let mut next_table_index = 0;
        let mut small_state_indices = Vec::new();
        let mut symbols_by_value = HashMap::<(usize, SymbolType), Vec<Symbol>>::new();
        for state in self.parse_table.states.iter().skip(self.large_state_count) {
            small_state_indices.push(index);
            small_state_indices.push(next_table_index);
            symbols_by_value.clear();

            terminal_entries.clear();

@@ -1221,10 +1299,16 @@ impl Generator {
                (symbols.len(), *kind, *value, symbols[0])
            });

            add_line!(self, "[{index}] = {},", values_with_symbols.len());
            add_line!(
                self,
                "[{next_table_index}] = {},",
                values_with_symbols.len()
            );
            indent!(self);
            next_table_index += 1;

            for ((value, kind), symbols) in &mut values_with_symbols {
                next_table_index += 2 + symbols.len();
                if *kind == SymbolType::NonTerminal {
                    add_line!(self, "STATE({value}), {},", symbols.len());
                } else {

@@ -1240,11 +1324,6 @@ impl Generator {
            }

            dedent!(self);

            index += 1 + values_with_symbols
                .iter()
                .map(|(_, symbols)| 2 + symbols.len())
                .sum::<usize>();
        }

        dedent!(self);
@@ -1412,9 +1491,9 @@ impl Generator {
        }

        // Lexing
        add_line!(self, ".lex_modes = ts_lex_modes,");
        add_line!(self, ".lex_modes = (const void*)ts_lex_modes,");
        add_line!(self, ".lex_fn = ts_lex,");
        if let Some(keyword_capture_token) = self.keyword_capture_token {
        if let Some(keyword_capture_token) = self.syntax_grammar.word_token {
            add_line!(self, ".keyword_lex_fn = ts_lex_keywords,");
            add_line!(
                self,

@@ -1439,8 +1518,22 @@ impl Generator {

        add_line!(self, ".primary_state_ids = ts_primary_state_ids,");

        if self.abi_version >= ABI_VERSION_WITH_METADATA {
        if self.abi_version >= ABI_VERSION_WITH_RESERVED_WORDS {
            add_line!(self, ".name = \"{}\",", self.language_name);

            if self.reserved_word_sets.len() > 1 {
                add_line!(self, ".reserved_words = &ts_reserved_words[0][0],");
            }

            add_line!(
                self,
                ".max_reserved_word_set_size = {},",
                self.reserved_word_sets
                    .iter()
                    .map(TokenSet::len)
                    .max()
                    .unwrap()
            );
        }

        dedent!(self);

@@ -1716,26 +1809,17 @@ pub fn render_c_code(
    );

    Generator {
        buffer: String::new(),
        indent_level: 0,
        language_name: name.to_string(),
        large_state_count: 0,
        parse_table: tables.parse_table,
        main_lex_table: tables.main_lex_table,
        keyword_lex_table: tables.keyword_lex_table,
        keyword_capture_token: tables.word_token,
        large_character_sets: tables.large_character_sets,
        large_character_set_info: Vec::new(),
        syntax_grammar,
        lexical_grammar,
        default_aliases,
        symbol_ids: HashMap::new(),
        symbol_order: HashMap::new(),
        alias_ids: HashMap::new(),
        symbol_map: HashMap::new(),
        unique_aliases: Vec::new(),
        field_names: Vec::new(),
        abi_version,
        ..Default::default()
    }
    .generate()
}
@@ -68,13 +68,17 @@ pub enum Rule {
    },
    Repeat(Box<Rule>),
    Seq(Vec<Rule>),
    Reserved {
        rule: Box<Rule>,
        context_name: String,
    },
}

// Because tokens are represented as small (~400 max) unsigned integers,
// sets of tokens can be efficiently represented as bit vectors with each
// index corresponding to a token, and each value representing whether or not
// the token is present in the set.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Default, Clone, PartialEq, Eq, Hash)]
pub struct TokenSet {
    terminal_bits: SmallBitVec,
    external_bits: SmallBitVec,

@@ -82,6 +86,32 @@ pub struct TokenSet {
    end_of_nonterminal_extra: bool,
}

impl fmt::Debug for TokenSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_list().entries(self.iter()).finish()
    }
}

impl PartialOrd for TokenSet {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for TokenSet {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.terminal_bits
            .iter()
            .cmp(other.terminal_bits.iter())
            .then_with(|| self.external_bits.iter().cmp(other.external_bits.iter()))
            .then_with(|| self.eof.cmp(&other.eof))
            .then_with(|| {
                self.end_of_nonterminal_extra
                    .cmp(&other.end_of_nonterminal_extra)
            })
    }
}

impl Rule {
    pub fn field(name: String, content: Self) -> Self {
        add_metadata(content, move |params| {

@@ -154,7 +184,9 @@ impl Rule {
        match self {
            Self::Blank | Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
            Self::String(string) => string.is_empty(),
            Self::Metadata { rule, .. } | Self::Repeat(rule) => rule.is_empty(),
            Self::Metadata { rule, .. } | Self::Repeat(rule) | Self::Reserved { rule, .. } => {
                rule.is_empty()
            }
            Self::Choice(rules) => rules.iter().any(Self::is_empty),
            Self::Seq(rules) => rules.iter().all(Self::is_empty),
        }

@@ -394,6 +426,9 @@ impl TokenSet {
        };
        if other.index < vec.len() && vec[other.index] {
            vec.set(other.index, false);
            while vec.last() == Some(false) {
                vec.pop();
            }
            return true;
        }
        false

@@ -406,6 +441,13 @@ impl TokenSet {
            && !self.external_bits.iter().any(|a| a)
    }

    pub fn len(&self) -> usize {
        self.eof as usize
            + self.end_of_nonterminal_extra as usize
            + self.terminal_bits.iter().filter(|b| *b).count()
            + self.external_bits.iter().filter(|b| *b).count()
    }

    pub fn insert_all_terminals(&mut self, other: &Self) -> bool {
        let mut result = false;
        if other.terminal_bits.len() > self.terminal_bits.len() {
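The new `TokenSet::len` counts the `eof` and `end_of_nonterminal_extra` flags as zero or one each, plus one per set bit in either bit vector. A small self-contained sketch of that counting rule, assuming `Vec<bool>` in place of `SmallBitVec`:

// Illustrative stand-in for TokenSet, showing the counting rule only:
// each boolean flag contributes 0 or 1, plus one per set bit in either vector.
struct MiniTokenSet {
    terminal_bits: Vec<bool>,
    external_bits: Vec<bool>,
    eof: bool,
    end_of_nonterminal_extra: bool,
}

impl MiniTokenSet {
    fn len(&self) -> usize {
        self.eof as usize
            + self.end_of_nonterminal_extra as usize
            + self.terminal_bits.iter().filter(|b| **b).count()
            + self.external_bits.iter().filter(|b| **b).count()
    }
}

fn main() {
    let set = MiniTokenSet {
        terminal_bits: vec![true, false, true],
        external_bits: vec![false, true],
        eof: true,
        end_of_nonterminal_extra: false,
    };
    assert_eq!(set.len(), 4); // two terminals + one external + eof
    println!("{}", set.len());
}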
@@ -47,6 +47,7 @@ pub struct ParseState {
    pub id: ParseStateId,
    pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>,
    pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>,
    pub reserved_words: TokenSet,
    pub lex_state_id: usize,
    pub external_lex_state_id: usize,
    pub core_id: usize,

@@ -64,7 +65,7 @@ pub struct ProductionInfo {
    pub field_map: BTreeMap<String, Vec<FieldLocation>>,
}

#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, Default, PartialEq, Eq)]
pub struct ParseTable {
    pub states: Vec<ParseState>,
    pub symbols: Vec<Symbol>,