Implement more of parse table generation

Max Brunsfeld 2018-12-20 13:36:39 -08:00
parent 5eb8806959
commit a3dcfa0a52
14 changed files with 1515 additions and 107 deletions

View file

@ -1,10 +1,10 @@
use super::inline_variables::InlinedProductionMap;
use crate::grammars::{Production, ProductionStep, SyntaxGrammar};
use crate::rules::{Symbol, SymbolType};
use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
use crate::rules::{Associativity, Symbol, SymbolType};
use smallbitvec::SmallBitVec;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::{HashMap, BTreeMap};
use std::fmt;
use std::hash::{Hash, Hasher};
lazy_static! {
static ref START_PRODUCTION: Production = Production {
@ -28,7 +28,7 @@ pub(crate) struct LookaheadSet {
eof: bool,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum ParseItem {
Start {
step_index: u32,
@ -47,10 +47,29 @@ pub(crate) enum ParseItem {
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet {
pub entries: HashMap<ParseItem, LookaheadSet>,
pub entries: BTreeMap<ParseItem, LookaheadSet>,
}
pub(crate) struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap);
pub(crate) struct ParseItemDisplay<'a>(
&'a ParseItem,
&'a SyntaxGrammar,
&'a LexicalGrammar,
&'a InlinedProductionMap,
);
pub(crate) struct LookaheadSetDisplay<'a>(&'a LookaheadSet, &'a SyntaxGrammar, &'a LexicalGrammar);
pub(crate) struct ParseItemSetDisplay<'a>(
&'a ParseItemSet,
&'a SyntaxGrammar,
&'a LexicalGrammar,
&'a InlinedProductionMap,
);
struct ParseItemSetMapEntry(ParseItemSet, u64);
pub(crate) struct ParseItemSetMap<T> {
map: HashMap<ParseItemSetMapEntry, T>
}
impl LookaheadSet {
pub fn new() -> Self {
@ -61,12 +80,61 @@ impl LookaheadSet {
}
}
pub fn insert(&mut self, other: Symbol) {
match other.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
SymbolType::Terminal => self.terminal_bits.set(other.index, true),
SymbolType::External => self.external_bits.set(other.index, true),
pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
self.terminal_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::terminal(i))
} else {
None
}
})
.chain(
self.external_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::external(i))
} else {
None
}
}),
)
.chain(if self.eof { Some(Symbol::end()) } else { None })
}
pub fn with<'a>(symbols: impl IntoIterator<Item = &'a Symbol>) -> Self {
let mut result = Self::new();
for symbol in symbols {
result.insert(*symbol);
}
result
}
pub fn contains(&self, symbol: &Symbol) -> bool {
match symbol.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
SymbolType::End => self.eof,
}
}
pub fn insert(&mut self, other: Symbol) {
let vec = match other.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"),
SymbolType::Terminal => &mut self.terminal_bits,
SymbolType::External => &mut self.external_bits,
SymbolType::End => {
self.eof = true;
return;
}
};
vec.resize(other.index + 1, false);
vec.set(other.index, true);
}
pub fn insert_all(&mut self, other: &LookaheadSet) -> bool {
@ -95,6 +163,14 @@ impl LookaheadSet {
}
result
}
pub fn display_with<'a>(
&'a self,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
) -> LookaheadSetDisplay<'a> {
LookaheadSetDisplay(self, syntax_grammar, lexical_grammar)
}
}
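
The `insert` method above grows the underlying bit vector on demand, while `contains` treats out-of-range indices as absent. A minimal self-contained sketch of that behavior, with a plain `Vec<bool>` standing in for `SmallBitVec` and only the terminal dimension modeled:

```rust
struct TerminalSet {
    bits: Vec<bool>,
}

impl TerminalSet {
    fn new() -> Self {
        Self { bits: Vec::new() }
    }

    // Grow the vector so the index is addressable, then set the bit,
    // mirroring the resize-then-set in `LookaheadSet::insert`.
    fn insert(&mut self, index: usize) {
        if self.bits.len() <= index {
            self.bits.resize(index + 1, false);
        }
        self.bits[index] = true;
    }

    // Out-of-range indices read as absent, mirroring `unwrap_or(false)`.
    fn contains(&self, index: usize) -> bool {
        self.bits.get(index).copied().unwrap_or(false)
    }

    fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        self.bits
            .iter()
            .enumerate()
            .filter_map(|(i, &bit)| if bit { Some(i) } else { None })
    }
}

fn main() {
    let mut set = TerminalSet::new();
    set.insert(5);
    assert!(set.contains(5));
    assert!(!set.contains(2));
    assert_eq!(set.iter().collect::<Vec<_>>(), vec![5]);
}
```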
impl ParseItem {
@ -126,18 +202,53 @@ impl ParseItem {
&grammar.variables[*variable_index as usize].productions[*production_index as usize]
}
ParseItem::Inlined {
production_index,
..
production_index, ..
} => &inlined_productions.inlined_productions[*production_index as usize],
}
}
pub fn symbol(
&self,
grammar: &SyntaxGrammar,
inlined_productions: &InlinedProductionMap,
) -> Option<Symbol> {
self.step(grammar, inlined_productions).map(|s| s.symbol)
}
pub fn step<'a>(
&self,
grammar: &'a SyntaxGrammar,
inlined_productions: &'a InlinedProductionMap,
) -> Option<&'a ProductionStep> {
self.production(grammar, inlined_productions).steps.get(self.step_index())
self.production(grammar, inlined_productions)
.steps
.get(self.step_index())
}
pub fn precedence<'a>(
&self,
grammar: &'a SyntaxGrammar,
inlines: &'a InlinedProductionMap,
) -> i32 {
self.production(grammar, inlines)
.steps
.get(self.step_index() - 1)
.map(|s| s.precedence)
.unwrap_or(0)
}
pub fn associativity<'a>(
&self,
grammar: &'a SyntaxGrammar,
inlines: &'a InlinedProductionMap,
) -> Option<Associativity> {
let production = self.production(grammar, inlines);
let step_index = self.step_index();
if step_index == production.steps.len() {
production.steps.last().and_then(|s| s.associativity)
} else {
None
}
}
pub fn variable_index(&self) -> u32 {
@ -156,6 +267,14 @@ impl ParseItem {
}
}
pub fn is_final(&self) -> bool {
if let ParseItem::Start { step_index: 1 } = self {
true
} else {
false
}
}
fn step_index_mut(&mut self) -> &mut u32 {
match self {
ParseItem::Start { step_index }
@ -164,8 +283,13 @@ impl ParseItem {
}
}
pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> {
ParseItemDisplay(self, grammar, inlines)
pub fn display_with<'a>(
&'a self,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
) -> ParseItemDisplay<'a> {
ParseItemDisplay(self, syntax_grammar, lexical_grammar, inlines)
}
pub fn successor(&self) -> ParseItem {
@ -176,33 +300,107 @@ impl ParseItem {
}
impl ParseItemSet {
pub fn new() -> Self {
Self { entries: HashMap::new() }
pub fn with<'a>(elements: impl IntoIterator<Item = &'a (ParseItem, LookaheadSet)>) -> Self {
let mut result = Self::default();
for (item, lookaheads) in elements {
result.entries.insert(*item, lookaheads.clone());
}
result
}
pub fn display_with<'a>(
&'a self,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
) -> ParseItemSetDisplay<'a> {
ParseItemSetDisplay(self, syntax_grammar, lexical_grammar, inlines)
}
}
impl Default for ParseItemSet {
fn default() -> Self {
Self {
entries: BTreeMap::new(),
}
}
}
impl<'a> fmt::Display for ParseItemDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if let ParseItem::Start { .. } = &self.0 {
write!(f, "START →")?;
} else {
write!(
f,
"{} →",
&self.1.variables[self.0.variable_index() as usize].name
)?;
}
let step_index = self.0.step_index();
let production = self.0.production(self.1, self.2);
let production = self.0.production(self.1, self.3);
for (i, step) in production.steps.iter().enumerate() {
if i > 0 {
write!(f, " ")?;
}
if i == step_index {
write!(f, "")?;
write!(f, "")?;
}
let name = if step.symbol.is_terminal() {
"terminal"
write!(f, " ")?;
if step.symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(step.symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", step.symbol.index)?;
}
} else if step.symbol.is_external() {
"external"
write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
} else {
"non-terminal"
};
write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
}
}
write!(f, "{}-{}", name, step.symbol.index)?;
if production.steps.len() == step_index {
write!(f, "")?;
}
Ok(())
}
}
impl<'a> fmt::Display for LookaheadSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "[")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
if symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", symbol.index)?;
}
} else if symbol.is_external() {
write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
} else {
write!(f, "{}", &self.1.variables[symbol.index].name)?;
}
}
write!(f, "]")?;
Ok(())
}
}
impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for (item, lookaheads) in self.0.entries.iter() {
writeln!(
f,
"{}\t{}",
item.display_with(self.1, self.2, self.3),
lookaheads.display_with(self.1, self.2)
)?;
}
Ok(())
}

View file

@ -20,7 +20,7 @@ pub(crate) struct ParseItemSetBuilder {
first_sets: HashMap<Symbol, LookaheadSet>,
last_sets: HashMap<Symbol, LookaheadSet>,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition>>,
inlined_production_map: InlinedProductionMap,
pub inlines: InlinedProductionMap,
}
fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
@ -35,7 +35,7 @@ impl ParseItemSetBuilder {
first_sets: HashMap::new(),
last_sets: HashMap::new(),
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
inlined_production_map: InlinedProductionMap::new(syntax_grammar),
inlines: InlinedProductionMap::new(syntax_grammar),
};
// For each grammar symbol, populate the FIRST and LAST sets: the set of
@ -192,6 +192,10 @@ impl ParseItemSetBuilder {
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index);
if syntax_grammar.variables_to_inline.contains(&non_terminal) {
continue;
}
for production_index in 0..variable.productions.len() {
let item = ParseItem::Normal {
variable_index: variable_index as u32,
@ -199,7 +203,7 @@ impl ParseItemSetBuilder {
step_index: 0,
};
if let Some(inlined_items) = result.inlined_production_map.inlined_items(item) {
if let Some(inlined_items) = result.inlines.inlined_items(item) {
for inlined_item in inlined_items {
find_or_push(
additions_for_non_terminal,
@ -227,32 +231,36 @@ impl ParseItemSetBuilder {
pub(crate) fn transitive_closure(
&mut self,
item_set: ParseItemSet,
item_set: &ParseItemSet,
grammar: &SyntaxGrammar,
) -> ParseItemSet {
let mut result = ParseItemSet::new();
for (item, lookaheads) in item_set.entries {
if let Some(items) = self.inlined_production_map.inlined_items(item) {
let mut result = ParseItemSet::default();
for (item, lookaheads) in &item_set.entries {
if let Some(items) = self.inlines.inlined_items(*item) {
for item in items {
self.add_item(&mut result, item, lookaheads.clone(), grammar);
self.add_item(&mut result, item, lookaheads, grammar);
}
} else {
self.add_item(&mut result, item, lookaheads, grammar);
self.add_item(&mut result, *item, lookaheads, grammar);
}
}
result
}
pub fn first_set(&self, symbol: &Symbol) -> &LookaheadSet {
&self.first_sets[symbol]
}
fn add_item(
&self,
set: &mut ParseItemSet,
item: ParseItem,
lookaheads: LookaheadSet,
lookaheads: &LookaheadSet,
grammar: &SyntaxGrammar,
) {
if let Some(step) = item.step(grammar, &self.inlined_production_map) {
if let Some(step) = item.step(grammar, &self.inlines) {
if step.symbol.is_non_terminal() {
let next_step = item.successor().step(grammar, &self.inlined_production_map);
let next_step = item.successor().step(grammar, &self.inlines);
// Determine which tokens can follow this non-terminal.
let following_tokens = if let Some(next_step) = next_step {
@ -274,6 +282,6 @@ impl ParseItemSetBuilder {
}
}
}
set.entries.insert(item, lookaheads);
set.entries.insert(item, lookaheads.clone());
}
}
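
For reference, a stripped-down, self-contained sketch of what the transitive closure computes, using toy types instead of the crate's `ParseItem`/`ParseItemSet` and ignoring lookaheads, FIRST sets, and inlining: whenever the symbol after an item's dot is a non-terminal, items for all of that non-terminal's productions are pulled into the set.

```rust
use std::collections::{BTreeSet, HashMap};

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Item {
    lhs: &'static str,
    production: usize,
    dot: usize,
}

fn closure(
    grammar: &HashMap<&'static str, Vec<Vec<&'static str>>>,
    kernel: &BTreeSet<Item>,
) -> BTreeSet<Item> {
    let mut result = kernel.clone();
    let mut queue: Vec<Item> = kernel.iter().copied().collect();
    while let Some(item) = queue.pop() {
        let steps = &grammar[item.lhs][item.production];
        // If the symbol after the dot is a non-terminal, add an item at
        // the start of each of its productions.
        if let Some(&next) = steps.get(item.dot) {
            if let Some(productions) = grammar.get(next) {
                for i in 0..productions.len() {
                    let new_item = Item { lhs: next, production: i, dot: 0 };
                    if result.insert(new_item) {
                        queue.push(new_item);
                    }
                }
            }
        }
    }
    result
}

fn main() {
    // S → A "x"; A → "y". Starting from S's item, closure pulls in A's item.
    let mut grammar = HashMap::new();
    grammar.insert("S", vec![vec!["A", "x"]]);
    grammar.insert("A", vec![vec!["y"]]);
    let kernel: BTreeSet<_> = [Item { lhs: "S", production: 0, dot: 0 }].into();
    assert_eq!(closure(&grammar, &kernel).len(), 2);
}
```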

View file

@ -1,37 +1,611 @@
mod item;
mod inline_variables;
mod item;
mod item_set_builder;
use std::collections::{HashMap, VecDeque};
use crate::grammars::{SyntaxGrammar, LexicalGrammar};
use crate::tables::{ParseTable, LexTable, ParseStateId};
use crate::rules::{AliasMap, Symbol};
use crate::error::Result;
use self::item::ParseItemSet;
use self::item::{LookaheadSet, ParseItem, ParseItemSet};
use self::item_set_builder::ParseItemSetBuilder;
use crate::error::{Error, Result};
use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::{AliasMap, Associativity, Symbol, SymbolType};
use crate::tables::ParseTableEntry;
use crate::tables::{AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable};
use core::ops::Range;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
#[derive(Clone)]
struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>,
}
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
struct ParseStateQueueEntry {
preceding_symbols: SymbolSequence,
item_set: ParseItemSet,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
state_id: ParseStateId,
}
struct ParseTableBuilder<'a> {
item_set_builder: ParseItemSetBuilder,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
simple_aliases: &'a AliasMap,
state_ids_by_item_set: HashMap<ParseItemSet, ParseStateId>,
item_sets_by_state_id: Vec<&'a ParseItemSet>,
item_sets_by_state_id: Vec<ParseItemSet>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
parse_table: ParseTable,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
// Ensure that the empty alias sequence has index 0.
self.parse_table.alias_sequences.push(Vec::new());
// Ensure that the error state has index 0.
let error_state_id = self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::default(),
);
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(&[(ParseItem::start(), LookaheadSet::with(&[Symbol::end()]))]),
);
self.process_parse_state_queue()?;
self.populate_used_symbols();
Err(Error::grammar("oh no"))
}
fn add_parse_state(
&mut self,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
Entry::Occupied(o) => {
// eprintln!("Item set already processed at state {}", *o.get());
*o.get()
}
Entry::Vacant(v) => {
// eprintln!("Item set not yet processed");
let state_id = self.parse_table.states.len();
self.item_sets_by_state_id.push(v.key().clone());
self.parse_table.states.push(ParseState {
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_symbols: preceding_symbols.clone(),
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
state_id
}
}
}
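
`add_parse_state` is a hash-consing step: each distinct item set is assigned one state id via the `Entry` API, and revisiting an already-seen set returns the existing id instead of enqueuing more work. The same pattern in isolation, with toy `String` keys instead of `ParseItemSet`:

```rust
use std::collections::hash_map::Entry;
use std::collections::HashMap;

fn intern(
    key: String,
    ids_by_key: &mut HashMap<String, usize>,
    keys_by_id: &mut Vec<String>,
) -> usize {
    match ids_by_key.entry(key) {
        // Seen before: reuse the existing id.
        Entry::Occupied(o) => *o.get(),
        // New key: assign the next id and remember the key by id.
        Entry::Vacant(v) => {
            let id = keys_by_id.len();
            keys_by_id.push(v.key().clone());
            v.insert(id);
            id
        }
    }
}

fn main() {
    let mut ids = HashMap::new();
    let mut keys = Vec::new();
    assert_eq!(intern("a".into(), &mut ids, &mut keys), 0);
    assert_eq!(intern("b".into(), &mut ids, &mut keys), 1);
    assert_eq!(intern("a".into(), &mut ids, &mut keys), 0); // deduplicated
}
```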
fn process_parse_state_queue(&mut self) -> Result<()> {
while let Some(entry) = self.parse_state_queue.pop_front() {
println!(
"ITEM SET {}:\n{}",
entry.state_id,
self.item_sets_by_state_id[entry.state_id].display_with(
&self.syntax_grammar,
&self.lexical_grammar,
&self.item_set_builder.inlines
)
);
let item_set = self.item_set_builder.transitive_closure(
&self.item_sets_by_state_id[entry.state_id],
self.syntax_grammar,
);
// println!("TRANSITIVE CLOSURE:");
// for item in item_set.entries.keys() {
// println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines));
// }
// println!("");
self.add_actions(
entry.preceding_symbols,
entry.preceding_auxiliary_symbols,
item_set,
entry.state_id,
)?;
}
Ok(())
}
fn add_actions(
&mut self,
mut preceding_symbols: SymbolSequence,
mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
item_set: ParseItemSet,
state_id: ParseStateId,
) -> Result<()> {
let mut terminal_successors = HashMap::new();
let mut non_terminal_successors = HashMap::new();
let mut lookaheads_with_conflicts = HashSet::new();
for (item, lookaheads) in &item_set.entries {
if let Some(next_symbol) =
item.symbol(self.syntax_grammar, &self.item_set_builder.inlines)
{
let successor = item.successor();
if next_symbol.is_non_terminal() {
// Keep track of where auxiliary non-terminals (repeat symbols) are
// used within visible symbols. This information may be needed later
// for conflict resolution.
if self.syntax_grammar.variables[next_symbol.index].is_auxiliary() {
preceding_auxiliary_symbols
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
}
non_terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.entries
.entry(successor)
.or_insert_with(|| LookaheadSet::new())
.insert_all(lookaheads);
} else {
terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.entries
.entry(successor)
.or_insert_with(|| LookaheadSet::new())
.insert_all(lookaheads);
}
} else {
let action = if item.is_final() {
ParseAction::Accept
} else {
let production =
item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
ParseAction::Reduce {
symbol: Symbol::non_terminal(item.variable_index() as usize),
child_count: item.step_index(),
precedence: production.last_precedence(),
associativity: production.last_associativity(),
dynamic_precedence: production.dynamic_precedence,
alias_sequence_id: self.get_alias_sequence_id(item),
}
};
for lookahead in lookaheads.iter() {
let entry = self.parse_table.states[state_id]
.terminal_entries
.entry(lookahead);
let entry = entry.or_insert_with(|| ParseTableEntry::new());
if entry.actions.is_empty() {
entry.actions.push(action);
} else if action.precedence() > entry.actions[0].precedence() {
entry.actions.clear();
entry.actions.push(action);
lookaheads_with_conflicts.remove(&lookahead);
} else if action.precedence() == entry.actions[0].precedence() {
entry.actions.push(action);
lookaheads_with_conflicts.insert(lookahead);
}
}
}
}
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
let entry = self.parse_table.states[state_id]
.terminal_entries
.entry(symbol);
if let Entry::Occupied(e) = &entry {
if !e.get().actions.is_empty() {
lookaheads_with_conflicts.insert(symbol);
}
}
entry
.or_insert_with(|| ParseTableEntry::new())
.actions
.push(ParseAction::Shift {
state: next_state_id,
is_repetition: false,
});
}
for (symbol, next_item_set) in non_terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, next_state_id);
}
for symbol in lookaheads_with_conflicts {
self.handle_conflict(
&item_set,
state_id,
&preceding_symbols,
&preceding_auxiliary_symbols,
symbol,
)?;
}
Ok(())
}
fn handle_conflict(
&mut self,
item_set: &ParseItemSet,
state_id: ParseStateId,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
conflicting_lookahead: Symbol,
) -> Result<()> {
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
// Determine which items in the set conflict with each other, and the
// precedences associated with SHIFT vs REDUCE actions. There won't
// be multiple REDUCE actions with different precedences; that is
// sorted out ahead of time in `add_actions`. But there can still be
// REDUCE-REDUCE conflicts where all actions have the *same*
// precedence, and there can still be SHIFT/REDUCE conflicts.
let reduce_precedence = entry.actions[0].precedence();
let mut considered_associativity = false;
let mut shift_precedence: Option<Range<i32>> = None;
let mut conflicting_items = HashSet::new();
for (item, lookaheads) in &item_set.entries {
let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
let step_index = item.step_index();
if let Some(step) = production.steps.get(step_index) {
if step_index > 0 {
if self
.item_set_builder
.first_set(&step.symbol)
.contains(&conflicting_lookahead)
{
conflicting_items.insert(item);
let precedence = production.steps[step_index - 1].precedence;
if let Some(range) = &mut shift_precedence {
if precedence < range.start {
range.start = precedence;
} else if precedence > range.end {
range.end = precedence;
}
} else {
shift_precedence = Some(precedence..precedence);
}
}
}
} else if lookaheads.contains(&conflicting_lookahead) {
conflicting_items.insert(item);
}
}
if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
let shift_precedence = shift_precedence.unwrap_or(0..0);
// If all of the items in the conflict have the same parent symbol,
// and that parent symbol is auxiliary, then this is just the intentional
// ambiguity associated with a repeat rule. Resolve that class of ambiguity
// by leaving it in the parse table, but marking the SHIFT action with
// an `is_repetition` flag.
let conflicting_variable_index =
conflicting_items.iter().next().unwrap().variable_index();
if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary() {
if conflicting_items
.iter()
.all(|item| item.variable_index() == conflicting_variable_index)
{
*is_repetition = true;
return Ok(());
}
}
// If the SHIFT action has higher precedence, remove all the REDUCE actions.
if shift_precedence.start > reduce_precedence
|| (shift_precedence.start == reduce_precedence
&& shift_precedence.end > reduce_precedence)
{
entry.actions.drain(0..entry.actions.len() - 1);
}
// If the REDUCE actions have higher precedence, remove the SHIFT action.
else if shift_precedence.end < reduce_precedence
|| (shift_precedence.end == reduce_precedence
&& shift_precedence.start < reduce_precedence)
{
entry.actions.pop();
conflicting_items.retain(|item| {
item.step(&self.syntax_grammar, &self.item_set_builder.inlines)
.is_none()
});
}
// If the SHIFT and REDUCE actions have the same precedence, consider
// the REDUCE actions' associativity.
else if shift_precedence == (reduce_precedence..reduce_precedence) {
considered_associativity = true;
let mut has_left = false;
let mut has_right = false;
let mut has_non = false;
for action in &entry.actions {
if let ParseAction::Reduce { associativity, .. } = action {
match associativity {
Some(Associativity::Left) => has_left = true,
Some(Associativity::Right) => has_right = true,
None => has_non = true,
}
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
conflicting_items.retain(|item| {
item.step(&self.syntax_grammar, &self.item_set_builder.inlines)
.is_none()
});
}
(false, false, true) => {
entry.actions.drain(0..entry.actions.len() - 1);
}
_ => {}
}
}
}
// If all of the actions but one have been eliminated, then there's no problem.
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
if entry.actions.len() == 1 {
return Ok(());
}
// Determine the set of parent symbols involved in this conflict.
let mut actual_conflict = Vec::new();
for item in &conflicting_items {
let symbol = Symbol::non_terminal(item.variable_index() as usize);
if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
actual_conflict.extend(
preceding_auxiliary_symbols
.iter()
.rev()
.find_map(|info| {
if info.auxiliary_symbol == symbol {
Some(&info.parent_symbols)
} else {
None
}
})
.unwrap()
.iter(),
);
} else {
actual_conflict.push(symbol);
}
}
actual_conflict.sort_unstable();
actual_conflict.dedup();
// If this set of symbols has been whitelisted, then there's no error.
if self
.syntax_grammar
.expected_conflicts
.contains(&actual_conflict)
{
return Ok(());
}
let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols {
write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
}
write!(
&mut msg,
" • {} …\n\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
write!(&mut msg, "Possible interpretations:\n").unwrap();
for (i, item) in conflicting_items.iter().enumerate() {
write!(&mut msg, "\n {}:", i).unwrap();
for preceding_symbol in preceding_symbols
.iter()
.take(preceding_symbols.len() - item.step_index())
{
write!(&mut msg, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
write!(
&mut msg,
" ({}",
&self.syntax_grammar.variables[item.variable_index() as usize].name
)
.unwrap();
for (j, step) in item
.production(&self.syntax_grammar, &self.item_set_builder.inlines)
.steps
.iter()
.enumerate()
{
if j == item.step_index() {
write!(&mut msg, "").unwrap();
}
write!(&mut msg, " {}", self.symbol_name(&step.symbol)).unwrap();
}
write!(&mut msg, ")").unwrap();
if item
.step(&self.syntax_grammar, &self.item_set_builder.inlines)
.is_none()
{
write!(
&mut msg,
" • {}",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence(&self.syntax_grammar, &self.item_set_builder.inlines);
let associativity =
item.associativity(&self.syntax_grammar, &self.item_set_builder.inlines);
if precedence != 0 || associativity.is_some() {
write!(
&mut msg,
"(precedence: {}, associativity: {:?})",
precedence, associativity
)
.unwrap();
}
}
// TODO - generate suggested resolutions
Err(Error::ConflictError(msg))
}
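
The precedence comparison above can be read as: SHIFT wins when its precedence range sits strictly above the reduce precedence, REDUCE wins in the mirrored case, and exact equality falls through to associativity. A toy restatement of just that decision, with illustrative names rather than the crate's API (overlapping-but-unequal ranges are lumped into `Unresolved` here):

```rust
use std::ops::Range;

#[derive(Debug, PartialEq)]
enum Resolution {
    Shift,
    Reduce,
    Unresolved,
}

fn resolve(shift: Range<i32>, reduce: i32) -> Resolution {
    if shift.start > reduce || (shift.start == reduce && shift.end > reduce) {
        Resolution::Shift
    } else if shift.end < reduce || (shift.end == reduce && shift.start < reduce) {
        Resolution::Reduce
    } else {
        // Equal precedence: the real code falls back to associativity.
        Resolution::Unresolved
    }
}

fn main() {
    // Shifting `expr '*' expr` (prec 2) beats reducing `expr '+' expr` (prec 1).
    assert_eq!(resolve(2..2, 1), Resolution::Shift);
    // Equal precedence: left associativity would drop the shift instead.
    assert_eq!(resolve(1..1, 1), Resolution::Unresolved);
}
```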
fn get_auxiliary_node_info(
&self,
item_set: &ParseItemSet,
symbol: Symbol,
) -> AuxiliarySymbolInfo {
let parent_symbols = item_set
.entries
.keys()
.filter_map(|item| {
if item.symbol(&self.syntax_grammar, &self.item_set_builder.inlines) == Some(symbol)
{
Some(Symbol::non_terminal(item.variable_index() as usize))
} else {
None
}
})
.collect();
AuxiliarySymbolInfo {
auxiliary_symbol: symbol,
parent_symbols,
}
}
fn populate_used_symbols(&mut self) {
let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()];
let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()];
let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()];
for state in &self.parse_table.states {
for symbol in state.terminal_entries.keys() {
match symbol.kind {
SymbolType::Terminal => terminal_usages[symbol.index] = true,
SymbolType::External => external_usages[symbol.index] = true,
_ => {}
}
}
for symbol in state.nonterminal_entries.keys() {
non_terminal_usages[symbol.index] = true;
}
}
for (i, value) in terminal_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::terminal(i));
}
}
for (i, value) in non_terminal_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::non_terminal(i));
}
}
for (i, value) in external_usages.into_iter().enumerate() {
if value {
self.parse_table.symbols.push(Symbol::external(i));
}
}
}
fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId {
let production = item.production(&self.syntax_grammar, &self.item_set_builder.inlines);
let alias_sequence = production.steps.iter().map(|s| s.alias.clone()).collect();
if let Some(index) = self
.parse_table
.alias_sequences
.iter()
.position(|seq| *seq == alias_sequence)
{
index
} else {
self.parse_table.alias_sequences.push(alias_sequence);
self.parse_table.alias_sequences.len() - 1
}
}
fn symbol_name(&self, symbol: &Symbol) -> String {
match symbol.kind {
SymbolType::End => "EOF".to_string(),
SymbolType::External => self.syntax_grammar.external_tokens[symbol.index]
.name
.clone(),
SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(),
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
if variable.kind == VariableType::Named {
variable.name.clone()
} else {
format!("\"{}\"", &variable.name)
}
}
}
}
}
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
simple_aliases: &AliasMap
simple_aliases: &AliasMap,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
unimplemented!();
ParseTableBuilder {
syntax_grammar,
lexical_grammar,
simple_aliases,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar),
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
alias_sequences: Vec::new(),
symbols: Vec::new(),
},
}
.build()
}

View file

@ -3,6 +3,7 @@ pub enum Error {
GrammarError(String),
SymbolError(String),
RegexError(String),
ConflictError(String),
}
pub type Result<T> = std::result::Result<T, Error>;

View file

@ -4,8 +4,8 @@ use crate::prepare_grammar::prepare_grammar;
use crate::build_tables::build_tables;
use crate::render::render_c_code;
pub fn generate_parser_for_grammar(input: String) -> Result<String> {
let input_grammar = parse_grammar(&input)?;
pub fn generate_parser_for_grammar(input: &str) -> Result<String> {
let input_grammar = parse_grammar(input)?;
let (syntax_grammar, lexical_grammar, simple_aliases) = prepare_grammar(&input_grammar)?;
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
&syntax_grammar,

View file

@ -38,7 +38,7 @@ pub(crate) struct LexicalVariable {
pub start_state: u32,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
pub nfa: Nfa,
pub variables: Vec<LexicalVariable>,
@ -112,6 +112,14 @@ impl Production {
pub fn first_symbol(&self) -> Option<Symbol> {
self.steps.first().map(|s| s.symbol.clone())
}
pub fn last_precedence(&self) -> i32 {
self.steps.last().map(|s| s.precedence).unwrap_or(0)
}
pub fn last_associativity(&self) -> Option<Associativity> {
self.steps.last().and_then(|s| s.associativity)
}
}
impl Default for Production {
@ -137,3 +145,9 @@ impl Variable {
Self { name: name.to_string(), kind: VariableType::Anonymous, rule }
}
}
impl SyntaxVariable {
pub fn is_auxiliary(&self) -> bool {
self.kind == VariableType::Auxiliary
}
}

src/js/dsl.js (new file, 334 lines)
View file

@ -0,0 +1,334 @@
const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi;
const DELIMITER_ESCAPE_PATTERN = /\\\//g;
function alias(rule, value) {
const result = {
type: "ALIAS",
content: normalize(rule),
named: false,
value: null
};
switch (value.constructor) {
case String:
result.named = false;
result.value = value;
return result;
case ReferenceError:
result.named = true;
result.value = value.symbol.name;
return result;
case Object:
if (typeof value.type === 'string' && value.type === 'SYMBOL') {
result.named = true;
result.value = value.name;
return result;
}
}
throw new Error('Invalid alias value ' + value);
}
function blank() {
return {
type: "BLANK"
};
}
function choice(...elements) {
return {
type: "CHOICE",
members: elements.map(normalize)
};
}
function optional(value) {
return choice(value, blank());
}
function prec(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC",
value: number,
content: normalize(rule)
};
}
prec.left = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC_LEFT",
value: number,
content: normalize(rule)
};
}
prec.right = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
return {
type: "PREC_RIGHT",
value: number,
content: normalize(rule)
};
}
prec.dynamic = function(number, rule) {
return {
type: "PREC_DYNAMIC",
value: number,
content: normalize(rule)
};
}
function repeat(rule) {
return {
type: "REPEAT",
content: normalize(rule)
};
}
function repeat1(rule) {
return {
type: "REPEAT1",
content: normalize(rule)
};
}
function seq(...elements) {
return {
type: "SEQ",
members: elements.map(normalize)
};
}
function sym(name) {
return {
type: "SYMBOL",
name: name
};
}
function token(value) {
return {
type: "TOKEN",
content: normalize(value)
};
}
token.immediate = function(value) {
return {
type: "IMMEDIATE_TOKEN",
content: normalize(value)
};
}
function normalize(value) {
if (typeof value == "undefined")
throw new Error("Undefined symbol");
switch (value.constructor) {
case String:
return {
type: 'STRING',
value
};
case RegExp:
return {
type: 'PATTERN',
value: value.source
.replace(
DELIMITER_ESCAPE_PATTERN,
'/'
)
.replace(
UNICODE_ESCAPE_PATTERN,
(match, group) => String.fromCharCode(parseInt(group, 16))
)
};
case ReferenceError:
throw value
default:
if (typeof value.type === 'string') {
return value;
} else {
throw new TypeError("Invalid rule: " + value.toString());
}
}
}
function RuleBuilder(ruleMap) {
return new Proxy({}, {
get(target, propertyName) {
const symbol = {
type: 'SYMBOL',
name: propertyName
};
if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
return symbol;
} else {
const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
error.symbol = symbol;
return error;
}
}
})
}
function grammar(baseGrammar, options) {
if (!options) {
options = baseGrammar;
baseGrammar = {
name: null,
rules: {},
extras: [normalize(/\s/)],
conflicts: [],
externals: [],
inline: []
};
}
let externals = baseGrammar.externals;
if (options.externals) {
if (typeof options.externals !== "function") {
throw new Error("Grammar's 'externals' property must be a function.");
}
const externalsRuleBuilder = RuleBuilder(null)
const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals);
if (!Array.isArray(externalRules)) {
throw new Error("Grammar's 'externals' property must return an array of rules.");
}
externals = externalRules.map(normalize);
}
const ruleMap = {};
for (const key in options.rules) {
ruleMap[key] = true;
}
for (const key in baseGrammar.rules) {
ruleMap[key] = true;
}
for (const external of externals) {
if (typeof external.name === 'string') {
ruleMap[external.name] = true;
}
}
const ruleBuilder = RuleBuilder(ruleMap);
const name = options.name;
if (typeof name !== "string") {
throw new Error("Grammar's 'name' property must be a string.");
}
if (!/^[a-zA-Z_]\w*$/.test(name)) {
throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
}
let rules = Object.assign({}, baseGrammar.rules);
if (options.rules) {
if (typeof options.rules !== "object") {
throw new Error("Grammar's 'rules' property must be an object.");
}
for (const ruleName in options.rules) {
const ruleFn = options.rules[ruleName];
if (typeof ruleFn !== "function") {
throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
}
rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
}
}
let extras = baseGrammar.extras.slice();
if (options.extras) {
if (typeof options.extras !== "function") {
throw new Error("Grammar's 'extras' property must be a function.");
}
extras = options.extras
.call(ruleBuilder, ruleBuilder, baseGrammar.extras)
.map(normalize);
}
let word = baseGrammar.word;
if (options.word) {
word = options.word.call(ruleBuilder, ruleBuilder).name;
if (typeof word != 'string') {
throw new Error("Grammar's 'word' property must be a named rule.");
}
}
let conflicts = baseGrammar.conflicts;
if (options.conflicts) {
if (typeof options.conflicts !== "function") {
throw new Error("Grammar's 'conflicts' property must be a function.");
}
const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym));
const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules);
if (!Array.isArray(conflictRules)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
conflicts = conflictRules.map(conflictSet => {
if (!Array.isArray(conflictSet)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
return conflictSet.map(symbol => symbol.name);
});
}
let inline = baseGrammar.inline;
if (options.inline) {
if (typeof options.inline !== "function") {
throw new Error("Grammar's 'inline' property must be a function.");
}
const baseInlineRules = baseGrammar.inline.map(sym);
const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules);
if (!Array.isArray(inlineRules)) {
throw new Error("Grammar's inline must be an array of rules.");
}
inline = inlineRules.map(symbol => symbol.name);
}
if (Object.keys(rules).length == 0) {
throw new Error("Grammar must have at least one rule.");
}
return {name, word, rules, extras, conflicts, externals, inline};
}
global.alias = alias;
global.blank = blank;
global.choice = choice;
global.optional = optional;
global.prec = prec;
global.repeat = repeat;
global.repeat1 = repeat1;
global.seq = seq;
global.sym = sym;
global.token = token;
global.grammar = grammar;
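
For context, a hypothetical grammar.js written against this DSL might look as follows (illustrative only; the `arithmetic` grammar and its rule names are not part of this commit). The `load_js_grammar_file` function added to main.rs later in this commit evaluates such a file after the prelude above and serializes the result to JSON:

```js
// Hypothetical grammar.js for a toy arithmetic language. `$` is the
// RuleBuilder proxy; prec.left resolves the '+' / '*' shift-reduce conflicts.
module.exports = grammar({
  name: 'arithmetic',

  rules: {
    expression: $ => choice(
      prec.left(1, seq($.expression, '+', $.expression)),
      prec.left(2, seq($.expression, '*', $.expression)),
      $.number
    ),

    number: $ => /\d+/
  }
});
```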

View file

@ -1,8 +1,15 @@
use clap::{App, Arg, SubCommand};
#[macro_use]
extern crate serde_derive;
#[macro_use]
extern crate serde_json;
#[macro_use]
extern crate lazy_static;
#[macro_use] extern crate serde_derive;
#[macro_use] extern crate serde_json;
#[macro_use] extern crate lazy_static;
use std::path::PathBuf;
use clap::{App, Arg, SubCommand};
use std::env;
use std::io::Write;
use std::process::{Command, Stdio};
mod build_tables;
mod error;
@ -20,25 +27,59 @@ fn main() -> error::Result<()> {
.version("0.1")
.author("Max Brunsfeld <maxbrunsfeld@gmail.com>")
.about("Generates and tests parsers")
.subcommand(SubCommand::with_name("generate").about("Generate a parser"))
.subcommand(
SubCommand::with_name("generate")
.about("Generate a parser")
).subcommand(
SubCommand::with_name("parse")
.about("Parse a file")
.arg(Arg::with_name("path").index(1))
).subcommand(
.arg(Arg::with_name("path").index(1)),
)
.subcommand(
SubCommand::with_name("test")
.about("Run a parser's tests")
.arg(Arg::with_name("path").index(1).required(true))
.arg(Arg::with_name("line").index(2).required(true))
.arg(Arg::with_name("column").index(3).required(true))
).get_matches();
.arg(Arg::with_name("column").index(3).required(true)),
)
.get_matches();
if let Some(matches) = matches.subcommand_matches("generate") {
let code = generate::generate_parser_for_grammar(String::new())?;
let mut grammar_path = env::current_dir().expect("Failed to read CWD");
grammar_path.push("grammar.js");
let grammar_json = load_js_grammar_file(grammar_path);
let code = generate::generate_parser_for_grammar(&grammar_json)?;
println!("{}", code);
}
Ok(())
}
fn load_js_grammar_file(grammar_path: PathBuf) -> String {
let mut node_process = Command::new("node")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.expect("Failed to run `node`");
let js_prelude = include_str!("./js/dsl.js");
let mut node_stdin = node_process
.stdin
.take()
.expect("Failed to open stdin for node");
write!(
node_stdin,
"{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n",
js_prelude,
grammar_path.to_str().unwrap()
).expect("Failed to write to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()
.expect("Failed to read output from node");
match output.status.code() {
None => panic!("Node process was killed"),
Some(0) => {}
Some(code) => panic!("Node process exited with status {}", code),
}
String::from_utf8(output.stdout).expect("Got invalid UTF8 from node")
}

View file

@ -23,6 +23,12 @@ pub struct Nfa {
pub states: Vec<NfaState>
}
impl Default for Nfa {
fn default() -> Self {
Self { states: Vec::new() }
}
}
#[derive(Debug)]
pub struct NfaCursor<'a> {
pub(crate) state_ids: Vec<u32>,

View file

@ -22,6 +22,7 @@ pub(super) fn extract_simple_aliases(
Symbol { kind: SymbolType::External, index} => &mut external_status_list[index],
Symbol { kind: SymbolType::NonTerminal, index} => &mut non_terminal_status_list[index],
Symbol { kind: SymbolType::Terminal, index} => &mut terminal_status_list[index],
Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
};
if step.alias.is_none() {
@ -49,6 +50,7 @@ pub(super) fn extract_simple_aliases(
Symbol { kind: SymbolType::External, index} => &external_status_list[index],
Symbol { kind: SymbolType::NonTerminal, index} => &non_terminal_status_list[index],
Symbol { kind: SymbolType::Terminal, index} => &terminal_status_list[index],
Symbol { kind: SymbolType::End, .. } => panic!("Unexpected end token"),
};
if status.alias.is_some() {

View file

@ -67,10 +67,13 @@ pub(super) fn extract_tokens(
.expected_conflicts
.into_iter()
.map(|conflict| {
conflict
let mut result: Vec<_> = conflict
.iter()
.map(|symbol| symbol_replacer.replace_symbol(*symbol))
.collect()
.collect();
result.sort_unstable();
result.dedup();
result
})
.collect();

View file

@ -1,6 +1,188 @@
use crate::rules::{Symbol, AliasMap};
use crate::grammars::{SyntaxGrammar, LexicalGrammar};
use crate::tables::{ParseTable, LexTable};
use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
use crate::tables::{LexTable, ParseTable, ParseTableEntry};
use std::collections::{HashMap, HashSet};
use std::fmt::Write;
macro_rules! add_line {
($this: tt, $($arg: tt)*) => {
for _ in 0..$this.indent_level {
write!(&mut $this.buffer, " ").unwrap();
}
$this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
$this.buffer += "\n";
}
}
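
A runnable illustration of the `add_line!` pattern just defined: indentation is emitted per line, and the formatted text accumulates in a `String` buffer. The `Gen` struct here is a toy stand-in for the real `Generator`:

```rust
use std::fmt::Write;

macro_rules! add_line {
    ($this:tt, $($arg:tt)*) => {
        for _ in 0..$this.indent_level {
            write!(&mut $this.buffer, "  ").unwrap();
        }
        $this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
        $this.buffer += "\n";
    }
}

struct Gen {
    buffer: String,
    indent_level: usize,
}

fn main() {
    let mut g = Gen { buffer: String::new(), indent_level: 0 };
    add_line!(g, "enum {{");
    g.indent_level += 1;
    add_line!(g, "{} = {},", "sym_identifier", 1);
    g.indent_level -= 1;
    add_line!(g, "}};");
    // Prints:
    // enum {
    //   sym_identifier = 1,
    // };
    print!("{}", g.buffer);
}
```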
struct Generator {
buffer: String,
indent_level: usize,
language_name: String,
parse_table: ParseTable,
main_lex_table: LexTable,
keyword_lex_table: LexTable,
keyword_capture_token: Option<Symbol>,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
simple_aliases: AliasMap,
symbol_ids: HashMap<Symbol, String>,
parse_table_entries: Vec<(usize, ParseTableEntry)>,
next_parse_action_list_index: usize,
unique_aliases: HashSet<Alias>,
}
impl Generator {
fn generate(mut self) -> String {
self.add_includes();
self.add_pragmas();
self.add_stats();
self.add_symbol_enum();
self.add_symbol_names_list();
self.buffer
}
fn add_includes(&mut self) {
add_line!(self, "#include <tree_sitter/parser.h>");
add_line!(self, "");
}
fn add_pragmas(&mut self) {
add_line!(self, "#if defined(__GNUC__) || defined(__clang__)");
add_line!(self, "#pragma GCC diagnostic push");
add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
add_line!(self, "#endif");
add_line!(self, "");
// Compiling large lexer functions can be very slow, especially when
// using Visual Studio on Windows. Disabling optimizations is not
// ideal, but only a very small fraction of overall parse time is
// spent lexing, so the performance impact of this is pretty small.
if self.main_lex_table.states.len() > 500 {
add_line!(self, "#ifdef _MSC_VER");
add_line!(self, "#pragma optimize(\"\", off)");
add_line!(self, "#endif");
add_line!(self, "");
}
}
fn add_stats(&mut self) {
let mut token_count = 0;
for symbol in &self.parse_table.symbols {
if symbol.is_terminal() {
token_count += 1;
} else if symbol.is_external() {
let external_token = &self.syntax_grammar.external_tokens[symbol.index];
if external_token.corresponding_internal_token.is_none() {
token_count += 1;
}
}
}
for alias_sequence in &self.parse_table.alias_sequences {
for entry in alias_sequence {
if let Some(alias) = entry {
self.unique_aliases.insert(alias.clone());
}
}
}
let mut symbol_id_values = HashSet::new();
for i in 0..self.parse_table.symbols.len() {
self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values);
}
add_line!(self, "#define LANGUAGE_VERSION {}", 6);
add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len());
add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len());
add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len());
add_line!(self, "#define TOKEN_COUNT {}", token_count);
add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len());
// add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length);
add_line!(self, "");
}
fn add_symbol_enum(&mut self) {
add_line!(self, "enum {{");
self.indent();
for i in 0..self.parse_table.symbols.len() {
let symbol = self.parse_table.symbols[i];
if symbol != Symbol::end() {
add_line!(self, "{} = {}", self.symbol_ids[&symbol], i);
}
}
self.dedent();
add_line!(self, "}};");
add_line!(self, "");
}
fn add_symbol_names_list(&mut self) {
add_line!(self, "static const char *ts_symbol_names[] = {{");
self.indent();
self.dedent();
add_line!(self, "}};");
add_line!(self, "");
}
fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet<String>) {
let mut id;
if symbol == Symbol::end() {
id = "ts_builtin_sym_end".to_string();
} else {
let (name, kind) = self.metadata_for_symbol(symbol);
id = match kind {
VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)),
VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)),
VariableType::Hidden | VariableType::Named => {
format!("sym_{}", self.sanitize_name(name))
}
};
let mut suffix_number = 1;
let mut suffix = String::new();
while used_ids.contains(&id) {
id.drain(id.len() - suffix.len()..);
suffix_number += 1;
suffix = suffix_number.to_string();
id += &suffix;
}
}
used_ids.insert(id.clone());
self.symbol_ids.insert(symbol, id);
}
fn metadata_for_symbol(&self, symbol: Symbol) -> (&str, VariableType) {
match symbol.kind {
SymbolType::End => ("end", VariableType::Auxiliary),
SymbolType::NonTerminal => {
let variable = &self.syntax_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
(&variable.name, variable.kind)
}
SymbolType::External => {
let token = &self.syntax_grammar.external_tokens[symbol.index];
(&token.name, token.kind)
}
}
}
fn sanitize_name(&self, name: &str) -> String {
name.to_string()
}
fn indent(&mut self) {
self.indent_level += 1;
}
fn dedent(&mut self) {
self.indent_level -= 1;
}
}
pub(crate) fn render_c_code(
name: &str,
@ -12,5 +194,21 @@ pub(crate) fn render_c_code(
lexical_grammar: LexicalGrammar,
simple_aliases: AliasMap,
) -> String {
unimplemented!();
Generator {
buffer: String::new(),
indent_level: 0,
language_name: name.to_string(),
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
syntax_grammar,
lexical_grammar,
simple_aliases,
symbol_ids: HashMap::new(),
parse_table_entries: Vec::new(),
next_parse_action_list_index: 0,
unique_aliases: HashSet::new(),
}
.generate()
}

View file

@ -1,10 +1,11 @@
use std::collections::HashMap;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum SymbolType {
External,
Terminal,
NonTerminal,
End,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@ -33,7 +34,7 @@ pub(crate) struct MetadataParams {
pub alias: Option<Alias>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Symbol {
pub kind: SymbolType,
pub index: usize,
@ -56,6 +57,15 @@ pub(crate) enum Rule {
}
impl Rule {
pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
add_metadata(content, move |params| {
params.alias = Some(Alias {
is_named,
value
});
})
}
pub fn token(content: Rule) -> Self {
add_metadata(content, |params| {
params.is_token = true;
@ -169,6 +179,13 @@ impl Symbol {
index,
}
}
pub fn end() -> Self {
Symbol {
kind: SymbolType::End,
index: 0,
}
}
}
impl From<Symbol> for Rule {
@ -177,7 +194,7 @@ impl From<Symbol> for Rule {
}
}
fn add_metadata<T: Fn(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
match input {
Rule::Metadata { rule, mut params } => {
f(&mut params);

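The relaxation of `add_metadata`'s bound from `Fn` to `FnOnce` in this hunk is what allows the new `Rule::alias` closure to move `value` into the params. A minimal sketch of the distinction, using hypothetical names:

```rust
// apply_once is a hypothetical stand-in for add_metadata: it accepts a
// closure that may consume captured values, which an Fn bound would reject.
fn apply_once<F: FnOnce(&mut Vec<String>)>(target: &mut Vec<String>, f: F) {
    f(target);
}

fn main() {
    let value = String::from("keyword");
    let mut params = Vec::new();
    // Moves `value` into the vector, much like Rule::alias moving `value`
    // into the Alias it constructs.
    apply_once(&mut params, move |p| p.push(value));
    assert_eq!(params, ["keyword"]);
}
```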
View file

@ -6,20 +6,13 @@ pub(crate) type AliasSequenceId = usize;
pub(crate) type ParseStateId = usize;
pub(crate) type LexStateId = usize;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum ParseActionType {
Error,
Shift,
Reduce,
Accept,
Recover,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum ParseAction {
Accept,
Error,
Shift(ParseStateId),
Shift {
state: ParseStateId,
is_repetition: bool,
},
ShiftExtra,
Recover,
Reduce {
@ -28,50 +21,69 @@ pub(crate) enum ParseAction {
precedence: i32,
dynamic_precedence: i32,
associativity: Option<Associativity>,
alias_sequence_id: Option<AliasSequenceId>,
is_repetition: bool,
alias_sequence_id: AliasSequenceId,
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
actions: Vec<ParseAction>,
reusable: bool,
pub actions: Vec<ParseAction>,
pub reusable: bool,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseState {
terminal_entries: HashMap<Symbol, ParseTableEntry>,
nonterminal_entries: HashMap<Symbol, ParseStateId>
pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
pub nonterminal_entries: HashMap<Symbol, ParseStateId>
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParseTable {
states: Vec<ParseState>,
alias_sequences: Vec<Vec<Alias>>,
pub states: Vec<ParseState>,
pub symbols: Vec<Symbol>,
pub alias_sequences: Vec<Vec<Option<Alias>>>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AdvanceAction {
state: LexStateId,
precedence: Range<i32>,
in_main_token: bool,
pub state: LexStateId,
pub precedence: Range<i32>,
pub in_main_token: bool,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AcceptTokenAction {
symbol: Symbol,
precedence: i32,
implicit_precedence: i32,
pub symbol: Symbol,
pub precedence: i32,
pub implicit_precedence: i32,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct LexState {
advance_actions: HashMap<Symbol, AdvanceAction>,
accept_action: Option<AcceptTokenAction>,
pub advance_actions: HashMap<Symbol, AdvanceAction>,
pub accept_action: Option<AcceptTokenAction>,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexTable {
states: Vec<LexState>,
pub states: Vec<LexState>,
}
impl ParseTableEntry {
pub fn new() -> Self {
Self {
reusable: true,
actions: Vec::new(),
}
}
}
impl ParseAction {
pub fn precedence(&self) -> i32 {
if let ParseAction::Reduce { precedence, .. } = self {
*precedence
} else {
0
}
}
}