From 889f232b4ca2cbdc932510bb75da6f686059eceb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Dec 2018 16:05:36 -0800 Subject: [PATCH] Implement variable inlining --- Cargo.lock | 15 +- Cargo.toml | 3 +- src/build_tables/inline_variables.rs | 318 +++++++++++++++++++++++++++ src/build_tables/item.rs | 213 ++++++++++++++++-- src/build_tables/mod.rs | 1 + src/grammars.rs | 12 + src/main.rs | 1 + src/parse_grammar.rs | 1 - src/rules.rs | 34 ++- 9 files changed, 567 insertions(+), 31 deletions(-) create mode 100644 src/build_tables/inline_variables.rs diff --git a/Cargo.lock b/Cargo.lock index d5109fb7..410580fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,11 +67,6 @@ name = "bitflags" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "bitvec" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "blake2-rfc" version = "0.2.18" @@ -461,16 +456,17 @@ dependencies = [ name = "rust-tree-sitter-cli" version = "0.1.0" dependencies = [ - "bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.3 
(registry+https://github.com/rust-lang/crates.io-index)", ] @@ -548,6 +544,11 @@ dependencies = [ "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "smallbitvec" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "smallvec" version = "0.6.7" @@ -729,7 +730,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "89a47830402e9981c5c41223151efcced65a0510c13097c769cede7efb34782a" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum bitvec 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e37e2176261200377c7cde4c6de020394174df556c356f965e4bc239f5ce1c5a" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" @@ -787,6 +787,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "15c141fc7027dd265a47c090bf864cf62b42c4d228bbcf4e51a0c9e2b0d3f7ef" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = 
"c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" +"checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" "checksum smallvec 0.6.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b73ea3738b47563803ef814925e69be00799a8c07420be8b996f8e98fb2336db" "checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" diff --git a/Cargo.toml b/Cargo.toml index 93a49d2c..f3880a1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,8 @@ authors = ["Max Brunsfeld "] edition = "2018" [dependencies] -bitvec = "0.8" +lazy_static = "1.2.0" +smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" ignore = "0.4.4" diff --git a/src/build_tables/inline_variables.rs b/src/build_tables/inline_variables.rs new file mode 100644 index 00000000..d201519f --- /dev/null +++ b/src/build_tables/inline_variables.rs @@ -0,0 +1,318 @@ +use super::item::ParseItem; +use crate::grammars::{Production, SyntaxGrammar}; +use std::collections::HashMap; + +pub(crate) struct InlinedProductionMap { + pub inlined_productions: Vec, + item_map: HashMap>, +} + +impl InlinedProductionMap { + pub fn new(grammar: &SyntaxGrammar) -> Self { + let mut result = Self { + inlined_productions: Vec::new(), + item_map: HashMap::new(), + }; + + let mut items_to_process = Vec::new(); + for (variable_index, variable) in grammar.variables.iter().enumerate() { + for production_index in 0..variable.productions.len() { + items_to_process.push(ParseItem::Normal { + variable_index: variable_index as u32, + production_index: production_index as u32, + step_index: 0, + }); + while !items_to_process.is_empty() { + let mut i = 0; + while i < items_to_process.len() { + let item 
= &items_to_process[i]; + if let Some(step) = item.step(grammar, &result) { + if grammar.variables_to_inline.contains(&step.symbol) { + let inlined_items = result + .inline(*item, grammar) + .into_iter() + .map(|production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index: *production_index, + step_index: item.step_index() as u32, + }) + .collect::>(); + items_to_process.splice(i..i + 1, inlined_items); + } else { + items_to_process[i] = item.successor(); + i += 1; + } + } else { + items_to_process.remove(i); + } + } + } + } + } + + result + } + + pub fn inlined_items<'a>( + &'a self, + item: ParseItem, + ) -> Option + 'a> { + self.item_map.get(&item).map(|production_indices| { + production_indices + .iter() + .cloned() + .map(move |production_index| ParseItem::Inlined { + variable_index: item.variable_index(), + production_index, + step_index: item.step_index() as u32, + }) + }) + } + + fn inline(&mut self, item: ParseItem, grammar: &SyntaxGrammar) -> &Vec { + let step_index = item.step_index(); + let mut productions_to_add = grammar.variables + [item.step(grammar, self).unwrap().symbol.index] + .productions + .clone(); + + let mut i = 0; + while i < productions_to_add.len() { + if let Some(first_symbol) = productions_to_add[i].first_symbol() { + if grammar.variables_to_inline.contains(&first_symbol) { + // Remove the production from the vector, replacing it with a placeholder. + let production = productions_to_add + .splice(i..i + 1, [Production::default()].iter().cloned()) + .next() + .unwrap(); + + // Replace the placeholder with the inlined productions. 
+ productions_to_add.splice( + i..i + 1, + grammar.variables[first_symbol.index] + .productions + .iter() + .map(|p| { + let mut p = p.clone(); + p.steps.extend(production.steps[1..].iter().cloned()); + p + }), + ); + continue; + } + } + i += 1; + } + + let result = productions_to_add + .into_iter() + .map(|production_to_add| { + let mut inlined_production = item.production(grammar, &self).clone(); + inlined_production.steps.splice( + step_index..step_index + 1, + production_to_add.steps.iter().cloned(), + ); + self.inlined_productions + .iter() + .position(|p| *p == inlined_production) + .unwrap_or_else(|| { /* Append lazily: `unwrap_or` evaluated this block eagerly and pushed a duplicate even when an equal production had been found. */ + self.inlined_productions.push(inlined_production); + self.inlined_productions.len() - 1 + }) as u32 + }) + .collect(); + + self.item_map.entry(item).or_insert(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::grammars::{ProductionStep, SyntaxVariable, VariableType}; + use crate::rules::Symbol; + + #[test] + fn test_basic_inlining() { + let grammar = SyntaxGrammar { + expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + variables_to_inline: vec![Symbol::non_terminal(1)], + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ], + }], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(12)), + ProductionStep::new(Symbol::terminal(13)), + ], + }, + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(14))], + }, + ], + }, + ], + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + // Nothing to inline at step 0.
+ assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 0 + }), + &grammar, + &inline_map + ), + None + ); + + // Inlining variable 1 yields two productions. + assert_eq!( + display_items( + inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }), + &grammar, + &inline_map + ), + Some(vec![ + "terminal-10 • terminal-12 terminal-13 terminal-11".to_string(), + "terminal-10 • terminal-14 terminal-11".to_string(), + ]) + ); + } + + #[test] + fn test_nested_inlining() { + let grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "var0".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(10)), + ProductionStep::new(Symbol::non_terminal(1)), // inlined + ProductionStep::new(Symbol::terminal(11)), + ProductionStep::new(Symbol::non_terminal(2)), // inlined + ProductionStep::new(Symbol::terminal(12)), + ], + }, + ], + }, + SyntaxVariable { + name: "var1".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(13))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(3)), // inlined + ProductionStep::new(Symbol::terminal(14)), + ], + }, + ], + }, + SyntaxVariable { + name: "var2".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(15))], + }], + }, + SyntaxVariable { + name: "var3".to_string(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(16))], + }], + }, + ], + variables_to_inline: vec![ + Symbol::non_terminal(1), + Symbol::non_terminal(2), + Symbol::non_terminal(3), + ], + 
expected_conflicts: Vec::new(), + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let inline_map = InlinedProductionMap::new(&grammar); + + let items = inline_map.inlined_items(ParseItem::Normal { + variable_index: 0, + production_index: 0, + step_index: 1 + }).unwrap().collect::>(); + + assert_eq!( + display_items(Some(items.iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 • terminal-13 terminal-11 non-terminal-2 terminal-12".to_string(), + "terminal-10 • terminal-16 terminal-14 terminal-11 non-terminal-2 terminal-12".to_string() + ]) + ); + + let item = items[0].successor().successor(); + assert_eq!( + display_items(Some([item].iter().cloned()), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • non-terminal-2 terminal-12".to_string(), + ]) + ); + + assert_eq!( + display_items(inline_map.inlined_items(item), &grammar, &inline_map), + Some(vec![ + "terminal-10 terminal-13 terminal-11 • terminal-15 terminal-12".to_string(), + ]) + ); + } + + fn display_items( + items: Option>, + grammar: &SyntaxGrammar, + inline_map: &InlinedProductionMap, + ) -> Option> { + items.map(|items| { + items + .map(|item| format!("{}", item.with(grammar, inline_map))) + .collect() + }) + } +} diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index c8d30997..537b0928 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -1,22 +1,209 @@ -use crate::grammars::Production; +use super::inline_variables::InlinedProductionMap; +use crate::grammars::{Production, ProductionStep, SyntaxGrammar}; +use crate::rules::{Symbol, SymbolType}; +use smallbitvec::SmallBitVec; use std::collections::HashMap; -use bitvec::BitVec; +use std::hash::{Hash, Hasher}; +use std::fmt; -#[derive(Debug, PartialEq, Eq)] -pub(super) struct LookaheadSet { - terminal_bits: BitVec, - external_bits: BitVec, +lazy_static! 
{ + static ref START_PRODUCTION: Production = Production { + dynamic_precedence: 0, + steps: vec![ProductionStep { + symbol: Symbol { + index: 0, + kind: SymbolType::NonTerminal, + }, + precedence: 0, + associativity: None, + alias: None, + }], + }; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LookaheadSet { + terminal_bits: SmallBitVec, + external_bits: SmallBitVec, eof: bool, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub(super) struct ParseItem { - variable_index: u32, - production_index: u32, - step_index: u32, +pub(crate) enum ParseItem { + Start { + step_index: u32, + }, + Normal { + variable_index: u32, + production_index: u32, + step_index: u32, + }, + Inlined { + variable_index: u32, + production_index: u32, + step_index: u32, + }, } -#[derive(Debug, PartialEq, Eq)] -pub(super) struct ParseItemSet { - entries: HashMap +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ParseItemSet { + pub entries: HashMap<ParseItem, LookaheadSet>, +} + +impl LookaheadSet { + pub fn new() -> Self { + Self { + terminal_bits: SmallBitVec::new(), + external_bits: SmallBitVec::new(), + eof: false, + } + } + + pub fn insert(&mut self, other: Symbol) { + /* `SmallBitVec::set` panics on an out-of-range index and `new()` starts with empty vectors, so grow the target vector first (as `insert_all` does). */ + let bits = match other.kind { + SymbolType::NonTerminal => panic!("Cannot store non-terminals in a LookaheadSet"), + SymbolType::Terminal => &mut self.terminal_bits, + SymbolType::External => &mut self.external_bits, + }; + if other.index >= bits.len() { + bits.resize(other.index + 1, false); + } + bits.set(other.index, true); + } + + pub fn insert_all(&mut self, other: &LookaheadSet) -> bool { + let mut result = false; + if other.terminal_bits.len() > self.terminal_bits.len() { + self.terminal_bits.resize(other.terminal_bits.len(), false); + } + if other.external_bits.len() > self.external_bits.len() { + self.external_bits.resize(other.external_bits.len(), false); + } + for (i, element) in other.terminal_bits.iter().enumerate() { + if element { + result |= !self.terminal_bits[i]; + self.terminal_bits.set(i, element); + } + } + for (i, element) in other.external_bits.iter().enumerate() { + if
element { + result |= !self.external_bits[i]; + self.external_bits.set(i, element); + } + } + if other.eof { + result |= !self.eof; + self.eof = true; + } + result + } +} + +impl ParseItem { + pub fn is_kernel(&self) -> bool { + match self { + ParseItem::Start { .. } => true, + ParseItem::Normal { step_index, .. } | ParseItem::Inlined { step_index, .. } => { + *step_index > 0 + } + } + } + + pub fn production<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> &'a Production { + match self { + ParseItem::Start { .. } => &START_PRODUCTION, + ParseItem::Normal { + variable_index, + production_index, + .. + } => { + &grammar.variables[*variable_index as usize].productions[*production_index as usize] + } + ParseItem::Inlined { + production_index, + .. + } => &inlined_productions.inlined_productions[*production_index as usize], + } + } + + pub fn step<'a>( + &'a self, + grammar: &'a SyntaxGrammar, + inlined_productions: &'a InlinedProductionMap, + ) -> Option<&'a ProductionStep> { + self.production(grammar, inlined_productions).steps.get(self.step_index()) + } + + pub fn variable_index(&self) -> u32 { + match self { + ParseItem::Start { .. } => panic!("Start item doesn't have a variable index"), + ParseItem::Normal { variable_index, .. } + | ParseItem::Inlined { variable_index, .. } => *variable_index, + } + } + + pub fn step_index(&self) -> usize { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. } => *step_index as usize, + } + } + + fn step_index_mut(&mut self) -> &mut u32 { + match self { + ParseItem::Start { step_index } + | ParseItem::Normal { step_index, .. } + | ParseItem::Inlined { step_index, .. 
} => step_index, + } + } + + pub fn with<'a>(&'a self, grammar: &'a SyntaxGrammar, inlines: &'a InlinedProductionMap) -> ParseItemDisplay<'a> { + ParseItemDisplay(self, grammar, inlines) + } + + pub fn successor(&self) -> ParseItem { + let mut result = self.clone(); + *result.step_index_mut() += 1; + result + } +} + +pub struct ParseItemDisplay<'a>(&'a ParseItem, &'a SyntaxGrammar, &'a InlinedProductionMap); + +impl<'a> fmt::Display for ParseItemDisplay<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + let step_index = self.0.step_index(); + let production = self.0.production(self.1, self.2); + for (i, step) in production.steps.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + + if i == step_index { + write!(f, "• ")?; + } + + let name = if step.symbol.is_terminal() { + "terminal" + } else if step.symbol.is_external() { + "external" + } else { + "non-terminal" + }; + + write!(f, "{}-{}", name, step.symbol.index)?; + } + Ok(()) + } +} + +impl Hash for ParseItemSet { + fn hash<H: Hasher>(&self, hasher: &mut H) { + hasher.write_usize(self.entries.len()); + /* HashMap iteration order is unspecified, so two equal sets could feed their entries in different orders. Hash every entry on its own and combine the results commutatively so that equal sets always hash equally. */ + let mut combined = 0u64; + for (item, lookaheads) in self.entries.iter() { + let mut entry_hasher = std::collections::hash_map::DefaultHasher::new(); + item.hash(&mut entry_hasher); + lookaheads.hash(&mut entry_hasher); + combined = combined.wrapping_add(entry_hasher.finish()); + } + hasher.write_u64(combined); + } } diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c3518428..f7bb1f9c 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,4 +1,5 @@ mod item; +mod inline_variables; use std::collections::{HashMap, VecDeque}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; diff --git a/src/grammars.rs b/src/grammars.rs index 74c213e1..8abdad24 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -108,6 +108,18 @@ impl ProductionStep { } } +impl Production { + pub fn first_symbol(&self) -> Option<Symbol> { + self.steps.first().map(|s| s.symbol.clone()) + } +} + +impl Default for Production { + fn default() -> Self { + Production { dynamic_precedence: 0, steps: Vec::new() } + } +} + impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name:
name.to_string(), kind: VariableType::Named, rule } diff --git a/src/main.rs b/src/main.rs index b83764fc..9dc9efb2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use clap::{App, Arg, SubCommand}; #[macro_use] extern crate serde_derive; #[macro_use] extern crate serde_json; +#[macro_use] extern crate lazy_static; mod build_tables; mod error; diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 0f1f5008..27dc8b05 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -2,7 +2,6 @@ use serde_json::{Map, Value}; use crate::error::Result; use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; -use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] diff --git a/src/rules.rs b/src/rules.rs index d7234f45..9374a283 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -10,7 +10,7 @@ pub(crate) enum SymbolType { #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub(crate) enum Associativity { Left, - Right + Right, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] @@ -137,24 +137,37 @@ impl Rule { } impl Symbol { + pub fn is_terminal(&self) -> bool { + self.kind == SymbolType::Terminal + } + pub fn is_non_terminal(&self) -> bool { - return self.kind == SymbolType::NonTerminal + self.kind == SymbolType::NonTerminal } pub fn is_external(&self) -> bool { - return self.kind == SymbolType::External + self.kind == SymbolType::External } pub fn non_terminal(index: usize) -> Self { - Symbol { kind: SymbolType::NonTerminal, index } + Symbol { + kind: SymbolType::NonTerminal, + index, + } } pub fn terminal(index: usize) -> Self { - Symbol { kind: SymbolType::Terminal, index } + Symbol { + kind: SymbolType::Terminal, + index, + } } pub fn external(index: usize) -> Self { - Symbol { kind: SymbolType::External, index } + Symbol { + kind: SymbolType::External, + index, + } } } @@ -169,11 +182,14 @@ fn add_metadata(input: Rule, f: T) -> Rule { Rule::Metadata { rule, mut params } => { f(&mut params); Rule::Metadata 
{ rule, params } - }, + } _ => { let mut params = MetadataParams::default(); f(&mut params); - Rule::Metadata { rule: Box::new(input), params } + Rule::Metadata { + rule: Box::new(input), + params, + } } } } @@ -184,7 +200,7 @@ fn choice_helper(result: &mut Vec, rule: Rule) { for element in elements { choice_helper(result, element); } - }, + } _ => { if !result.contains(&rule) { result.push(rule);