From 3fbaff5e69a1bfd200a7c9979e52412b55a26ba0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 2 Jan 2019 16:48:44 -0800 Subject: [PATCH] Fix various logic errors in parse table construction --- Cargo.lock | 18 ++++ Cargo.toml | 5 ++ src/build_tables/build_lex_table.rs | 116 +++++++++++++++++++++---- src/build_tables/build_parse_table.rs | 59 +++++++------ src/build_tables/coincident_tokens.rs | 38 ++++---- src/build_tables/item.rs | 4 +- src/build_tables/item_set_builder.rs | 2 +- src/build_tables/mod.rs | 44 +++++----- src/build_tables/shrink_parse_table.rs | 6 +- src/build_tables/token_conflicts.rs | 2 +- src/grammars.rs | 2 +- src/logger.rs | 29 +++++++ src/main.rs | 28 ++++-- src/nfa.rs | 26 ++++-- src/parse_grammar.rs | 4 +- src/prepare_grammar/expand_repeats.rs | 2 +- src/prepare_grammar/extract_tokens.rs | 2 +- src/prepare_grammar/process_inlines.rs | 2 +- src/render/mod.rs | 19 ++-- src/rules.rs | 2 +- src/tables.rs | 2 +- 21 files changed, 297 insertions(+), 115 deletions(-) create mode 100644 src/logger.rs diff --git a/Cargo.lock b/Cargo.lock index 538517f1..2312d362 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,11 @@ dependencies = [ "constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "byteorder" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cc" version = "1.0.25" @@ -212,6 +217,15 @@ dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "hashbrown" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ignore" version = "0.4.4" @@ -463,9 +477,11 @@ version = "0.1.0" dependencies = [ "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -737,6 +753,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace-sys 0.1.24 (registry+https://github.com/rust-lang/crates.io-index)" = "c66d56ac8dabd07f6aacdaf633f4b8262f5b3601a810a0dcddffd5c22c69daa0" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum blake2-rfc 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +"checksum byteorder 1.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "94f88df23a25417badc922ab0f5716cc1330e87f71ddd9203b3a3ccd9cedf75d" "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" @@ -753,6 +770,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" +"checksum hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "64b7d419d0622ae02fe5da6b9a5e1964b610a65bb37923b976aeebb6dbb8f86e" "checksum ignore 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "36ecfc5ad80f0b1226df948c562e2cddd446096be3f644c95106400eae8a5e01" "checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" diff --git a/Cargo.toml b/Cargo.toml index b29bc85e..29b10e17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ lazy_static = "1.2.0" smallbitvec = "2.3.0" clap = "2.32" dirs = "1.0.2" +hashbrown = "0.1" ignore = "0.4.4" libloading = "0.5" rusqlite = "0.14.0" @@ -20,3 +21,7 @@ regex-syntax = "0.6.4" [dependencies.serde_json] version = "1.0" features = ["preserve_order"] + +[dependencies.log] +version = "0.4.6" +features = ["std"] diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs index aa929d97..c002f427 100644 --- a/src/build_tables/build_lex_table.rs +++ b/src/build_tables/build_lex_table.rs @@ -2,10 +2,9 @@ use super::item::LookaheadSet; use super::token_conflicts::TokenConflictMap; use crate::grammars::{LexicalGrammar, SyntaxGrammar}; use crate::nfa::NfaCursor; -use crate::rules::Symbol; use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable}; use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; pub(crate) fn build_lex_table( parse_table: &mut ParseTable, @@ -16,15 +15,16 @@ pub(crate) fn build_lex_table( let keyword_lex_table; if syntax_grammar.word_token.is_some() { let mut builder = LexTableBuilder::new(lexical_grammar); - builder.add_state_for_tokens(keywords.iter()); + builder.add_state_for_tokens(keywords); keyword_lex_table = builder.table; } else { keyword_lex_table = LexTable::default(); } let mut builder = LexTableBuilder::new(lexical_grammar); - for state in parse_table.states.iter_mut() { - let tokens = state.terminal_entries.keys().filter_map(|token| { + for (i, state) in parse_table.states.iter_mut().enumerate() { + info!("populate lex state for parse state {}", i); + let tokens = LookaheadSet::with(state.terminal_entries.keys().filter_map(|token| { if token.is_terminal() { if keywords.contains(&token) { syntax_grammar.word_token @@ -34,11 +34,14 @@ pub(crate) fn build_lex_table( } else { None } - }); - state.lex_state_id = builder.add_state_for_tokens(tokens); + })); + state.lex_state_id = builder.add_state_for_tokens(&tokens); } - (builder.table, keyword_lex_table) + let mut table = builder.table; + shrink_lex_table(&mut table, parse_table); + + (table, keyword_lex_table) } struct LexTableBuilder<'a> { @@ -60,32 +63,49 @@ impl<'a> LexTableBuilder<'a> { } } - fn add_state_for_tokens(&mut self, tokens: impl Iterator) -> usize { + fn add_state_for_tokens(&mut self, tokens: &LookaheadSet) -> usize { let nfa_states = tokens + .iter() .map(|token| self.lexical_grammar.variables[token.index].start_state) .collect(); - let result = self.add_state(nfa_states); - while let Some((state_id, nfa_states)) = self.state_queue.pop_front() { + let (state_id, is_new) = self.add_state(nfa_states); + + if is_new { + info!( + "entry point state: {}, tokens: {:?}", + state_id, + tokens + .iter() + .map(|t| &self.lexical_grammar.variables[t.index].name) + .collect::>() + ); + } + + while let Some((state_id, nfa_states)) = self.state_queue.pop_back() { self.populate_state(state_id, nfa_states); } - result + state_id } - fn add_state(&mut self, nfa_states: Vec) -> usize { - match self.state_ids_by_nfa_state_set.entry(nfa_states) { - Entry::Occupied(o) => *o.get(), + fn add_state(&mut self, nfa_states: Vec) -> (usize, bool) { + self.cursor.reset(nfa_states); + match self + .state_ids_by_nfa_state_set + .entry(self.cursor.state_ids.clone()) + { + Entry::Occupied(o) => (*o.get(), false), Entry::Vacant(v) => { let state_id = self.table.states.len(); self.table.states.push(LexState::default()); self.state_queue.push_back((state_id, v.key().clone())); v.insert(state_id); - state_id + (state_id, true) } } } fn populate_state(&mut self, state_id: usize, nfa_states: Vec) { - self.cursor.reset(nfa_states); + self.cursor.force_reset(nfa_states); let mut completion = None; for (id, prec) in self.cursor.completions() { @@ -102,12 +122,16 @@ impl<'a> LexTableBuilder<'a> { } for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() { + info!( + "populate state: {}, characters: {:?}, precedence: {:?}", + state_id, chars, advance_precedence + ); if let Some((_, completed_precedence)) = completion { if advance_precedence < completed_precedence { continue; } } - let next_state_id = self.add_state(next_states); + let (next_state_id, _) = self.add_state(next_states); self.table.states[state_id].advance_actions.push(( chars, AdvanceAction { @@ -122,3 +146,59 @@ impl<'a> LexTableBuilder<'a> { } } } + +fn shrink_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) { + let mut state_replacements = BTreeMap::new(); + let mut done = false; + while !done { + done = true; + for (i, state_i) in table.states.iter().enumerate() { + if state_replacements.contains_key(&i) { + continue; + } + for (j, state_j) in table.states.iter().enumerate() { + if state_replacements.contains_key(&j) { + continue; + } + if j == i { + break; + } + if state_i == state_j { + info!("replace state {} with state {}", i, j); + state_replacements.insert(i, j); + done = false; + } + } + } + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + if let Some(new_state_id) = state_replacements.get(&advance_action.1.state) { + advance_action.1.state = *new_state_id; + } + } + } + } + + let final_state_replacements = (0..table.states.len()).into_iter().map(|state_id| { + let replacement = state_replacements.get(&state_id).cloned().unwrap_or(state_id); + let prior_removed = state_replacements.iter().take_while(|i| *i.0 < replacement).count(); + replacement - prior_removed + }).collect::>(); + + for state in parse_table.states.iter_mut() { + state.lex_state_id = final_state_replacements[state.lex_state_id]; + } + + for state in table.states.iter_mut() { + for advance_action in state.advance_actions.iter_mut() { + advance_action.1.state = final_state_replacements[advance_action.1.state]; + } + } + + let mut i = 0; + table.states.retain(|_| { + let result = !state_replacements.contains_key(&i); + i += 1; + result + }); +} diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs index c17261dc..ada34dff 100644 --- a/src/build_tables/build_parse_table.rs +++ b/src/build_tables/build_parse_table.rs @@ -7,8 +7,11 @@ use crate::tables::{ AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, }; use core::ops::Range; -use std::collections::hash_map::{DefaultHasher, Entry}; -use std::collections::{HashMap, HashSet, VecDeque}; +use hashbrown::hash_map::Entry; +use hashbrown::{HashMap, HashSet}; +use std::collections::hash_map::DefaultHasher; +use std::collections::VecDeque; + use std::fmt::Write; use std::hash::Hasher; @@ -43,9 +46,10 @@ impl<'a> ParseTableBuilder<'a> { // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); - // Ensure that the error state has index 0. + // Add the error state at index 0. self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default()); + // Add the starting state at index 1. self.add_parse_state( &Vec::new(), &Vec::new(), @@ -61,6 +65,8 @@ impl<'a> ParseTableBuilder<'a> { self.process_part_state_queue()?; self.populate_used_symbols(); + self.remove_precedences(); + Ok((self.parse_table, self.following_tokens)) } @@ -112,28 +118,9 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - let debug = false; - - if debug { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); - } - let item_set = self .item_set_builder .transitive_closure(&self.item_sets_by_state_id[entry.state_id]); - - if debug { - println!( - "TRANSITIVE CLOSURE:\n{}", - item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) - ); - } - self.add_actions( entry.preceding_symbols, entry.preceding_auxiliary_symbols, @@ -527,6 +514,7 @@ impl<'a> ParseTableBuilder<'a> { } fn populate_used_symbols(&mut self) { + self.parse_table.symbols.push(Symbol::end()); let mut terminal_usages = vec![false; self.lexical_grammar.variables.len()]; let mut non_terminal_usages = vec![false; self.syntax_grammar.variables.len()]; let mut external_usages = vec![false; self.syntax_grammar.external_tokens.len()]; @@ -542,20 +530,39 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } - self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); } } + for (i, value) in external_usages.into_iter().enumerate() { + if value { + self.parse_table.symbols.push(Symbol::external(i)); + } + } for (i, value) in non_terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::non_terminal(i)); } } - for (i, value) in external_usages.into_iter().enumerate() { - if value { - self.parse_table.symbols.push(Symbol::external(i)); + } + + fn remove_precedences(&mut self) { + for state in self.parse_table.states.iter_mut() { + for (_, entry) in state.terminal_entries.iter_mut() { + for action in entry.actions.iter_mut() { + match action { + ParseAction::Reduce { + precedence, + associativity, + .. + } => { + *precedence = 0; + *associativity = None; + } + _ => {} + } + } } } } diff --git a/src/build_tables/coincident_tokens.rs b/src/build_tables/coincident_tokens.rs index 10707489..5f2bb3ec 100644 --- a/src/build_tables/coincident_tokens.rs +++ b/src/build_tables/coincident_tokens.rs @@ -1,36 +1,44 @@ +use crate::grammars::LexicalGrammar; use crate::rules::Symbol; use crate::tables::{ParseStateId, ParseTable}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; pub(crate) struct CoincidentTokenIndex { - entries: HashMap<(Symbol, Symbol), HashSet>, - empty: HashSet, + entries: Vec>, + n: usize, } impl CoincidentTokenIndex { - pub fn new(table: &ParseTable) -> Self { - let mut entries = HashMap::new(); + pub fn new(table: &ParseTable, lexical_grammar: &LexicalGrammar) -> Self { + let n = lexical_grammar.variables.len(); + let mut result = Self { + n, + entries: vec![HashSet::new(); n * n], + }; for (i, state) in table.states.iter().enumerate() { for symbol in state.terminal_entries.keys() { for other_symbol in state.terminal_entries.keys() { - entries - .entry((*symbol, *other_symbol)) - .or_insert(HashSet::new()) - .insert(i); + let index = result.index(*symbol, *other_symbol); + result.entries[index].insert(i); } } } - Self { - entries, - empty: HashSet::new(), - } + result } pub fn states_with(&self, a: Symbol, b: Symbol) -> &HashSet { - self.entries.get(&(a, b)).unwrap_or(&self.empty) + &self.entries[self.index(a, b)] } pub fn contains(&self, a: Symbol, b: Symbol) -> bool { - self.entries.contains_key(&(a, b)) + !self.entries[self.index(a, b)].is_empty() + } + + fn index(&self, a: Symbol, b: Symbol) -> usize { + if a.index < b.index { + a.index * self.n + b.index + } else { + b.index * self.n + a.index + } } } diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 4cd2f643..511d7bef 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -112,7 +112,9 @@ impl LookaheadSet { return; } }; - vec.resize(other.index + 1, false); + if other.index >= vec.len() { + vec.resize(other.index + 1, false); + } vec.set(other.index, true); } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 5e61bfcc..5714e7e2 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -1,7 +1,7 @@ use super::item::{LookaheadSet, ParseItem, ParseItemSet}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar}; use crate::rules::Symbol; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; #[derive(Clone, Debug, PartialEq, Eq)] struct TransitiveClosureAddition<'a> { diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 8b3a2db4..207431dd 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -27,22 +27,14 @@ pub(crate) fn build_tables( let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?; let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens); - - eprintln!("{:?}", token_conflict_map); - - let coincident_token_index = CoincidentTokenIndex::new(&parse_table); - let keywords = if let Some(word_token) = syntax_grammar.word_token { - identify_keywords( - lexical_grammar, - &parse_table, - word_token, - &token_conflict_map, - &coincident_token_index, - ) - } else { - LookaheadSet::new() - }; - + let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar); + let keywords = identify_keywords( + lexical_grammar, + &parse_table, + syntax_grammar.word_token, + &token_conflict_map, + &coincident_token_index, + ); populate_error_state( &mut parse_table, syntax_grammar, @@ -123,10 +115,15 @@ fn populate_error_state( fn identify_keywords( lexical_grammar: &LexicalGrammar, parse_table: &ParseTable, - word_token: Symbol, + word_token: Option, token_conflict_map: &TokenConflictMap, coincident_token_index: &CoincidentTokenIndex, ) -> LookaheadSet { + if word_token.is_none() { + return LookaheadSet::new(); + } + + let word_token = word_token.unwrap(); let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new()); // First find all of the candidate keyword tokens: tokens that start with @@ -137,6 +134,7 @@ fn identify_keywords( if all_chars_are_alphabetical(&cursor) && token_conflict_map.does_match_same_string(i, word_token.index) { + info!("Keywords - add candidate {}", lexical_grammar.variables[i].name); Some(Symbol::terminal(i)) } else { None @@ -150,8 +148,8 @@ fn identify_keywords( if other_token != *token && token_conflict_map.does_match_same_string(token.index, other_token.index) { - eprintln!( - "Exclude {} from keywords because it matches the same string as {}", + info!( + "Keywords - exclude {} because it matches the same string as {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_token.index].name ); @@ -189,8 +187,8 @@ fn identify_keywords( word_token.index, other_index, ) { - eprintln!( - "Exclude {} from keywords because of conflict with {}", + info!( + "Keywords - exclude {} because of conflict with {}", lexical_grammar.variables[token.index].name, lexical_grammar.variables[other_index].name ); @@ -198,8 +196,8 @@ fn identify_keywords( } } - eprintln!( - "Include {} in keywords", + info!( + "Keywords - include {}", lexical_grammar.variables[token.index].name, ); true diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs index b943158f..33b72c32 100644 --- a/src/build_tables/shrink_parse_table.rs +++ b/src/build_tables/shrink_parse_table.rs @@ -2,7 +2,7 @@ use super::token_conflicts::TokenConflictMap; use crate::grammars::{SyntaxGrammar, VariableType}; use crate::rules::{AliasMap, Symbol}; use crate::tables::{ParseAction, ParseState, ParseTable, ParseTableEntry}; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; pub(crate) fn shrink_parse_table( parse_table: &mut ParseTable, @@ -240,6 +240,10 @@ fn can_add_entry_to_state( fn remove_unused_states(parse_table: &mut ParseTable) { let mut state_usage_map = vec![false; parse_table.states.len()]; + + state_usage_map[0] = true; + state_usage_map[1] = true; + for state in &parse_table.states { for referenced_state in state.referenced_states() { state_usage_map[referenced_state] = true; diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs index 9f1c4426..18a80484 100644 --- a/src/build_tables/token_conflicts.rs +++ b/src/build_tables/token_conflicts.rs @@ -1,7 +1,7 @@ use crate::build_tables::item::LookaheadSet; use crate::grammars::LexicalGrammar; use crate::nfa::{CharacterSet, NfaCursor}; -use std::collections::HashSet; +use hashbrown::HashSet; use std::fmt; #[derive(Clone, Debug, Default, PartialEq, Eq)] diff --git a/src/grammars.rs b/src/grammars.rs index d23e8ca6..7f587a8c 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,6 +1,6 @@ use crate::nfa::Nfa; use crate::rules::{Alias, Associativity, Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) enum VariableType { diff --git a/src/logger.rs b/src/logger.rs new file mode 100644 index 00000000..18df763d --- /dev/null +++ b/src/logger.rs @@ -0,0 +1,29 @@ +use log::{LevelFilter, Log, Metadata, Record}; + +struct Logger { + pub filter: Option, +} + +impl Log for Logger { + fn enabled(&self, _: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + eprintln!( + "[{}] {}", + record + .module_path() + .unwrap_or_default() + .trim_start_matches("rust_tree_sitter_cli::"), + record.args() + ); + } + + fn flush(&self) {} +} + +pub(crate) fn init() { + log::set_boxed_logger(Box::new(Logger { filter: None })).unwrap(); + log::set_max_level(LevelFilter::Info); +} diff --git a/src/main.rs b/src/main.rs index cd672186..a08922b7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,20 +1,23 @@ #[macro_use] -extern crate serde_derive; -#[macro_use] -extern crate serde_json; -#[macro_use] extern crate lazy_static; +#[macro_use] +extern crate log; +#[macro_use] +extern crate serde_derive; +extern crate hashbrown; +extern crate serde_json; -use std::path::PathBuf; use clap::{App, Arg, SubCommand}; use std::env; use std::io::Write; +use std::path::PathBuf; use std::process::{Command, Stdio}; mod build_tables; mod error; mod generate; mod grammars; +mod logger; mod nfa; mod parse_grammar; mod prepare_grammar; @@ -27,7 +30,11 @@ fn main() -> error::Result<()> { .version("0.1") .author("Max Brunsfeld ") .about("Generates and tests parsers") - .subcommand(SubCommand::with_name("generate").about("Generate a parser")) + .subcommand( + SubCommand::with_name("generate") + .about("Generate a parser") + .arg(Arg::with_name("log").long("log")), + ) .subcommand( SubCommand::with_name("parse") .about("Parse a file") @@ -42,7 +49,11 @@ fn main() -> error::Result<()> { ) .get_matches(); - if let Some(_) = matches.subcommand_matches("generate") { + if let Some(matches) = matches.subcommand_matches("generate") { + if matches.is_present("log") { + logger::init(); + } + let mut grammar_path = env::current_dir().expect("Failed to read CWD"); grammar_path.push("grammar.js"); let grammar_json = load_js_grammar_file(grammar_path); @@ -70,7 +81,8 @@ fn load_js_grammar_file(grammar_path: PathBuf) -> String { "{}\nconsole.log(JSON.stringify(require(\"{}\"), null, 2));\n", js_prelude, grammar_path.to_str().unwrap() - ).expect("Failed to write to node's stdin"); + ) + .expect("Failed to write to node's stdin"); drop(node_stdin); let output = node_process .wait_with_output() diff --git a/src/nfa.rs b/src/nfa.rs index e14dac44..1c7ff53b 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -320,6 +320,10 @@ impl<'a> NfaCursor<'a> { self.add_states(&mut states); } + pub fn force_reset(&mut self, states: Vec) { + self.state_ids = states + } + pub fn successors(&self) -> impl Iterator { self.state_ids.iter().filter_map(move |id| { if let NfaState::Advance { @@ -352,16 +356,26 @@ impl<'a> NfaCursor<'a> { result[i].1 = max(result[i].1, prec); result[i].2.push(state); result[i].3 |= is_sep; - } else { - let intersection = result[i].0.remove_intersection(&mut chars); - if !intersection.is_empty() { - let mut states = result[i].2.clone(); - states.push(state); + chars = CharacterSet::empty(); + break; + } + + let intersection = result[i].0.remove_intersection(&mut chars); + if !intersection.is_empty() { + let mut states = result[i].2.clone(); + let max_prec = max(result[i].1, prec); + states.push(state); + if result[i].0.is_empty() { + result[i].0 = intersection; + result[i].1 = max_prec; + result[i].2 = states; + result[i].3 |= is_sep; + } else { result.insert( i, ( intersection, - max(result[i].1, prec), + max_prec, states, result[i].3 || is_sep, ), diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 07396329..6808f402 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -133,7 +133,7 @@ mod tests { #[test] fn test_parse_grammar() { - let grammar = parse_grammar(&json!({ + let grammar = parse_grammar(r#"{ "name": "my_lang", "rules": { "file": { @@ -148,7 +148,7 @@ mod tests { "value": "foo" } } - }).to_string()).unwrap(); + }"#).unwrap(); assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index f3811c5f..4589bd11 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,7 +1,7 @@ use super::ExtractedSyntaxGrammar; use crate::grammars::{Variable, VariableType}; use crate::rules::{Rule, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; use std::mem; struct Expander { diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 5f3f6e16..115933ee 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -2,7 +2,7 @@ use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar}; use crate::error::{Error, Result}; use crate::grammars::{ExternalToken, Variable, VariableType}; use crate::rules::{MetadataParams, Rule, Symbol, SymbolType}; -use std::collections::HashMap; +use hashbrown::HashMap; use std::mem; pub(super) fn extract_tokens( diff --git a/src/prepare_grammar/process_inlines.rs b/src/prepare_grammar/process_inlines.rs index 0d7f6827..24bbc14d 100644 --- a/src/prepare_grammar/process_inlines.rs +++ b/src/prepare_grammar/process_inlines.rs @@ -1,5 +1,5 @@ use crate::grammars::{InlinedProductionMap, Production, ProductionStep, SyntaxGrammar}; -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] struct ProductionStepId { diff --git a/src/render/mod.rs b/src/render/mod.rs index cbb8ba0d..250218c1 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,9 +1,9 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; +use crate::tables::{AdvanceAction, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use core::ops::Range; -use std::collections::{HashMap, HashSet}; +use hashbrown::{HashMap, HashSet}; use std::fmt::Write; use std::mem::swap; @@ -372,17 +372,14 @@ impl Generator { if self.add_character_set_condition(&characters, &ruled_out_characters) { add!(self, ")\n"); indent!(self); - if action.in_main_token { - add_line!(self, "ADVANCE({});", action.state); - } else { - add_line!(self, "SKIP({});", action.state); - } + self.add_advance_action(&action); if let CharacterSet::Include(chars) = characters { ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { self.buffer.truncate(previous_length); + self.add_advance_action(&action); } } @@ -494,6 +491,14 @@ impl Generator { }) } + fn add_advance_action(&mut self, action: &AdvanceAction) { + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + } + fn add_lex_modes_list(&mut self) { self.get_external_scanner_state_id(HashSet::new()); diff --git a/src/rules.rs b/src/rules.rs index 77e50d3c..ad16c632 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use hashbrown::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub(crate) enum SymbolType { diff --git a/src/tables.rs b/src/tables.rs index 1c125621..21222135 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,6 @@ use crate::nfa::CharacterSet; use crate::rules::{Alias, Associativity, Symbol}; -use std::collections::HashMap; +use hashbrown::HashMap; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize;