From 9824ebbbc31f7cda43f8a5aa5b3847462ab4c6aa Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Wed, 2 Jan 2019 12:34:40 -0800
Subject: [PATCH] Implement lex table construction

---
 src/build_tables/build_lex_table.rs    | 124 ++++++++++++++++
 src/build_tables/build_parse_table.rs  |  31 ++--
 src/build_tables/item_set_builder.rs   |  20 +--
 src/build_tables/lex_table_builder.rs  |  24 ---
 src/build_tables/mod.rs                | 131 ++++++++++++++++-
 src/build_tables/shrink_parse_table.rs |   2 -
 src/build_tables/token_conflicts.rs    |  80 +++++-----
 src/grammars.rs                        |  10 +-
 src/main.rs                            |   2 +-
 src/nfa.rs                             | 130 ++++++-----------
 src/prepare_grammar/expand_tokens.rs   |  24 ++-
 src/prepare_grammar/extract_tokens.rs  |  17 ++-
 src/render/mod.rs                      | 195 +++++++++++++++++++++++--
 src/rules.rs                           |   3 +
 src/tables.rs                          |  15 +-
 15 files changed, 581 insertions(+), 227 deletions(-)
 create mode 100644 src/build_tables/build_lex_table.rs
 delete mode 100644 src/build_tables/lex_table_builder.rs

diff --git a/src/build_tables/build_lex_table.rs b/src/build_tables/build_lex_table.rs
new file mode 100644
index 00000000..aa929d97
--- /dev/null
+++ b/src/build_tables/build_lex_table.rs
@@ -0,0 +1,124 @@
+use super::item::LookaheadSet;
+use super::token_conflicts::TokenConflictMap;
+use crate::grammars::{LexicalGrammar, SyntaxGrammar};
+use crate::nfa::NfaCursor;
+use crate::rules::Symbol;
+use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
+use std::collections::hash_map::Entry;
+use std::collections::{HashMap, VecDeque};
+
+pub(crate) fn build_lex_table(
+    parse_table: &mut ParseTable,
+    syntax_grammar: &SyntaxGrammar,
+    lexical_grammar: &LexicalGrammar,
+    keywords: &LookaheadSet,
+) -> (LexTable, LexTable) {
+    let keyword_lex_table;
+    if syntax_grammar.word_token.is_some() {
+        let mut builder = LexTableBuilder::new(lexical_grammar);
+        builder.add_state_for_tokens(keywords.iter());
+        keyword_lex_table = builder.table;
+    } else {
+        keyword_lex_table = LexTable::default();
+    }
+
+    let mut builder = LexTableBuilder::new(lexical_grammar);
+    for state in parse_table.states.iter_mut() {
+        let tokens = state.terminal_entries.keys().filter_map(|token| {
+            if token.is_terminal() {
+                if keywords.contains(token) {
+                    syntax_grammar.word_token
+                } else {
+                    Some(*token)
+                }
+            } else {
+                None
+            }
+        });
+        state.lex_state_id = builder.add_state_for_tokens(tokens);
+    }
+
+    (builder.table, keyword_lex_table)
+}
+
+struct LexTableBuilder<'a> {
+    lexical_grammar: &'a LexicalGrammar,
+    cursor: NfaCursor<'a>,
+    table: LexTable,
+    state_queue: VecDeque<(usize, Vec<u32>)>,
+    state_ids_by_nfa_state_set: HashMap<Vec<u32>, usize>,
+}
+
+impl<'a> LexTableBuilder<'a> {
+    fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
+        Self {
+            lexical_grammar,
+            cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
+            table: LexTable::default(),
+            state_queue: VecDeque::new(),
+            state_ids_by_nfa_state_set: HashMap::new(),
+        }
+    }
+
+    fn add_state_for_tokens(&mut self, tokens: impl Iterator<Item = Symbol>) -> usize {
+        let nfa_states = tokens
+            .map(|token| self.lexical_grammar.variables[token.index].start_state)
+            .collect();
+        let result = self.add_state(nfa_states);
+        while let Some((state_id, nfa_states)) = self.state_queue.pop_front() {
+            self.populate_state(state_id, nfa_states);
+        }
+        result
+    }
+
+    fn add_state(&mut self, nfa_states: Vec<u32>) -> usize {
+        match self.state_ids_by_nfa_state_set.entry(nfa_states) {
+            Entry::Occupied(o) => *o.get(),
+            Entry::Vacant(v) => {
+                let state_id = self.table.states.len();
+                self.table.states.push(LexState::default());
+                self.state_queue.push_back((state_id, v.key().clone()));
+                v.insert(state_id);
+                state_id
+            }
+        }
+    }
+
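+    // Populate a lex state: choose at most one token for the state to
+    // accept, preferring tokens with higher precedence (see
+    // `TokenConflictMap::prefer_token`), then add one advance action per
+    // distinct character set, skipping any transition whose precedence is
+    // lower than that of the accepted token.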
+    fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>) {
+        self.cursor.reset(nfa_states);
+
+        let mut completion = None;
+        for (id, prec) in self.cursor.completions() {
+            if let Some((prev_id, prev_precedence)) = completion {
+                if TokenConflictMap::prefer_token(
+                    self.lexical_grammar,
+                    (prev_precedence, prev_id),
+                    (prec, id),
+                ) {
+                    continue;
+                }
+            }
+            completion = Some((id, prec));
+        }
+
+        for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() {
+            if let Some((_, completed_precedence)) = completion {
+                if advance_precedence < completed_precedence {
+                    continue;
+                }
+            }
+            let next_state_id = self.add_state(next_states);
+            self.table.states[state_id].advance_actions.push((
+                chars,
+                AdvanceAction {
+                    state: next_state_id,
+                    in_main_token: !is_sep,
+                },
+            ));
+        }
+
+        if let Some((completion_index, _)) = completion {
+            self.table.states[state_id].accept_action = Some(completion_index);
+        }
+    }
+}
diff --git a/src/build_tables/build_parse_table.rs b/src/build_tables/build_parse_table.rs
index 2fe6fd8d..c17261dc 100644
--- a/src/build_tables/build_parse_table.rs
+++ b/src/build_tables/build_parse_table.rs
@@ -7,10 +7,10 @@ use crate::tables::{
     AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
 };
 use core::ops::Range;
-use std::hash::Hasher;
-use std::collections::hash_map::{Entry, DefaultHasher};
+use std::collections::hash_map::{DefaultHasher, Entry};
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::fmt::Write;
+use std::hash::Hasher;
 
 #[derive(Clone)]
 struct AuxiliarySymbolInfo {
@@ -31,7 +31,6 @@ struct ParseTableBuilder<'a> {
     item_set_builder: ParseItemSetBuilder<'a>,
     syntax_grammar: &'a SyntaxGrammar,
     lexical_grammar: &'a LexicalGrammar,
-    inlines: &'a InlinedProductionMap,
     state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
     item_sets_by_state_id: Vec<ParseItemSet<'a>>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
@@ -51,9 +50,12 @@ impl<'a> ParseTableBuilder<'a> {
             &Vec::new(),
             &Vec::new(),
             ParseItemSet::with(
-                [(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
-                    .iter()
-                    .cloned(),
+                [(
+                    ParseItem::start(),
+                    LookaheadSet::with([Symbol::end()].iter().cloned()),
+                )]
+                .iter()
+                .cloned(),
             ),
         );
 
@@ -69,8 +71,12 @@ impl<'a> ParseTableBuilder<'a> {
         item_set: ParseItemSet<'a>,
     ) -> ParseStateId {
         if preceding_symbols.len() > 1 {
-            let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
-            let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
+            let left_tokens = self
+                .item_set_builder
+                .last_set(&preceding_symbols[preceding_symbols.len() - 2]);
+            let right_tokens = self
+                .item_set_builder
+                .first_set(&preceding_symbols[preceding_symbols.len() - 1]);
             for left_token in left_tokens.iter() {
                 if left_token.is_terminal() {
                     self.following_tokens[left_token.index].insert_all(right_tokens);
@@ -117,11 +123,9 @@ impl<'a> ParseTableBuilder<'a> {
             );
         }
 
-        let item_set = self.item_set_builder.transitive_closure(
-            &self.item_sets_by_state_id[entry.state_id],
-            self.syntax_grammar,
-            self.inlines,
-        );
+        let item_set = self
+            .item_set_builder
+            .transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
 
         if debug {
             println!(
@@ -606,7 +610,6 @@ pub(crate) fn build_parse_table(
     ParseTableBuilder {
         syntax_grammar,
         lexical_grammar,
-        inlines,
         item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
         state_ids_by_item_set: HashMap::new(),
         item_sets_by_state_id: Vec::new(),
diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs
index 8649cb52..5e61bfcc 100644
--- a/src/build_tables/item_set_builder.rs
+++ b/src/build_tables/item_set_builder.rs
@@ -18,6 +18,7 @@ struct FollowSetInfo {
 pub(crate) struct ParseItemSetBuilder<'a> {
     first_sets: HashMap<Symbol, LookaheadSet>,
     last_sets: HashMap<Symbol, LookaheadSet>,
+    inlines: &'a InlinedProductionMap,
     transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
 }
 
@@ -36,6 +37,7 @@ impl<'a> ParseItemSetBuilder<'a> {
         let mut result = Self {
             first_sets: HashMap::new(),
             last_sets: HashMap::new(),
+            inlines,
             transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
         };
 
@@ -237,15 +239,12 @@ impl<'a> ParseItemSetBuilder<'a> {
         result
     }
 
-    pub(crate) fn transitive_closure(
-        &mut self,
-        item_set: &ParseItemSet<'a>,
-        grammar: &'a SyntaxGrammar,
-        inlines: &'a InlinedProductionMap,
-    ) -> ParseItemSet<'a> {
+    pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
         let mut result = ParseItemSet::default();
         for (item, lookaheads) in &item_set.entries {
-            if let Some(productions) = inlines.inlined_productions(item.production, item.step_index)
+            if let Some(productions) = self
+                .inlines
+                .inlined_productions(item.production, item.step_index)
             {
                 for production in productions {
                     self.add_item(
@@ -273,12 +272,7 @@ impl<'a> ParseItemSetBuilder<'a> {
         &self.first_sets[symbol]
     }
 
-    fn add_item(
-        &self,
-        set: &mut ParseItemSet<'a>,
-        item: ParseItem<'a>,
-        lookaheads: &LookaheadSet,
-    ) {
+    fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
         if let Some(step) = item.step() {
             if step.symbol.is_non_terminal() {
                 let next_step = item.successor().step();
diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs
deleted file mode 100644
index 86d1578b..00000000
--- a/src/build_tables/lex_table_builder.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use crate::rules::Symbol;
-use crate::tables::LexTable;
-use crate::grammars::{SyntaxGrammar, LexicalGrammar};
-
-pub(crate) struct LexTableBuilder<'a> {
-    syntax_grammar: &'a SyntaxGrammar,
-    lexical_grammar: &'a LexicalGrammar,
-    table: LexTable,
-}
-
-impl<'a> LexTableBuilder<'a> {
-    pub fn new(
-        syntax_grammar: &'a SyntaxGrammar,
-        lexical_grammar: &'a LexicalGrammar,
-    ) -> Self {
-        Self {
-            syntax_grammar, lexical_grammar, table: LexTable::default()
-        }
-    }
-
-    pub fn build(self) -> (LexTable, LexTable, Option<Symbol>) {
-        (LexTable::default(), LexTable::default(), None)
-    }
-}
diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs
index 665c56a0..8b3a2db4 100644
--- a/src/build_tables/mod.rs
+++ b/src/build_tables/mod.rs
@@ -1,11 +1,12 @@
+mod build_lex_table;
 mod build_parse_table;
 mod coincident_tokens;
 mod item;
 mod item_set_builder;
-mod lex_table_builder;
 mod shrink_parse_table;
 mod token_conflicts;
 
+use self::build_lex_table::build_lex_table;
 use self::build_parse_table::build_parse_table;
 use self::coincident_tokens::CoincidentTokenIndex;
 use self::item::LookaheadSet;
@@ -13,6 +14,7 @@ use self::shrink_parse_table::shrink_parse_table;
 use self::token_conflicts::TokenConflictMap;
 use crate::error::Result;
 use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
+use crate::nfa::{CharacterSet, NfaCursor};
 use crate::rules::{AliasMap, Symbol};
 use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
 
@@ -25,7 +27,22 @@ pub(crate) fn build_tables(
     let (mut parse_table, following_tokens) = build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
     let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
+
+    eprintln!("{:?}", token_conflict_map);
+
     let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
+    let keywords = if let Some(word_token) = syntax_grammar.word_token {
+        identify_keywords(
+            lexical_grammar,
+            &parse_table,
+            word_token,
+            &token_conflict_map,
+            &coincident_token_index,
+        )
+    } else {
+        LookaheadSet::new()
+    };
+
     populate_error_state(
         &mut parse_table,
         syntax_grammar,
@@ -39,7 +56,14 @@ pub(crate) fn build_tables(
         simple_aliases,
         &token_conflict_map,
     );
-    Ok((parse_table, LexTable::default(), LexTable::default(), None))
+    let (main_lex_table, keyword_lex_table) =
+        build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
+    Ok((
+        parse_table,
+        main_lex_table,
+        keyword_lex_table,
+        syntax_grammar.word_token,
+    ))
 }
 
 fn populate_error_state(
@@ -77,13 +101,10 @@ fn populate_error_state(
                 || !token_conflict_map.does_conflict(i, t.index)
         });
         if can_be_used_for_recovery {
-            eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
             state
                 .terminal_entries
                 .entry(symbol)
                 .or_insert_with(|| recover_entry.clone());
-        } else {
-            eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
         }
     }
 
@@ -98,3 +119,103 @@ fn populate_error_state(
 
     state.terminal_entries.insert(Symbol::end(), recover_entry);
 }
+
+fn identify_keywords(
+    lexical_grammar: &LexicalGrammar,
+    parse_table: &ParseTable,
+    word_token: Symbol,
+    token_conflict_map: &TokenConflictMap,
+    coincident_token_index: &CoincidentTokenIndex,
+) -> LookaheadSet {
+    let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
+
+    // First find all of the candidate keyword tokens: tokens that start with
+    // letters or underscores and can match the same string as the word token.
+    let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
+        |(i, variable)| {
+            cursor.reset(vec![variable.start_state]);
+            if all_chars_are_alphabetical(&cursor)
+                && token_conflict_map.does_match_same_string(i, word_token.index)
+            {
+                Some(Symbol::terminal(i))
+            } else {
+                None
+            }
+        },
+    ));
+
+    // Exclude keyword candidates that shadow another keyword candidate.
+    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
+        for other_token in keywords.iter() {
+            if other_token != *token
+                && token_conflict_map.does_match_same_string(token.index, other_token.index)
+            {
+                eprintln!(
+                    "Exclude {} from keywords because it matches the same string as {}",
+                    lexical_grammar.variables[token.index].name,
+                    lexical_grammar.variables[other_token.index].name
+                );
+                return false;
+            }
+        }
+        true
+    }));
+
+    // Exclude keyword candidates for which substituting the keyword capture
+    // token would introduce new lexical conflicts with other tokens.
+    let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
+        for other_index in 0..lexical_grammar.variables.len() {
+            if keywords.contains(&Symbol::terminal(other_index)) {
+                continue;
+            }
+
+            // If the word token was already valid in every state containing
+            // this keyword candidate, then substituting the word token won't
+            // introduce any new lexical conflicts.
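+            // (It is enough to check the states in which this candidate and
+            // the other token are both valid.)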
+            if coincident_token_index
+                .states_with(*token, Symbol::terminal(other_index))
+                .iter()
+                .all(|state_id| {
+                    parse_table.states[*state_id]
+                        .terminal_entries
+                        .contains_key(&word_token)
+                })
+            {
+                continue;
+            }
+
+            if !token_conflict_map.has_same_conflict_status(
+                token.index,
+                word_token.index,
+                other_index,
+            ) {
+                eprintln!(
+                    "Exclude {} from keywords because of conflict with {}",
+                    lexical_grammar.variables[token.index].name,
+                    lexical_grammar.variables[other_index].name
+                );
+                return false;
+            }
+        }
+
+        eprintln!(
+            "Include {} in keywords",
+            lexical_grammar.variables[token.index].name,
+        );
+        true
+    }));
+
+    keywords
+}
+
+fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
+    cursor.successors().all(|(chars, _, _, is_sep)| {
+        if is_sep {
+            true
+        } else if let CharacterSet::Include(chars) = chars {
+            chars.iter().all(|c| c.is_alphabetic() || *c == '_')
+        } else {
+            false
+        }
+    })
+}
diff --git a/src/build_tables/shrink_parse_table.rs b/src/build_tables/shrink_parse_table.rs
index 026c3058..b943158f 100644
--- a/src/build_tables/shrink_parse_table.rs
+++ b/src/build_tables/shrink_parse_table.rs
@@ -166,8 +166,6 @@ fn merge_parse_state(
         }
     }
 
-    eprintln!("maybe merge {} {}", left, right);
-
     let mut symbols_to_add = Vec::new();
     for (symbol, right_entry) in &right_state.terminal_entries {
         if !left_state.terminal_entries.contains_key(&symbol) {
diff --git a/src/build_tables/token_conflicts.rs b/src/build_tables/token_conflicts.rs
index 09d5e97c..9f1c4426 100644
--- a/src/build_tables/token_conflicts.rs
+++ b/src/build_tables/token_conflicts.rs
@@ -4,7 +4,7 @@ use crate::nfa::{CharacterSet, NfaCursor};
 use std::collections::HashSet;
 use std::fmt;
 
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 struct TokenConflictStatus {
     does_overlap: bool,
     does_match_valid_continuation: bool,
     does_match_separators: bool,
     matches_same_string: bool,
 }
 
-pub(crate) struct TokenConflictMap {
+pub(crate) struct TokenConflictMap<'a> {
     n: usize,
     status_matrix: Vec<TokenConflictStatus>,
     starting_chars_by_index: Vec<CharacterSet>,
     following_chars_by_index: Vec<CharacterSet>,
+    grammar: &'a LexicalGrammar,
 }
 
-impl TokenConflictMap {
-    pub fn new(grammar: &LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
+impl<'a> TokenConflictMap<'a> {
+    pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
         let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
         let starting_chars = get_starting_chars(&mut cursor, grammar);
         let following_chars = get_following_chars(&starting_chars, following_tokens);
@@ -40,9 +41,16 @@ impl TokenConflictMap {
             status_matrix,
             starting_chars_by_index: starting_chars,
             following_chars_by_index: following_chars,
+            grammar,
         }
     }
 
+    pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
+        let left = &self.status_matrix[matrix_index(self.n, a, other)];
+        let right = &self.status_matrix[matrix_index(self.n, b, other)];
+        left == right
+    }
+
     pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
         self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
     }
@@ -55,9 +63,28 @@ impl TokenConflictMap {
     pub fn does_overlap(&self, i: usize, j: usize) -> bool {
         self.status_matrix[matrix_index(self.n, i, j)].does_overlap
     }
+
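+    // Decide whether the token on the left should be preferred over the
+    // token on the right: higher precedence wins, then string literals win
+    // over regular expressions, and finally the token that was defined
+    // earlier in the grammar wins.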
+    pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
+        if left.0 > right.0 {
+            return true;
+        } else if left.0 < right.0 {
+            return false;
+        }
+
+        match (
+            grammar.variables[left.1].is_string,
+            grammar.variables[right.1].is_string,
+        ) {
+            (true, false) => return true,
+            (false, true) => return false,
+            _ => {}
+        }
+
+        left.1 < right.1
+    }
 }
 
-impl fmt::Debug for TokenConflictMap {
+impl<'a> fmt::Debug for TokenConflictMap<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "TokenConflictMap {{\n")?;
 
@@ -69,18 +96,22 @@ impl fmt::Debug for TokenConflictMap {
         write!(f, "  following_characters: {{\n")?;
         for i in 0..self.n {
-            write!(f, "    {}: {:?},\n", i, self.following_chars_by_index[i])?;
+            write!(
+                f,
+                "    {}: {:?},\n",
+                self.grammar.variables[i].name, self.following_chars_by_index[i]
+            )?;
         }
         write!(f, "  }},\n")?;
 
         write!(f, "  status_matrix: {{\n")?;
         for i in 0..self.n {
-            write!(f, "    {}: {{\n", i)?;
+            write!(f, "    {}: {{\n", self.grammar.variables[i].name)?;
             for j in 0..self.n {
                 write!(
                     f,
                     "      {}: {:?},\n",
-                    j,
+                    self.grammar.variables[j].name,
                     self.status_matrix[matrix_index(self.n, i, j)]
                 )?;
             }
@@ -101,7 +132,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
     for variable in &grammar.variables {
         cursor.reset(vec![variable.start_state]);
         let mut all_chars = CharacterSet::empty();
-        for (chars, _, _) in cursor.successors() {
+        for (chars, _, _, _) in cursor.successors() {
             all_chars = all_chars.add(chars);
         }
         result.push(all_chars);
@@ -121,24 +152,5 @@ fn get_following_chars(
 }
 
-fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
-    if left.0 > right.0 {
-        return true;
-    } else if left.0 < right.0 {
-        return false;
-    }
-
-    match (
-        grammar.variables[left.1].is_string,
-        grammar.variables[right.1].is_string,
-    ) {
-        (true, false) => return true,
-        (false, true) => return false,
-        _ => {}
-    }
-
-    left.0 < right.0
-}
-
 fn variable_ids_for_states<'a>(
     state_ids: &'a Vec<u32>,
     grammar: &'a LexicalGrammar,
diff --git a/src/grammars.rs b/src/grammars.rs
index 18da86d8..d23e8ca6 100644
--- a/src/grammars.rs
+++ b/src/grammars.rs
@@ -91,6 +91,7 @@ pub(crate) struct SyntaxGrammar {
     pub word_token: Option<Symbol>,
 }
 
+#[cfg(test)]
 impl ProductionStep {
     pub(crate) fn new(symbol: Symbol) -> Self {
         Self {
@@ -127,14 +128,6 @@ impl Production {
     pub fn first_symbol(&self) -> Option<Symbol> {
         self.steps.first().map(|s| s.symbol.clone())
     }
-
-    pub fn last_precedence(&self) -> i32 {
-        self.steps.last().map(|s| s.precedence).unwrap_or(0)
-    }
-
-    pub fn last_associativity(&self) -> Option<Associativity> {
-        self.steps.last().map(|s| s.associativity).unwrap_or(None)
-    }
 }
 
 impl Default for Production {
@@ -146,6 +139,7 @@ impl Default for Production {
     }
 }
 
+#[cfg(test)]
 impl Variable {
     pub fn named(name: &str, rule: Rule) -> Self {
         Self {
diff --git a/src/main.rs b/src/main.rs
index c7ca2ca5..cd672186 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -42,7 +42,7 @@ fn main() -> error::Result<()> {
         )
         .get_matches();
 
-    if let Some(matches) = matches.subcommand_matches("generate") {
+    if let Some(_) = matches.subcommand_matches("generate") {
         let mut grammar_path = env::current_dir().expect("Failed to read CWD");
         grammar_path.push("grammar.js");
         let grammar_json = load_js_grammar_file(grammar_path);
diff --git a/src/nfa.rs b/src/nfa.rs
index ee39d178..e14dac44 100644
--- a/src/nfa.rs
+++ b/src/nfa.rs
@@ -40,7 +40,6 @@ impl Default for Nfa {
 pub struct NfaCursor<'a> {
     pub(crate) state_ids: Vec<u32>,
     nfa: &'a Nfa,
-    in_sep: bool,
 }
 
 impl CharacterSet {
@@ -111,7 +110,7 @@ impl CharacterSet {
             CharacterSet::Exclude(other_chars) => {
                 chars.retain(|c| other_chars.contains(&c));
                 CharacterSet::Exclude(chars)
-            },
+            }
         },
     }
 }
@@ -311,7 +310,6 @@ impl<'a> NfaCursor<'a> {
         let mut result = Self {
             nfa,
             state_ids: Vec::new(),
-            in_sep: true,
         };
         result.add_states(&mut states);
         result
@@ -322,81 +320,59 @@ impl<'a> NfaCursor<'a> {
         self.add_states(&mut states);
     }
 
-    pub fn advance(&mut self, c: char) -> bool {
-        let mut result = false;
-        let mut new_state_ids = Vec::new();
-        let mut any_sep_transitions = false;
-        for current_state_id in &self.state_ids {
-            if let NfaState::Advance {
-                chars,
-                state_id,
-                is_sep,
-                ..
-            } = &self.nfa.states[*current_state_id as usize]
-            {
-                if chars.contains(c) {
-                    if *is_sep {
-                        any_sep_transitions = true;
-                    }
-                    new_state_ids.push(*state_id);
-                    result = true;
-                }
-            }
-        }
-        if !any_sep_transitions {
-            self.in_sep = false;
-        }
-        self.state_ids.clear();
-        self.add_states(&mut new_state_ids);
-        result
-    }
-
-    pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
+    pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32, bool)> {
         self.state_ids.iter().filter_map(move |id| {
             if let NfaState::Advance {
                 chars,
                 state_id,
                 precedence,
-                ..
+                is_sep,
             } = &self.nfa.states[*id as usize]
             {
-                Some((chars, *precedence, *state_id))
+                Some((chars, *precedence, *state_id, *is_sep))
             } else {
                 None
             }
         })
     }
 
-    pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
+    pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
         Self::group_successors(self.successors())
     }
 
     fn group_successors<'b>(
-        iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
-    ) -> Vec<(CharacterSet, i32, Vec<u32>)> {
-        let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
-        for (chars, prec, state) in iter {
+        iter: impl Iterator<Item = (&'b CharacterSet, i32, u32, bool)>,
+    ) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
+        let mut result: Vec<(CharacterSet, i32, Vec<u32>, bool)> = Vec::new();
+        for (chars, prec, state, is_sep) in iter {
             let mut chars = chars.clone();
             let mut i = 0;
             while i < result.len() {
-                let intersection = result[i].0.remove_intersection(&mut chars);
-                if !intersection.is_empty() {
-                    if result[i].0.is_empty() {
-                        result[i].0 = intersection;
-                        result[i].1 = max(result[i].1, prec);
-                        result[i].2.push(state);
-                    } else {
+                if result[i].0 == chars {
+                    result[i].1 = max(result[i].1, prec);
+                    result[i].2.push(state);
+                    result[i].3 |= is_sep;
+                } else {
+                    let intersection = result[i].0.remove_intersection(&mut chars);
+                    if !intersection.is_empty() {
                         let mut states = result[i].2.clone();
-                        let mut precedence = result[i].1;
                         states.push(state);
-                        result.insert(i, (intersection, max(precedence, prec), states));
+                        result.insert(
+                            i,
+                            (
+                                intersection,
+                                max(result[i].1, prec),
+                                states,
+                                result[i].3 || is_sep,
+                            ),
+                        );
                         i += 1;
                     }
                 }
                 i += 1;
             }
             if !chars.is_empty() {
-                result.push((chars, prec, vec![state]));
+                result.push((chars, prec, vec![state], is_sep));
             }
         }
         result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
@@ -417,10 +393,6 @@ impl<'a> NfaCursor<'a> {
         })
     }
 
-    pub fn in_separator(&self) -> bool {
-        self.in_sep
-    }
-
     pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
         let mut i = 0;
         while i < new_state_ids.len() {
@@ -460,26 +432,31 @@ mod tests {
         let table = [
             (
                 vec![
-                    (CharacterSet::empty().add_range('a', 'f'), 0, 1),
-                    (CharacterSet::empty().add_range('d', 'i'), 1, 2),
+                    (CharacterSet::empty().add_range('a', 'f'), 0, 1, false),
+                    (CharacterSet::empty().add_range('d', 'i'), 1, 2, false),
                 ],
                 vec![
-                    (CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
-                    (CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
-                    (CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
+                    (CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false),
+                    (
+                        CharacterSet::empty().add_range('d', 'f'),
+                        1,
+                        vec![1, 2],
+                        false,
+                    ),
+                    (CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false),
                 ],
             ),
             (
                 vec![
-                    (CharacterSet::empty().add_range('a', 'z'), 0, 1),
-                    (CharacterSet::empty().add_char('d'), 0, 2),
-                    (CharacterSet::empty().add_char('i'), 0, 3),
-                    (CharacterSet::empty().add_char('f'), 0, 4),
+                    (CharacterSet::empty().add_range('a', 'z'), 0, 1, false),
+                    (CharacterSet::empty().add_char('d'), 0, 2, false),
+                    (CharacterSet::empty().add_char('i'), 0, 3, false),
+                    (CharacterSet::empty().add_char('f'), 0,
4, false),
                 ],
                 vec![
-                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
-                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
-                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
+                    (CharacterSet::empty().add_char('d'), 0, vec![1, 2], false),
+                    (CharacterSet::empty().add_char('f'), 0, vec![1, 4], false),
+                    (CharacterSet::empty().add_char('i'), 0, vec![1, 3], false),
                     (
                         CharacterSet::empty()
                             .add_range('a', 'c')
                             .add_char('e')
                             .add_range('g', 'h')
                             .add_range('j', 'z'),
                         0,
                         vec![1],
+                        false,
                     ),
                 ],
             ),
         ];
 
         for row in table.iter() {
             assert_eq!(
-                NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
+                NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))),
                 row.1
             );
         }
-
-        // let successors = NfaCursor::group_successors(
-        //     [
-        //         (&CharacterSet::empty().add_range('a', 'f'), 1),
-        //         (&CharacterSet::empty().add_range('d', 'i'), 2),
-        //     ]
-        //     .iter()
-        //     .cloned(),
-        // );
-        //
-        // assert_eq!(
-        //     successors,
-        //     vec![
-        //         (CharacterSet::empty().add_range('a', 'c'), vec![1],),
-        //         (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],),
-        //         (CharacterSet::empty().add_range('g', 'i'), vec![2],),
-        //     ]
-        // );
     }
 
     #[test]
diff --git a/src/prepare_grammar/expand_tokens.rs b/src/prepare_grammar/expand_tokens.rs
index 4ef17b27..fdf085f6 100644
--- a/src/prepare_grammar/expand_tokens.rs
+++ b/src/prepare_grammar/expand_tokens.rs
@@ -6,6 +6,7 @@ use crate::rules::Rule;
 use regex_syntax::ast::{
     parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
 };
+use std::i32;
 
 struct NfaBuilder {
     nfa: Nfa,
@@ -17,7 +18,7 @@ fn is_string(rule: &Rule) -> bool {
     match rule {
         Rule::String(_) => true,
         Rule::Metadata { rule, .. } => is_string(rule),
-        _ => false
+        _ => false,
     }
 }
 
@@ -346,7 +347,9 @@ impl NfaBuilder {
 
     fn push_split(&mut self, state_id: u32) {
         let last_state_id = self.nfa.last_state_id();
-        self.nfa.states.push(NfaState::Split(state_id, last_state_id));
+        self.nfa
+            .states
+            .push(NfaState::Split(state_id, last_state_id));
     }
 
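+    // Assign the given precedence to the accept state that is reachable
+    // from these states by following only `Split` transitions.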
     fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
         let mut i = 0;
         while i < state_ids.len() {
             let state_id = state_ids[i];
             let (left, right) = match &mut self.nfa.states[state_id as usize] {
-                NfaState::Accept {precedence, ..} => {
+                NfaState::Accept { precedence, .. } => {
                     *precedence = prec;
                     return;
-                },
+                }
                 NfaState::Split(left, right) => (*left, *right),
-                _ => return
+                _ => return,
             };
             if !state_ids.contains(&left) {
                 state_ids.push(left);
             }
@@ -383,7 +386,7 @@ mod tests {
         let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
 
         let mut result = None;
-        let mut result_precedence = 0;
+        let mut result_precedence = i32::MIN;
         let mut start_char = 0;
         let mut end_char = 0;
         for c in s.chars() {
@@ -393,9 +396,14 @@ mod tests {
                     result_precedence = precedence;
                 }
             }
-            if cursor.advance(c) {
+            if let Some((_, _, next_states, in_sep)) = cursor
+                .grouped_successors()
+                .into_iter()
+                .find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence)
+            {
+                cursor.reset(next_states);
                 end_char += 1;
-                if cursor.in_separator() {
+                if in_sep {
                     start_char = end_char;
                 }
             } else {
diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs
index eaeede90..5f3f6e16 100644
--- a/src/prepare_grammar/extract_tokens.rs
+++ b/src/prepare_grammar/extract_tokens.rs
@@ -1,6 +1,6 @@
 use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
 use crate::error::{Error, Result};
-use crate::grammars::{ExternalToken, Variable};
+use crate::grammars::{ExternalToken, Variable, VariableType};
 use crate::rules::{MetadataParams, Rule, Symbol, SymbolType};
 use std::collections::HashMap;
 use std::mem;
@@ -240,16 +240,21 @@ impl TokenExtractor {
         let index = self.extracted_variables.len();
         let variable = if let Some(string_value) = string_value {
-            Variable::anonymous(string_value, rule.clone())
+            Variable {
+                name: string_value.clone(),
+                kind: VariableType::Anonymous,
+                rule: rule.clone(),
+            }
         } else {
             self.current_variable_token_count += 1;
-            Variable::auxiliary(
-                &format!(
+            Variable {
+                name: format!(
                     "{}_token{}",
                     &self.current_variable_name, self.current_variable_token_count
                 ),
-                rule.clone(),
-            )
+                kind: VariableType::Auxiliary,
+                rule: rule.clone(),
+            }
         };
 
         self.extracted_variables.push(variable);
diff --git a/src/render/mod.rs b/src/render/mod.rs
index fc4cdafb..cbb8ba0d 100644
--- a/src/render/mod.rs
+++ b/src/render/mod.rs
@@ -2,6 +2,7 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
 use crate::nfa::CharacterSet;
 use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
 use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry};
+use core::ops::Range;
 use std::collections::{HashMap, HashSet};
 use std::fmt::Write;
 use std::mem::swap;
@@ -12,11 +13,17 @@
 macro_rules! add {
     ($this: tt, $($arg: tt)*) => {{
         $this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
     }}
 }
 
-macro_rules! add_line {
-    ($this: tt, $($arg: tt)*) => {
+macro_rules! add_whitespace {
+    ($this: tt) => {{
         for _ in 0..$this.indent_level {
             write!(&mut $this.buffer, " ").unwrap();
         }
+    }};
+}
+
+macro_rules! add_line {
add_line { + ($this: tt, $($arg: tt)*) => { + add_whitespace!($this); $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); $this.buffer += "\n"; } @@ -162,7 +169,7 @@ impl Generator { } } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); + add_line!(self, "#define LANGUAGE_VERSION {}", 9); add_line!( self, "#define STATE_COUNT {}", @@ -352,7 +359,7 @@ impl Generator { add_line!( self, "ACCEPT_TOKEN({})", - self.symbol_ids[&accept_action.symbol] + self.symbol_ids[&Symbol::terminal(accept_action)] ); } @@ -360,9 +367,10 @@ impl Generator { for (characters, action) in state.advance_actions { let previous_length = self.buffer.len(); + add_whitespace!(self); add!(self, "if ("); if self.add_character_set_condition(&characters, &ruled_out_characters) { - add!(self, ")"); + add!(self, ")\n"); indent!(self); if action.in_main_token { add_line!(self, "ADVANCE({});", action.state); @@ -370,7 +378,7 @@ impl Generator { add_line!(self, "SKIP({});", action.state); } if let CharacterSet::Include(chars) = characters { - ruled_out_characters.extend(chars.iter()); + ruled_out_characters.extend(chars.iter().map(|c| *c as u32)); } dedent!(self); } else { @@ -384,9 +392,106 @@ impl Generator { fn add_character_set_condition( &mut self, characters: &CharacterSet, - ruled_out_characters: &HashSet, + ruled_out_characters: &HashSet, ) -> bool { - true + match characters { + CharacterSet::Include(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, false) + } + CharacterSet::Exclude(chars) => { + let ranges = Self::get_ranges(chars, ruled_out_characters); + self.add_character_range_conditions(ranges, true) + } + } + } + + fn add_character_range_conditions( + &mut self, + ranges: impl Iterator>, + is_negated: bool, + ) -> bool { + let line_break = "\n "; + let mut did_add = false; + for range in ranges { + if is_negated { + if did_add { + add!(self, " &&{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead != "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead != "); + self.add_character(range.start); + add!(self, " &&{}lookahead != ", line_break); + self.add_character(range.end); + } else { + add!(self, "(lookahead < "); + self.add_character(range.start); + add!(self, " || "); + self.add_character(range.end); + add!(self, " < lookahead)"); + } + } else { + if did_add { + add!(self, " ||{}", line_break); + } + if range.end == range.start { + add!(self, "lookahead == "); + self.add_character(range.start); + } else if range.end as u32 == range.start as u32 + 1 { + add!(self, "lookahead == "); + self.add_character(range.start); + add!(self, " ||{}lookahead == ", line_break); + self.add_character(range.end); + } else { + add!(self, "("); + self.add_character(range.start); + add!(self, " <= lookahead && lookahead <= "); + self.add_character(range.end); + add!(self, ")"); + } + } + did_add = true; + } + did_add + } + + fn get_ranges<'a>( + chars: &'a Vec, + ruled_out_characters: &'a HashSet, + ) -> impl Iterator> + 'a { + let mut prev_range: Option> = None; + chars + .iter() + .cloned() + .chain(Some('\0')) + .filter_map(move |c| { + if ruled_out_characters.contains(&(c as u32)) { + return None; + } + if let Some(range) = prev_range.clone() { + if c == '\0' { + prev_range = Some(c..c); + return Some(range); + } + + let mut prev_range_successor = range.end as u32 + 1; + while prev_range_successor < c as u32 { + if 
+                    while prev_range_successor < c as u32 {
+                        if !ruled_out_characters.contains(&prev_range_successor) {
+                            prev_range = Some(c..c);
+                            return Some(range);
+                        }
+                        prev_range_successor += 1;
+                    }
+                    prev_range = Some(range.start..c);
+                    None
+                } else {
+                    prev_range = Some(c..c);
+                    None
+                }
+            })
+    }
 
     fn add_lex_modes_list(&mut self) {
@@ -577,13 +682,6 @@ impl Generator {
                     alias_sequence_id,
                     ..
                 } => {
-                    if !self.symbol_ids.contains_key(&symbol) {
-                        eprintln!(
-                            "SYMBOL: {:?} {:?}",
-                            symbol,
-                            self.metadata_for_symbol(symbol)
-                        );
-                    }
                     add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count);
                     if dynamic_precedence != 0 {
                         add!(self, ", .dynamic_precedence = {}", dynamic_precedence);
@@ -785,7 +883,7 @@ impl Generator {
             {
                 result.push(c);
             } else {
-                result += match c {
+                let replacement = match c {
                     '~' => "TILDE",
                     '`' => "BQUOTE",
                     '!' => "BANG",
@@ -821,7 +919,11 @@ impl Generator {
                     '\r' => "CR",
                     '\t' => "TAB",
                     _ => continue,
+                };
+                if !result.is_empty() && !result.ends_with("_") {
+                    result.push('_');
                 }
+                result += replacement;
             }
         }
         result
     }
+
+    fn add_character(&mut self, c: char) {
+        if c.is_ascii() {
+            match c {
+                '\'' => add!(self, "'\\''"),
+                '\\' => add!(self, "'\\\\'"),
+                '\t' => add!(self, "'\\t'"),
+                '\n' => add!(self, "'\\n'"),
+                '\r' => add!(self, "'\\r'"),
+                _ => add!(self, "'{}'", c),
+            }
+        } else {
+            add!(self, "{}", c as u32)
+        }
+    }
 }
 
 pub(crate) fn render_c_code(
@@ -867,3 +984,49 @@ pub(crate) fn render_c_code(
     }
     .generate()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_get_char_ranges() {
+        struct Row {
+            chars: Vec<char>,
+            ruled_out_chars: Vec<char>,
+            expected_ranges: Vec<Range<char>>,
+        }
+
+        let table = [
+            Row {
+                chars: vec!['a'],
+                ruled_out_chars: vec![],
+                expected_ranges: vec!['a'..'a'],
+            },
+            Row {
+                chars: vec!['a', 'b', 'c', 'e', 'z'],
+                ruled_out_chars: vec![],
+                expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'],
+            },
+            Row {
+                chars: vec!['a', 'b', 'c', 'e', 'h', 'z'],
+                ruled_out_chars: vec!['d', 'f', 'g'],
+                expected_ranges: vec!['a'..'h', 'z'..'z'],
+            },
+        ];
+
+        for Row {
+            chars,
+            ruled_out_chars,
+            expected_ranges,
+        } in table.iter()
+        {
+            let ruled_out_chars = ruled_out_chars
+                .into_iter()
+                .map(|c: &char| *c as u32)
+                .collect();
+            let ranges = Generator::get_ranges(chars, &ruled_out_chars).collect::<Vec<_>>();
+            assert_eq!(ranges, *expected_ranges);
+        }
+    }
+}
diff --git a/src/rules.rs b/src/rules.rs
index 3bfd5181..77e50d3c 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -120,7 +120,10 @@ impl Rule {
     pub fn seq(rules: Vec<Rule>) -> Self {
         Rule::Seq(rules)
     }
+}
 
+#[cfg(test)]
+impl Rule {
     pub fn terminal(index: usize) -> Self {
         Rule::Symbol(Symbol::terminal(index))
     }
diff --git a/src/tables.rs b/src/tables.rs
index 344c4816..1c125621 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -1,7 +1,6 @@
 use crate::nfa::CharacterSet;
 use crate::rules::{Alias, Associativity, Symbol};
 use std::collections::HashMap;
-use std::ops::Range;
 
 pub(crate) type AliasSequenceId = usize;
 pub(crate) type ParseStateId = usize;
@@ -50,21 +49,13 @@ pub(crate) struct ParseTable {
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct AdvanceAction {
     pub state: LexStateId,
-    pub precedence: Range<i32>,
     pub in_main_token: bool,
 }
 
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub(crate) struct AcceptTokenAction {
-    pub symbol: Symbol,
-    pub precedence: i32,
-    pub implicit_precedence: i32,
-}
-
-#[derive(Clone, Debug, PartialEq, Eq)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub(crate) struct LexState {
-    pub advance_actions: HashMap<char, AdvanceAction>,
-    pub accept_action: Option<AcceptTokenAction>,
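+    // The accept action stores the index of the accepted token in the
+    // lexical grammar's list of token variables.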
+    pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
+    pub accept_action: Option<usize>,
 }
 
 #[derive(Debug, PartialEq, Eq)]