Implement lex table construction

Max Brunsfeld 2019-01-02 12:34:40 -08:00
parent a46b8fcb46
commit 9824ebbbc3
15 changed files with 581 additions and 227 deletions


@ -0,0 +1,124 @@
use super::item::LookaheadSet;
use super::token_conflicts::TokenConflictMap;
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::nfa::NfaCursor;
use crate::rules::Symbol;
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
use std::collections::hash_map::Entry;
use std::collections::{HashMap, VecDeque};
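// Build the main lex table and, if the grammar has a word token, a separate
// keyword lex table covering only the keyword tokens. Each parse state is
// then assigned a lex state for its set of valid tokens, with keyword tokens
// replaced by the word token so that keywords are re-lexed through the
// keyword table.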
pub(crate) fn build_lex_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
keywords: &LookaheadSet,
) -> (LexTable, LexTable) {
let keyword_lex_table;
if syntax_grammar.word_token.is_some() {
let mut builder = LexTableBuilder::new(lexical_grammar);
builder.add_state_for_tokens(keywords.iter());
keyword_lex_table = builder.table;
} else {
keyword_lex_table = LexTable::default();
}
let mut builder = LexTableBuilder::new(lexical_grammar);
for state in parse_table.states.iter_mut() {
let tokens = state.terminal_entries.keys().filter_map(|token| {
if token.is_terminal() {
if keywords.contains(&token) {
syntax_grammar.word_token
} else {
Some(*token)
}
} else {
None
}
});
state.lex_state_id = builder.add_state_for_tokens(tokens);
}
(builder.table, keyword_lex_table)
}
struct LexTableBuilder<'a> {
lexical_grammar: &'a LexicalGrammar,
cursor: NfaCursor<'a>,
table: LexTable,
state_queue: VecDeque<(usize, Vec<u32>)>,
state_ids_by_nfa_state_set: HashMap<Vec<u32>, usize>,
}
impl<'a> LexTableBuilder<'a> {
fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
Self {
lexical_grammar,
cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
table: LexTable::default(),
state_queue: VecDeque::new(),
state_ids_by_nfa_state_set: HashMap::new(),
}
}
fn add_state_for_tokens(&mut self, tokens: impl Iterator<Item = Symbol>) -> usize {
let nfa_states = tokens
.map(|token| self.lexical_grammar.variables[token.index].start_state)
.collect();
let result = self.add_state(nfa_states);
while let Some((state_id, nfa_states)) = self.state_queue.pop_front() {
self.populate_state(state_id, nfa_states);
}
result
}
fn add_state(&mut self, nfa_states: Vec<u32>) -> usize {
match self.state_ids_by_nfa_state_set.entry(nfa_states) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
let state_id = self.table.states.len();
self.table.states.push(LexState::default());
self.state_queue.push_back((state_id, v.key().clone()));
v.insert(state_id);
state_id
}
}
}
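// Fill in a lex state for a set of NFA states: first pick the single
// completed token that wins under TokenConflictMap::prefer_token, then add
// one advance action per disjoint character group, skipping transitions
// whose precedence has already lost to the completed token's.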
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>) {
self.cursor.reset(nfa_states);
let mut completion = None;
for (id, prec) in self.cursor.completions() {
if let Some((prev_id, prev_precedence)) = completion {
if TokenConflictMap::prefer_token(
self.lexical_grammar,
(prev_precedence, prev_id),
(prec, id),
) {
continue;
}
}
completion = Some((id, prec));
}
for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() {
if let Some((_, completed_precedence)) = completion {
if advance_precedence < completed_precedence {
continue;
}
}
let next_state_id = self.add_state(next_states);
self.table.states[state_id].advance_actions.push((
chars,
AdvanceAction {
state: next_state_id,
in_main_token: !is_sep,
},
));
}
if let Some((completion_index, _)) = completion {
self.table.states[state_id].accept_action = Some(completion_index);
}
}
}
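The builder above follows the classic worklist form of NFA-to-DFA subset construction: each lex state is keyed by its sorted set of NFA state ids, deduplicated through a map, and freshly created states are queued until populated. A minimal standalone sketch of that pattern (`build_dfa_states` and `successors` are hypothetical stand-ins, not part of this codebase; `successors` plays the role of `NfaCursor::grouped_successors`):

use std::collections::{HashMap, VecDeque};

// Memoized worklist pattern: assign a stable id to every distinct set of
// NFA states, queueing each new set so its transitions are explored once.
fn build_dfa_states(
    start: Vec<u32>,
    successors: impl Fn(&[u32]) -> Vec<Vec<u32>>,
) -> HashMap<Vec<u32>, usize> {
    let mut ids_by_set = HashMap::new();
    let mut queue = VecDeque::new();
    ids_by_set.insert(start.clone(), 0);
    queue.push_back(start);
    while let Some(set) = queue.pop_front() {
        for next in successors(&set) {
            let next_id = ids_by_set.len();
            ids_by_set.entry(next.clone()).or_insert_with(|| {
                // First time this state set is seen: queue it for population.
                queue.push_back(next);
                next_id
            });
        }
    }
    ids_by_set
}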


@ -7,10 +7,10 @@ use crate::tables::{
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use core::ops::Range;
use std::hash::Hasher;
use std::collections::hash_map::{Entry, DefaultHasher};
use std::collections::hash_map::{DefaultHasher, Entry};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::hash::Hasher;
#[derive(Clone)]
struct AuxiliarySymbolInfo {
@ -31,7 +31,6 @@ struct ParseTableBuilder<'a> {
item_set_builder: ParseItemSetBuilder<'a>,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
@ -51,9 +50,12 @@ impl<'a> ParseTableBuilder<'a> {
&Vec::new(),
&Vec::new(),
ParseItemSet::with(
[(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
.iter()
.cloned(),
[(
ParseItem::start(),
LookaheadSet::with([Symbol::end()].iter().cloned()),
)]
.iter()
.cloned(),
),
);
@ -69,8 +71,12 @@ impl<'a> ParseTableBuilder<'a> {
item_set: ParseItemSet<'a>,
) -> ParseStateId {
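// Record token adjacency: every terminal that can end the second-to-last
// preceding symbol may be followed by any token that can start the last
// one. TokenConflictMap later uses this to decide which lexical conflicts
// can actually arise.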
if preceding_symbols.len() > 1 {
let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
let left_tokens = self
.item_set_builder
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
let right_tokens = self
.item_set_builder
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
self.following_tokens[left_token.index].insert_all(right_tokens);
@ -117,11 +123,9 @@ impl<'a> ParseTableBuilder<'a> {
);
}
let item_set = self.item_set_builder.transitive_closure(
&self.item_sets_by_state_id[entry.state_id],
self.syntax_grammar,
self.inlines,
);
let item_set = self
.item_set_builder
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
if debug {
println!(
@ -606,7 +610,6 @@ pub(crate) fn build_parse_table(
ParseTableBuilder {
syntax_grammar,
lexical_grammar,
inlines,
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
state_ids_by_item_set: HashMap::new(),
item_sets_by_state_id: Vec::new(),


@ -18,6 +18,7 @@ struct FollowSetInfo {
pub(crate) struct ParseItemSetBuilder<'a> {
first_sets: HashMap<Symbol, LookaheadSet>,
last_sets: HashMap<Symbol, LookaheadSet>,
inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
}
@ -36,6 +37,7 @@ impl<'a> ParseItemSetBuilder<'a> {
let mut result = Self {
first_sets: HashMap::new(),
last_sets: HashMap::new(),
inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
};
@ -237,15 +239,12 @@ impl<'a> ParseItemSetBuilder<'a> {
result
}
pub(crate) fn transitive_closure(
&mut self,
item_set: &ParseItemSet<'a>,
grammar: &'a SyntaxGrammar,
inlines: &'a InlinedProductionMap,
) -> ParseItemSet<'a> {
pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default();
for (item, lookaheads) in &item_set.entries {
if let Some(productions) = inlines.inlined_productions(item.production, item.step_index)
if let Some(productions) = self
.inlines
.inlined_productions(item.production, item.step_index)
{
for production in productions {
self.add_item(
@ -273,12 +272,7 @@ impl<'a> ParseItemSetBuilder<'a> {
&self.first_sets[symbol]
}
fn add_item(
&self,
set: &mut ParseItemSet<'a>,
item: ParseItem<'a>,
lookaheads: &LookaheadSet,
) {
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
if let Some(step) = item.step() {
if step.symbol.is_non_terminal() {
let next_step = item.successor().step();


@ -1,24 +0,0 @@
use crate::rules::Symbol;
use crate::tables::LexTable;
use crate::grammars::{SyntaxGrammar, LexicalGrammar};
pub(crate) struct LexTableBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
table: LexTable,
}
impl<'a> LexTableBuilder<'a> {
pub fn new(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
) -> Self {
Self {
syntax_grammar, lexical_grammar, table: LexTable::default()
}
}
pub fn build(self) -> (LexTable, LexTable, Option<Symbol>) {
(LexTable::default(), LexTable::default(), None)
}
}


@ -1,11 +1,12 @@
mod build_lex_table;
mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod lex_table_builder;
mod shrink_parse_table;
mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::build_parse_table;
use self::coincident_tokens::CoincidentTokenIndex;
use self::item::LookaheadSet;
@ -13,6 +14,7 @@ use self::shrink_parse_table::shrink_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::error::Result;
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::nfa::{CharacterSet, NfaCursor};
use crate::rules::{AliasMap, Symbol};
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
@ -25,7 +27,22 @@ pub(crate) fn build_tables(
let (mut parse_table, following_tokens) =
build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
eprintln!("{:?}", token_conflict_map);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
let keywords = if let Some(word_token) = syntax_grammar.word_token {
identify_keywords(
lexical_grammar,
&parse_table,
word_token,
&token_conflict_map,
&coincident_token_index,
)
} else {
LookaheadSet::new()
};
populate_error_state(
&mut parse_table,
syntax_grammar,
@ -39,7 +56,14 @@ pub(crate) fn build_tables(
simple_aliases,
&token_conflict_map,
);
Ok((parse_table, LexTable::default(), LexTable::default(), None))
let (main_lex_table, keyword_lex_table) =
build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
Ok((
parse_table,
main_lex_table,
keyword_lex_table,
syntax_grammar.word_token,
))
}
fn populate_error_state(
@ -77,13 +101,10 @@ fn populate_error_state(
|| !token_conflict_map.does_conflict(i, t.index)
});
if can_be_used_for_recovery {
eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
} else {
eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
}
}
@ -98,3 +119,103 @@ fn populate_error_state(
state.terminal_entries.insert(Symbol::end(), recover_entry);
}
fn identify_keywords(
lexical_grammar: &LexicalGrammar,
parse_table: &ParseTable,
word_token: Symbol,
token_conflict_map: &TokenConflictMap,
coincident_token_index: &CoincidentTokenIndex,
) -> LookaheadSet {
let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
// First find all of the candidate keyword tokens: tokens that start with
// letters or underscore and can match the same string as a word token.
let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
|(i, variable)| {
cursor.reset(vec![variable.start_state]);
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
{
Some(Symbol::terminal(i))
} else {
None
}
},
));
// Exclude keyword candidates that shadow another keyword candidate.
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
for other_token in keywords.iter() {
if other_token != *token
&& token_conflict_map.does_match_same_string(token.index, other_token.index)
{
eprintln!(
"Exclude {} from keywords because it matches the same string as {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_token.index].name
);
return false;
}
}
true
}));
// Exclude keyword candidates for which substituting the keyword capture
// token would introduce new lexical conflicts with other tokens.
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
for other_index in 0..lexical_grammar.variables.len() {
if keywords.contains(&Symbol::terminal(other_index)) {
continue;
}
// If the word token was already valid in every state containing
// this keyword candidate, then substituting the word token won't
// introduce any new lexical conflicts.
if coincident_token_index
.states_with(*token, Symbol::terminal(other_index))
.iter()
.all(|state_id| {
parse_table.states[*state_id]
.terminal_entries
.contains_key(&word_token)
})
{
continue;
}
if !token_conflict_map.has_same_conflict_status(
token.index,
word_token.index,
other_index,
) {
eprintln!(
"Exclude {} from keywords because of conflict with {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_index].name
);
return false;
}
}
eprintln!(
"Include {} in keywords",
lexical_grammar.variables[token.index].name,
);
true
}));
keywords
}
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
cursor.successors().all(|(chars, _, _, is_sep)| {
if is_sep {
true
} else if let CharacterSet::Include(chars) = chars {
chars.iter().all(|c| c.is_alphabetic() || *c == '_')
} else {
false
}
})
}
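Taken together, identify_keywords applies three filters: keep alphabetical tokens that can match the same string as the word token; drop candidates that match the same string as another candidate; and drop candidates whose substitution by the word token would change their conflict status against any remaining token. A toy restatement of that chain over stand-in data (the type and names below are illustrative, not from this commit):

// e.g. in a C-like grammar, `while` passes all four checks below, while
// `==` fails the alphabetical check.
struct Candidate {
    all_chars_alphabetical: bool,
    matches_word_token_string: bool,
    shadowed_by_other_candidate: bool,
    substitution_changes_conflicts: bool,
}

fn is_keyword(c: &Candidate) -> bool {
    c.all_chars_alphabetical
        && c.matches_word_token_string
        && !c.shadowed_by_other_candidate
        && !c.substitution_changes_conflicts
}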


@ -166,8 +166,6 @@ fn merge_parse_state(
}
}
eprintln!("maybe merge {} {}", left, right);
let mut symbols_to_add = Vec::new();
for (symbol, right_entry) in &right_state.terminal_entries {
if !left_state.terminal_entries.contains_key(&symbol) {


@ -4,7 +4,7 @@ use crate::nfa::{CharacterSet, NfaCursor};
use std::collections::HashSet;
use std::fmt;
#[derive(Clone, Debug, Default)]
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct TokenConflictStatus {
does_overlap: bool,
does_match_valid_continuation: bool,
@ -12,15 +12,16 @@ struct TokenConflictStatus {
matches_same_string: bool,
}
pub(crate) struct TokenConflictMap {
pub(crate) struct TokenConflictMap<'a> {
n: usize,
status_matrix: Vec<TokenConflictStatus>,
starting_chars_by_index: Vec<CharacterSet>,
following_chars_by_index: Vec<CharacterSet>,
grammar: &'a LexicalGrammar,
}
impl TokenConflictMap {
pub fn new(grammar: &LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
impl<'a> TokenConflictMap<'a> {
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
let following_chars = get_following_chars(&starting_chars, following_tokens);
@ -40,9 +41,16 @@ impl TokenConflictMap {
status_matrix,
starting_chars_by_index: starting_chars,
following_chars_by_index: following_chars,
grammar,
}
}
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
let left = &self.status_matrix[matrix_index(self.n, a, other)];
let right = &self.status_matrix[matrix_index(self.n, b, other)];
left == right
}
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
@ -55,9 +63,28 @@ impl TokenConflictMap {
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].does_overlap
}
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
if left.0 > right.0 {
return true;
} else if left.0 < right.0 {
return false;
}
match (
grammar.variables[left.1].is_string,
grammar.variables[right.1].is_string,
) {
(true, false) => return true,
(false, true) => return false,
_ => {}
}
left.1 < right.1
}
}
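prefer_token orders completed tokens by explicit precedence, then prefers string literals over regex-based tokens, then falls back to grammar order. A small illustration over stand-in data (the `prefer` helper is hypothetical; `is_string` mirrors the per-variable flag in LexicalGrammar):

fn prefer(is_string: &[bool], left: (i32, usize), right: (i32, usize)) -> bool {
    if left.0 != right.0 {
        return left.0 > right.0; // higher precedence wins outright
    }
    match (is_string[left.1], is_string[right.1]) {
        (true, false) => return true, // string literal beats regex token
        (false, true) => return false,
        _ => {}
    }
    left.1 < right.1 // otherwise the earlier-defined token wins
}

// With is_string = [true, false]:
// prefer(&[true, false], (0, 1), (0, 0)) == false  -- token 0 is a string
// prefer(&[true, false], (1, 1), (0, 0)) == true   -- precedence dominates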
impl fmt::Debug for TokenConflictMap {
impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TokenConflictMap {{\n")?;
@ -69,18 +96,22 @@ impl fmt::Debug for TokenConflictMap {
write!(f, " following_characters: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?;
write!(
f,
" {}: {:?},\n",
self.grammar.variables[i].name, self.following_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
write!(f, " status_matrix: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", i)?;
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
j,
self.grammar.variables[j].name,
self.status_matrix[matrix_index(self.n, i, j)]
)?;
}
@ -101,7 +132,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<C
for variable in &grammar.variables {
cursor.reset(vec![variable.start_state]);
let mut all_chars = CharacterSet::empty();
for (chars, _, _) in cursor.successors() {
for (chars, _, _, _) in cursor.successors() {
all_chars = all_chars.add(chars);
}
result.push(all_chars);
@ -162,7 +193,11 @@ fn compute_conflict_status(
// Prefer tokens with higher precedence. For tokens with equal precedence,
// prefer those listed earlier in the grammar.
let winning_id;
if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) {
if TokenConflictMap::prefer_token(
grammar,
(prev_precedence, prev_id),
(precedence, id),
) {
winning_id = prev_id;
} else {
winning_id = id;
@ -181,7 +216,7 @@ fn compute_conflict_status(
}
}
for (chars, advance_precedence, next_states) in cursor.grouped_successors() {
for (chars, advance_precedence, next_states, in_sep) in cursor.grouped_successors() {
let mut can_advance = true;
if let Some((completed_id, completed_precedence)) = completion {
let mut other_id = None;
@ -209,15 +244,13 @@ fn compute_conflict_status(
if chars.does_intersect(&following_chars[j]) {
result.0.does_match_valid_continuation = true;
}
if cursor.in_separator() {
if in_sep {
result.0.does_match_separators = true;
}
} else {
result.1.does_overlap = true;
if chars.does_intersect(&following_chars[i]) {
result.1.does_match_valid_continuation = true;
} else {
result.1.does_match_separators = true;
}
}
}
@ -231,25 +264,6 @@ fn compute_conflict_status(
result
}
fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
if left.0 > right.0 {
return true;
} else if left.0 < right.0 {
return false;
}
match (
grammar.variables[left.1].is_string,
grammar.variables[right.1].is_string,
) {
(true, false) => return true,
(false, true) => return false,
_ => {}
}
left.0 < right.0
}
fn variable_ids_for_states<'a>(
state_ids: &'a Vec<u32>,
grammar: &'a LexicalGrammar,


@ -91,6 +91,7 @@ pub(crate) struct SyntaxGrammar {
pub word_token: Option<Symbol>,
}
#[cfg(test)]
impl ProductionStep {
pub(crate) fn new(symbol: Symbol) -> Self {
Self {
@ -127,14 +128,6 @@ impl Production {
pub fn first_symbol(&self) -> Option<Symbol> {
self.steps.first().map(|s| s.symbol.clone())
}
pub fn last_precedence(&self) -> i32 {
self.steps.last().map(|s| s.precedence).unwrap_or(0)
}
pub fn last_associativity(&self) -> Option<Associativity> {
self.steps.last().map(|s| s.associativity).unwrap_or(None)
}
}
impl Default for Production {
@ -146,6 +139,7 @@ impl Default for Production {
}
}
#[cfg(test)]
impl Variable {
pub fn named(name: &str, rule: Rule) -> Self {
Self {


@ -42,7 +42,7 @@ fn main() -> error::Result<()> {
)
.get_matches();
if let Some(matches) = matches.subcommand_matches("generate") {
if let Some(_) = matches.subcommand_matches("generate") {
let mut grammar_path = env::current_dir().expect("Failed to read CWD");
grammar_path.push("grammar.js");
let grammar_json = load_js_grammar_file(grammar_path);


@ -40,7 +40,6 @@ impl Default for Nfa {
pub struct NfaCursor<'a> {
pub(crate) state_ids: Vec<u32>,
nfa: &'a Nfa,
in_sep: bool,
}
impl CharacterSet {
@ -111,7 +110,7 @@ impl CharacterSet {
CharacterSet::Exclude(other_chars) => {
chars.retain(|c| other_chars.contains(&c));
CharacterSet::Exclude(chars)
},
}
},
}
}
@ -311,7 +310,6 @@ impl<'a> NfaCursor<'a> {
let mut result = Self {
nfa,
state_ids: Vec::new(),
in_sep: true,
};
result.add_states(&mut states);
result
@ -322,81 +320,59 @@ impl<'a> NfaCursor<'a> {
self.add_states(&mut states);
}
pub fn advance(&mut self, c: char) -> bool {
let mut result = false;
let mut new_state_ids = Vec::new();
let mut any_sep_transitions = false;
for current_state_id in &self.state_ids {
if let NfaState::Advance {
chars,
state_id,
is_sep,
..
} = &self.nfa.states[*current_state_id as usize]
{
if chars.contains(c) {
if *is_sep {
any_sep_transitions = true;
}
new_state_ids.push(*state_id);
result = true;
}
}
}
if !any_sep_transitions {
self.in_sep = false;
}
self.state_ids.clear();
self.add_states(&mut new_state_ids);
result
}
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32, bool)> {
self.state_ids.iter().filter_map(move |id| {
if let NfaState::Advance {
chars,
state_id,
precedence,
..
is_sep,
} = &self.nfa.states[*id as usize]
{
Some((chars, *precedence, *state_id))
Some((chars, *precedence, *state_id, *is_sep))
} else {
None
}
})
}
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
Self::group_successors(self.successors())
}
fn group_successors<'b>(
iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
) -> Vec<(CharacterSet, i32, Vec<u32>)> {
let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
for (chars, prec, state) in iter {
iter: impl Iterator<Item = (&'b CharacterSet, i32, u32, bool)>,
) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
let mut result: Vec<(CharacterSet, i32, Vec<u32>, bool)> = Vec::new();
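// Invariant: the character sets already in `result` are pairwise disjoint.
// Each incoming transition is checked against the existing groups; overlaps
// are split off into new groups that merge the state lists, take the maximum
// precedence, and OR together the separator flags.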
for (chars, prec, state, is_sep) in iter {
let mut chars = chars.clone();
let mut i = 0;
while i < result.len() {
let intersection = result[i].0.remove_intersection(&mut chars);
if !intersection.is_empty() {
if result[i].0.is_empty() {
result[i].0 = intersection;
result[i].1 = max(result[i].1, prec);
result[i].2.push(state);
} else {
if result[i].0 == chars {
result[i].1 = max(result[i].1, prec);
result[i].2.push(state);
result[i].3 |= is_sep;
} else {
let intersection = result[i].0.remove_intersection(&mut chars);
if !intersection.is_empty() {
let mut states = result[i].2.clone();
let mut precedence = result[i].1;
states.push(state);
result.insert(i, (intersection, max(precedence, prec), states));
result.insert(
i,
(
intersection,
max(result[i].1, prec),
states,
result[i].3 || is_sep,
),
);
i += 1;
}
}
i += 1;
}
if !chars.is_empty() {
result.push((chars, prec, vec![state]));
result.push((chars, prec, vec![state], is_sep));
}
}
result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
@ -417,10 +393,6 @@ impl<'a> NfaCursor<'a> {
})
}
pub fn in_separator(&self) -> bool {
self.in_sep
}
pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
let mut i = 0;
while i < new_state_ids.len() {
@ -460,26 +432,31 @@ mod tests {
let table = [
(
vec![
(CharacterSet::empty().add_range('a', 'f'), 0, 1),
(CharacterSet::empty().add_range('d', 'i'), 1, 2),
(CharacterSet::empty().add_range('a', 'f'), 0, 1, false),
(CharacterSet::empty().add_range('d', 'i'), 1, 2, false),
],
vec![
(CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
(CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
(CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
(CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false),
(
CharacterSet::empty().add_range('d', 'f'),
1,
vec![1, 2],
false,
),
(CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false),
],
),
(
vec![
(CharacterSet::empty().add_range('a', 'z'), 0, 1),
(CharacterSet::empty().add_char('d'), 0, 2),
(CharacterSet::empty().add_char('i'), 0, 3),
(CharacterSet::empty().add_char('f'), 0, 4),
(CharacterSet::empty().add_range('a', 'z'), 0, 1, false),
(CharacterSet::empty().add_char('d'), 0, 2, false),
(CharacterSet::empty().add_char('i'), 0, 3, false),
(CharacterSet::empty().add_char('f'), 0, 4, false),
],
vec![
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
(CharacterSet::empty().add_char('d'), 0, vec![1, 2], false),
(CharacterSet::empty().add_char('f'), 0, vec![1, 4], false),
(CharacterSet::empty().add_char('i'), 0, vec![1, 3], false),
(
CharacterSet::empty()
.add_range('a', 'c')
@ -488,6 +465,7 @@ mod tests {
.add_range('j', 'z'),
0,
vec![1],
false,
),
],
),
@ -495,28 +473,10 @@ mod tests {
for row in table.iter() {
assert_eq!(
NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))),
row.1
);
}
// let successors = NfaCursor::group_successors(
// [
// (&CharacterSet::empty().add_range('a', 'f'), 1),
// (&CharacterSet::empty().add_range('d', 'i'), 2),
// ]
// .iter()
// .cloned(),
// );
//
// assert_eq!(
// successors,
// vec![
// (CharacterSet::empty().add_range('a', 'c'), vec![1],),
// (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],),
// (CharacterSet::empty().add_range('g', 'i'), vec![2],),
// ]
// );
}
#[test]


@ -6,6 +6,7 @@ use crate::rules::Rule;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
};
use std::i32;
struct NfaBuilder {
nfa: Nfa,
@ -17,7 +18,7 @@ fn is_string(rule: &Rule) -> bool {
match rule {
Rule::String(_) => true,
Rule::Metadata { rule, .. } => is_string(rule),
_ => false
_ => false,
}
}
@ -346,7 +347,9 @@ impl NfaBuilder {
fn push_split(&mut self, state_id: u32) {
let last_state_id = self.nfa.last_state_id();
self.nfa.states.push(NfaState::Split(state_id, last_state_id));
self.nfa
.states
.push(NfaState::Split(state_id, last_state_id));
}
fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
@ -354,12 +357,12 @@ impl NfaBuilder {
while i < state_ids.len() {
let state_id = state_ids[i];
let (left, right) = match &mut self.nfa.states[state_id as usize] {
NfaState::Accept {precedence, ..} => {
NfaState::Accept { precedence, .. } => {
*precedence = prec;
return;
},
}
NfaState::Split(left, right) => (*left, *right),
_ => return
_ => return,
};
if !state_ids.contains(&left) {
state_ids.push(left);
@ -383,7 +386,7 @@ mod tests {
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
let mut result = None;
let mut result_precedence = 0;
let mut result_precedence = i32::MIN;
let mut start_char = 0;
let mut end_char = 0;
for c in s.chars() {
@ -393,9 +396,14 @@ mod tests {
result_precedence = precedence;
}
}
if cursor.advance(c) {
if let Some((_, _, next_states, in_sep)) = cursor
.grouped_successors()
.into_iter()
.find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence)
{
cursor.reset(next_states);
end_char += 1;
if cursor.in_separator() {
if in_sep {
start_char = end_char;
}
} else {


@ -1,6 +1,6 @@
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::error::{Error, Result};
use crate::grammars::{ExternalToken, Variable};
use crate::grammars::{ExternalToken, Variable, VariableType};
use crate::rules::{MetadataParams, Rule, Symbol, SymbolType};
use std::collections::HashMap;
use std::mem;
@ -240,16 +240,21 @@ impl TokenExtractor {
let index = self.extracted_variables.len();
let variable = if let Some(string_value) = string_value {
Variable::anonymous(string_value, rule.clone())
Variable {
name: string_value.clone(),
kind: VariableType::Anonymous,
rule: rule.clone()
}
} else {
self.current_variable_token_count += 1;
Variable::auxiliary(
&format!(
Variable {
name: format!(
"{}_token{}",
&self.current_variable_name, self.current_variable_token_count
),
rule.clone(),
)
kind: VariableType::Auxiliary,
rule: rule.clone(),
}
};
self.extracted_variables.push(variable);


@ -2,6 +2,7 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
use crate::nfa::CharacterSet;
use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry};
use core::ops::Range;
use std::collections::{HashMap, HashSet};
use std::fmt::Write;
use std::mem::swap;
@ -12,11 +13,17 @@ macro_rules! add {
}}
}
macro_rules! add_line {
($this: tt, $($arg: tt)*) => {
macro_rules! add_whitespace {
($this: tt) => {{
for _ in 0..$this.indent_level {
write!(&mut $this.buffer, " ").unwrap();
}
}};
}
macro_rules! add_line {
($this: tt, $($arg: tt)*) => {
add_whitespace!($this);
$this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
$this.buffer += "\n";
}
@ -162,7 +169,7 @@ impl Generator {
}
}
add_line!(self, "#define LANGUAGE_VERSION {}", 6);
add_line!(self, "#define LANGUAGE_VERSION {}", 9);
add_line!(
self,
"#define STATE_COUNT {}",
@ -352,7 +359,7 @@ impl Generator {
add_line!(
self,
"ACCEPT_TOKEN({})",
self.symbol_ids[&accept_action.symbol]
self.symbol_ids[&Symbol::terminal(accept_action)]
);
}
@ -360,9 +367,10 @@ impl Generator {
for (characters, action) in state.advance_actions {
let previous_length = self.buffer.len();
add_whitespace!(self);
add!(self, "if (");
if self.add_character_set_condition(&characters, &ruled_out_characters) {
add!(self, ")");
add!(self, ")\n");
indent!(self);
if action.in_main_token {
add_line!(self, "ADVANCE({});", action.state);
@ -370,7 +378,7 @@ impl Generator {
add_line!(self, "SKIP({});", action.state);
}
if let CharacterSet::Include(chars) = characters {
ruled_out_characters.extend(chars.iter());
ruled_out_characters.extend(chars.iter().map(|c| *c as u32));
}
dedent!(self);
} else {
@ -384,9 +392,106 @@ impl Generator {
fn add_character_set_condition(
&mut self,
characters: &CharacterSet,
ruled_out_characters: &HashSet<char>,
ruled_out_characters: &HashSet<u32>,
) -> bool {
true
match characters {
CharacterSet::Include(chars) => {
let ranges = Self::get_ranges(chars, ruled_out_characters);
self.add_character_range_conditions(ranges, false)
}
CharacterSet::Exclude(chars) => {
let ranges = Self::get_ranges(chars, ruled_out_characters);
self.add_character_range_conditions(ranges, true)
}
}
}
fn add_character_range_conditions(
&mut self,
ranges: impl Iterator<Item = Range<char>>,
is_negated: bool,
) -> bool {
let line_break = "\n ";
let mut did_add = false;
for range in ranges {
if is_negated {
if did_add {
add!(self, " &&{}", line_break);
}
if range.end == range.start {
add!(self, "lookahead != ");
self.add_character(range.start);
} else if range.end as u32 == range.start as u32 + 1 {
add!(self, "lookahead != ");
self.add_character(range.start);
add!(self, " &&{}lookahead != ", line_break);
self.add_character(range.end);
} else {
add!(self, "(lookahead < ");
self.add_character(range.start);
add!(self, " || ");
self.add_character(range.end);
add!(self, " < lookahead)");
}
} else {
if did_add {
add!(self, " ||{}", line_break);
}
if range.end == range.start {
add!(self, "lookahead == ");
self.add_character(range.start);
} else if range.end as u32 == range.start as u32 + 1 {
add!(self, "lookahead == ");
self.add_character(range.start);
add!(self, " ||{}lookahead == ", line_break);
self.add_character(range.end);
} else {
add!(self, "(");
self.add_character(range.start);
add!(self, " <= lookahead && lookahead <= ");
self.add_character(range.end);
add!(self, ")");
}
}
did_add = true;
}
did_add
}
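For illustration, under the branches above, an Include set condensed to the ranges 'a'..'c' and 'e'..'e' emits a condition of the shape sketched below, and the Exclude form negates each piece and joins with && (assumed output shape, not copied from a generated parser):

// Include {a-c, e}:
//     ('a' <= lookahead && lookahead <= 'c') ||
//     lookahead == 'e'
// Exclude {a-c, e}:
//     (lookahead < 'a' || 'c' < lookahead) &&
//     lookahead != 'e'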
fn get_ranges<'a>(
chars: &'a Vec<char>,
ruled_out_characters: &'a HashSet<u32>,
) -> impl Iterator<Item = Range<char>> + 'a {
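// A trailing '\0' sentinel flushes the final pending range. Ruled-out
// characters may be absorbed into a range: earlier, more specific branches
// have already consumed them, so matching them again here is harmless.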
let mut prev_range: Option<Range<char>> = None;
chars
.iter()
.cloned()
.chain(Some('\0'))
.filter_map(move |c| {
if ruled_out_characters.contains(&(c as u32)) {
return None;
}
if let Some(range) = prev_range.clone() {
if c == '\0' {
prev_range = Some(c..c);
return Some(range);
}
let mut prev_range_successor = range.end as u32 + 1;
while prev_range_successor < c as u32 {
if !ruled_out_characters.contains(&prev_range_successor) {
prev_range = Some(c..c);
return Some(range);
}
prev_range_successor += 1;
}
prev_range = Some(range.start..c);
None
} else {
prev_range = Some(c..c);
None
}
})
}
fn add_lex_modes_list(&mut self) {
@ -577,13 +682,6 @@ impl Generator {
alias_sequence_id,
..
} => {
if !self.symbol_ids.contains_key(&symbol) {
eprintln!(
"SYMBOL: {:?} {:?}",
symbol,
self.metadata_for_symbol(symbol)
);
}
add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count);
if dynamic_precedence != 0 {
add!(self, ", .dynamic_precedence = {}", dynamic_precedence);
@ -785,7 +883,7 @@ impl Generator {
{
result.push(c);
} else {
result += match c {
let replacement = match c {
'~' => "TILDE",
'`' => "BQUOTE",
'!' => "BANG",
@ -821,7 +919,11 @@ impl Generator {
'\r' => "CR",
'\t' => "TAB",
_ => continue,
};
if !result.is_empty() && !result.ends_with("_") {
result.push('_');
}
result += replacement;
}
}
result
@ -837,6 +939,21 @@ impl Generator {
}
result
}
fn add_character(&mut self, c: char) {
if c.is_ascii() {
match c {
'\'' => add!(self, "'\\''"),
'\\' => add!(self, "'\\\\'"),
'\t' => add!(self, "'\\t'"),
'\n' => add!(self, "'\\n'"),
'\r' => add!(self, "'\\r'"),
_ => add!(self, "'{}'", c),
}
} else {
add!(self, "{}", c as u32)
}
}
}
pub(crate) fn render_c_code(
@ -867,3 +984,49 @@ pub(crate) fn render_c_code(
}
.generate()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_get_char_ranges() {
struct Row {
chars: Vec<char>,
ruled_out_chars: Vec<char>,
expected_ranges: Vec<Range<char>>,
}
let table = [
Row {
chars: vec!['a'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'a'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'z'],
ruled_out_chars: vec![],
expected_ranges: vec!['a'..'c', 'e'..'e', 'z'..'z'],
},
Row {
chars: vec!['a', 'b', 'c', 'e', 'h', 'z'],
ruled_out_chars: vec!['d', 'f', 'g'],
expected_ranges: vec!['a'..'h', 'z'..'z'],
},
];
for Row {
chars,
ruled_out_chars,
expected_ranges,
} in table.iter()
{
let ruled_out_chars = ruled_out_chars
.into_iter()
.map(|c: &char| *c as u32)
.collect();
let ranges = Generator::get_ranges(chars, &ruled_out_chars).collect::<Vec<_>>();
assert_eq!(ranges, *expected_ranges);
}
}
}


@ -120,7 +120,10 @@ impl Rule {
pub fn seq(rules: Vec<Rule>) -> Self {
Rule::Seq(rules)
}
}
#[cfg(test)]
impl Rule {
pub fn terminal(index: usize) -> Self {
Rule::Symbol(Symbol::terminal(index))
}


@ -1,7 +1,6 @@
use crate::nfa::CharacterSet;
use crate::rules::{Alias, Associativity, Symbol};
use std::collections::HashMap;
use std::ops::Range;
pub(crate) type AliasSequenceId = usize;
pub(crate) type ParseStateId = usize;
@ -50,21 +49,13 @@ pub(crate) struct ParseTable {
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AdvanceAction {
pub state: LexStateId,
pub precedence: Range<i32>,
pub in_main_token: bool,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct AcceptTokenAction {
pub symbol: Symbol,
pub precedence: i32,
pub implicit_precedence: i32,
}
#[derive(Clone, Debug, PartialEq, Eq)]
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct LexState {
pub advance_actions: HashMap<CharacterSet, AdvanceAction>,
pub accept_action: Option<AcceptTokenAction>,
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
pub accept_action: Option<usize>,
}
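The new LexState stores an ordered list of disjoint character groups instead of a HashMap, which also gives deterministic ordering during code generation, and it accepts by lexical-variable index instead of a dedicated AcceptTokenAction. A hedged illustration of the new shape (values are hypothetical, assuming the types above are in scope):

fn example_lex_state() -> LexState {
    LexState {
        // One advance action per disjoint character group, in match order.
        advance_actions: vec![(
            CharacterSet::empty().add_range('0', '9'),
            AdvanceAction { state: 7, in_main_token: true },
        )],
        // Accept lexical variable 3 when no further character matches.
        accept_action: Some(3),
    }
}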
#[derive(Debug, PartialEq, Eq)]