Implement lex table construction
This commit is contained in:
parent
a46b8fcb46
commit
9824ebbbc3
15 changed files with 581 additions and 227 deletions
124
src/build_tables/build_lex_table.rs
Normal file
124
src/build_tables/build_lex_table.rs
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
use super::item::LookaheadSet;
|
||||
use super::token_conflicts::TokenConflictMap;
|
||||
use crate::grammars::{LexicalGrammar, SyntaxGrammar};
|
||||
use crate::nfa::NfaCursor;
|
||||
use crate::rules::Symbol;
|
||||
use crate::tables::{AdvanceAction, LexState, LexTable, ParseTable};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
pub(crate) fn build_lex_table(
|
||||
parse_table: &mut ParseTable,
|
||||
syntax_grammar: &SyntaxGrammar,
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
keywords: &LookaheadSet,
|
||||
) -> (LexTable, LexTable) {
|
||||
let keyword_lex_table;
|
||||
if syntax_grammar.word_token.is_some() {
|
||||
let mut builder = LexTableBuilder::new(lexical_grammar);
|
||||
builder.add_state_for_tokens(keywords.iter());
|
||||
keyword_lex_table = builder.table;
|
||||
} else {
|
||||
keyword_lex_table = LexTable::default();
|
||||
}
|
||||
|
||||
let mut builder = LexTableBuilder::new(lexical_grammar);
|
||||
for state in parse_table.states.iter_mut() {
|
||||
let tokens = state.terminal_entries.keys().filter_map(|token| {
|
||||
if token.is_terminal() {
|
||||
if keywords.contains(&token) {
|
||||
syntax_grammar.word_token
|
||||
} else {
|
||||
Some(*token)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
state.lex_state_id = builder.add_state_for_tokens(tokens);
|
||||
}
|
||||
|
||||
(builder.table, keyword_lex_table)
|
||||
}
|
||||
|
||||
struct LexTableBuilder<'a> {
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
cursor: NfaCursor<'a>,
|
||||
table: LexTable,
|
||||
state_queue: VecDeque<(usize, Vec<u32>)>,
|
||||
state_ids_by_nfa_state_set: HashMap<Vec<u32>, usize>,
|
||||
}
|
||||
|
||||
impl<'a> LexTableBuilder<'a> {
|
||||
fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
|
||||
Self {
|
||||
lexical_grammar,
|
||||
cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
|
||||
table: LexTable::default(),
|
||||
state_queue: VecDeque::new(),
|
||||
state_ids_by_nfa_state_set: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_state_for_tokens(&mut self, tokens: impl Iterator<Item = Symbol>) -> usize {
|
||||
let nfa_states = tokens
|
||||
.map(|token| self.lexical_grammar.variables[token.index].start_state)
|
||||
.collect();
|
||||
let result = self.add_state(nfa_states);
|
||||
while let Some((state_id, nfa_states)) = self.state_queue.pop_front() {
|
||||
self.populate_state(state_id, nfa_states);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn add_state(&mut self, nfa_states: Vec<u32>) -> usize {
|
||||
match self.state_ids_by_nfa_state_set.entry(nfa_states) {
|
||||
Entry::Occupied(o) => *o.get(),
|
||||
Entry::Vacant(v) => {
|
||||
let state_id = self.table.states.len();
|
||||
self.table.states.push(LexState::default());
|
||||
self.state_queue.push_back((state_id, v.key().clone()));
|
||||
v.insert(state_id);
|
||||
state_id
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>) {
|
||||
self.cursor.reset(nfa_states);
|
||||
|
||||
let mut completion = None;
|
||||
for (id, prec) in self.cursor.completions() {
|
||||
if let Some((prev_id, prev_precedence)) = completion {
|
||||
if TokenConflictMap::prefer_token(
|
||||
self.lexical_grammar,
|
||||
(prev_precedence, prev_id),
|
||||
(prec, id),
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
completion = Some((id, prec));
|
||||
}
|
||||
|
||||
for (chars, advance_precedence, next_states, is_sep) in self.cursor.grouped_successors() {
|
||||
if let Some((_, completed_precedence)) = completion {
|
||||
if advance_precedence < completed_precedence {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let next_state_id = self.add_state(next_states);
|
||||
self.table.states[state_id].advance_actions.push((
|
||||
chars,
|
||||
AdvanceAction {
|
||||
state: next_state_id,
|
||||
in_main_token: !is_sep,
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
if let Some((completion_index, _)) = completion {
|
||||
self.table.states[state_id].accept_action = Some(completion_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -7,10 +7,10 @@ use crate::tables::{
|
|||
AliasSequenceId, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
|
||||
};
|
||||
use core::ops::Range;
|
||||
use std::hash::Hasher;
|
||||
use std::collections::hash_map::{Entry, DefaultHasher};
|
||||
use std::collections::hash_map::{DefaultHasher, Entry};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::fmt::Write;
|
||||
use std::hash::Hasher;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct AuxiliarySymbolInfo {
|
||||
|
|
@ -31,7 +31,6 @@ struct ParseTableBuilder<'a> {
|
|||
item_set_builder: ParseItemSetBuilder<'a>,
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
|
||||
item_sets_by_state_id: Vec<ParseItemSet<'a>>,
|
||||
parse_state_queue: VecDeque<ParseStateQueueEntry>,
|
||||
|
|
@ -51,9 +50,12 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
&Vec::new(),
|
||||
&Vec::new(),
|
||||
ParseItemSet::with(
|
||||
[(ParseItem::start(), LookaheadSet::with([Symbol::end()].iter().cloned()))]
|
||||
.iter()
|
||||
.cloned(),
|
||||
[(
|
||||
ParseItem::start(),
|
||||
LookaheadSet::with([Symbol::end()].iter().cloned()),
|
||||
)]
|
||||
.iter()
|
||||
.cloned(),
|
||||
),
|
||||
);
|
||||
|
||||
|
|
@ -69,8 +71,12 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
item_set: ParseItemSet<'a>,
|
||||
) -> ParseStateId {
|
||||
if preceding_symbols.len() > 1 {
|
||||
let left_tokens = self.item_set_builder.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
|
||||
let right_tokens = self.item_set_builder.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
|
||||
let left_tokens = self
|
||||
.item_set_builder
|
||||
.last_set(&preceding_symbols[preceding_symbols.len() - 2]);
|
||||
let right_tokens = self
|
||||
.item_set_builder
|
||||
.first_set(&preceding_symbols[preceding_symbols.len() - 1]);
|
||||
for left_token in left_tokens.iter() {
|
||||
if left_token.is_terminal() {
|
||||
self.following_tokens[left_token.index].insert_all(right_tokens);
|
||||
|
|
@ -117,11 +123,9 @@ impl<'a> ParseTableBuilder<'a> {
|
|||
);
|
||||
}
|
||||
|
||||
let item_set = self.item_set_builder.transitive_closure(
|
||||
&self.item_sets_by_state_id[entry.state_id],
|
||||
self.syntax_grammar,
|
||||
self.inlines,
|
||||
);
|
||||
let item_set = self
|
||||
.item_set_builder
|
||||
.transitive_closure(&self.item_sets_by_state_id[entry.state_id]);
|
||||
|
||||
if debug {
|
||||
println!(
|
||||
|
|
@ -606,7 +610,6 @@ pub(crate) fn build_parse_table(
|
|||
ParseTableBuilder {
|
||||
syntax_grammar,
|
||||
lexical_grammar,
|
||||
inlines,
|
||||
item_set_builder: ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines),
|
||||
state_ids_by_item_set: HashMap::new(),
|
||||
item_sets_by_state_id: Vec::new(),
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ struct FollowSetInfo {
|
|||
pub(crate) struct ParseItemSetBuilder<'a> {
|
||||
first_sets: HashMap<Symbol, LookaheadSet>,
|
||||
last_sets: HashMap<Symbol, LookaheadSet>,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
|
||||
}
|
||||
|
||||
|
|
@ -36,6 +37,7 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
let mut result = Self {
|
||||
first_sets: HashMap::new(),
|
||||
last_sets: HashMap::new(),
|
||||
inlines,
|
||||
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
|
||||
};
|
||||
|
||||
|
|
@ -237,15 +239,12 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
result
|
||||
}
|
||||
|
||||
pub(crate) fn transitive_closure(
|
||||
&mut self,
|
||||
item_set: &ParseItemSet<'a>,
|
||||
grammar: &'a SyntaxGrammar,
|
||||
inlines: &'a InlinedProductionMap,
|
||||
) -> ParseItemSet<'a> {
|
||||
pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
|
||||
let mut result = ParseItemSet::default();
|
||||
for (item, lookaheads) in &item_set.entries {
|
||||
if let Some(productions) = inlines.inlined_productions(item.production, item.step_index)
|
||||
if let Some(productions) = self
|
||||
.inlines
|
||||
.inlined_productions(item.production, item.step_index)
|
||||
{
|
||||
for production in productions {
|
||||
self.add_item(
|
||||
|
|
@ -273,12 +272,7 @@ impl<'a> ParseItemSetBuilder<'a> {
|
|||
&self.first_sets[symbol]
|
||||
}
|
||||
|
||||
fn add_item(
|
||||
&self,
|
||||
set: &mut ParseItemSet<'a>,
|
||||
item: ParseItem<'a>,
|
||||
lookaheads: &LookaheadSet,
|
||||
) {
|
||||
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet) {
|
||||
if let Some(step) = item.step() {
|
||||
if step.symbol.is_non_terminal() {
|
||||
let next_step = item.successor().step();
|
||||
|
|
|
|||
|
|
@ -1,24 +0,0 @@
|
|||
use crate::rules::Symbol;
|
||||
use crate::tables::LexTable;
|
||||
use crate::grammars::{SyntaxGrammar, LexicalGrammar};
|
||||
|
||||
pub(crate) struct LexTableBuilder<'a> {
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
table: LexTable,
|
||||
}
|
||||
|
||||
impl<'a> LexTableBuilder<'a> {
|
||||
pub fn new(
|
||||
syntax_grammar: &'a SyntaxGrammar,
|
||||
lexical_grammar: &'a LexicalGrammar,
|
||||
) -> Self {
|
||||
Self {
|
||||
syntax_grammar, lexical_grammar, table: LexTable::default()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> (LexTable, LexTable, Option<Symbol>) {
|
||||
(LexTable::default(), LexTable::default(), None)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,11 +1,12 @@
|
|||
mod build_lex_table;
|
||||
mod build_parse_table;
|
||||
mod coincident_tokens;
|
||||
mod item;
|
||||
mod item_set_builder;
|
||||
mod lex_table_builder;
|
||||
mod shrink_parse_table;
|
||||
mod token_conflicts;
|
||||
|
||||
use self::build_lex_table::build_lex_table;
|
||||
use self::build_parse_table::build_parse_table;
|
||||
use self::coincident_tokens::CoincidentTokenIndex;
|
||||
use self::item::LookaheadSet;
|
||||
|
|
@ -13,6 +14,7 @@ use self::shrink_parse_table::shrink_parse_table;
|
|||
use self::token_conflicts::TokenConflictMap;
|
||||
use crate::error::Result;
|
||||
use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
|
||||
use crate::nfa::{CharacterSet, NfaCursor};
|
||||
use crate::rules::{AliasMap, Symbol};
|
||||
use crate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
|
||||
|
||||
|
|
@ -25,7 +27,22 @@ pub(crate) fn build_tables(
|
|||
let (mut parse_table, following_tokens) =
|
||||
build_parse_table(syntax_grammar, lexical_grammar, inlines)?;
|
||||
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
|
||||
|
||||
eprintln!("{:?}", token_conflict_map);
|
||||
|
||||
let coincident_token_index = CoincidentTokenIndex::new(&parse_table);
|
||||
let keywords = if let Some(word_token) = syntax_grammar.word_token {
|
||||
identify_keywords(
|
||||
lexical_grammar,
|
||||
&parse_table,
|
||||
word_token,
|
||||
&token_conflict_map,
|
||||
&coincident_token_index,
|
||||
)
|
||||
} else {
|
||||
LookaheadSet::new()
|
||||
};
|
||||
|
||||
populate_error_state(
|
||||
&mut parse_table,
|
||||
syntax_grammar,
|
||||
|
|
@ -39,7 +56,14 @@ pub(crate) fn build_tables(
|
|||
simple_aliases,
|
||||
&token_conflict_map,
|
||||
);
|
||||
Ok((parse_table, LexTable::default(), LexTable::default(), None))
|
||||
let (main_lex_table, keyword_lex_table) =
|
||||
build_lex_table(&mut parse_table, syntax_grammar, lexical_grammar, &keywords);
|
||||
Ok((
|
||||
parse_table,
|
||||
main_lex_table,
|
||||
keyword_lex_table,
|
||||
syntax_grammar.word_token,
|
||||
))
|
||||
}
|
||||
|
||||
fn populate_error_state(
|
||||
|
|
@ -77,13 +101,10 @@ fn populate_error_state(
|
|||
|| !token_conflict_map.does_conflict(i, t.index)
|
||||
});
|
||||
if can_be_used_for_recovery {
|
||||
eprintln!("include {}", &lexical_grammar.variables[symbol.index].name);
|
||||
state
|
||||
.terminal_entries
|
||||
.entry(symbol)
|
||||
.or_insert_with(|| recover_entry.clone());
|
||||
} else {
|
||||
eprintln!("exclude {}", &lexical_grammar.variables[symbol.index].name);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -98,3 +119,103 @@ fn populate_error_state(
|
|||
|
||||
state.terminal_entries.insert(Symbol::end(), recover_entry);
|
||||
}
|
||||
|
||||
fn identify_keywords(
|
||||
lexical_grammar: &LexicalGrammar,
|
||||
parse_table: &ParseTable,
|
||||
word_token: Symbol,
|
||||
token_conflict_map: &TokenConflictMap,
|
||||
coincident_token_index: &CoincidentTokenIndex,
|
||||
) -> LookaheadSet {
|
||||
let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
|
||||
|
||||
// First find all of the candidate keyword tokens: tokens that start with
|
||||
// letters or underscore and can match the same string as a word token.
|
||||
let keywords = LookaheadSet::with(lexical_grammar.variables.iter().enumerate().filter_map(
|
||||
|(i, variable)| {
|
||||
cursor.reset(vec![variable.start_state]);
|
||||
if all_chars_are_alphabetical(&cursor)
|
||||
&& token_conflict_map.does_match_same_string(i, word_token.index)
|
||||
{
|
||||
Some(Symbol::terminal(i))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
));
|
||||
|
||||
// Exclude keyword candidates that shadow another keyword candidate.
|
||||
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
|
||||
for other_token in keywords.iter() {
|
||||
if other_token != *token
|
||||
&& token_conflict_map.does_match_same_string(token.index, other_token.index)
|
||||
{
|
||||
eprintln!(
|
||||
"Exclude {} from keywords because it matches the same string as {}",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
lexical_grammar.variables[other_token.index].name
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}));
|
||||
|
||||
// Exclude keyword candidates for which substituting the keyword capture
|
||||
// token would introduce new lexical conflicts with other tokens.
|
||||
let keywords = LookaheadSet::with(keywords.iter().filter(|token| {
|
||||
for other_index in 0..lexical_grammar.variables.len() {
|
||||
if keywords.contains(&Symbol::terminal(other_index)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the word token was already valid in every state containing
|
||||
// this keyword candidate, then substituting the word token won't
|
||||
// introduce any new lexical conflicts.
|
||||
if coincident_token_index
|
||||
.states_with(*token, Symbol::terminal(other_index))
|
||||
.iter()
|
||||
.all(|state_id| {
|
||||
parse_table.states[*state_id]
|
||||
.terminal_entries
|
||||
.contains_key(&word_token)
|
||||
})
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if !token_conflict_map.has_same_conflict_status(
|
||||
token.index,
|
||||
word_token.index,
|
||||
other_index,
|
||||
) {
|
||||
eprintln!(
|
||||
"Exclude {} from keywords because of conflict with {}",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
lexical_grammar.variables[other_index].name
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"Include {} in keywords",
|
||||
lexical_grammar.variables[token.index].name,
|
||||
);
|
||||
true
|
||||
}));
|
||||
|
||||
keywords
|
||||
}
|
||||
|
||||
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
|
||||
cursor.successors().all(|(chars, _, _, is_sep)| {
|
||||
if is_sep {
|
||||
true
|
||||
} else if let CharacterSet::Include(chars) = chars {
|
||||
chars.iter().all(|c| c.is_alphabetic() || *c == '_')
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -166,8 +166,6 @@ fn merge_parse_state(
|
|||
}
|
||||
}
|
||||
|
||||
eprintln!("maybe merge {} {}", left, right);
|
||||
|
||||
let mut symbols_to_add = Vec::new();
|
||||
for (symbol, right_entry) in &right_state.terminal_entries {
|
||||
if !left_state.terminal_entries.contains_key(&symbol) {
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ use crate::nfa::{CharacterSet, NfaCursor};
|
|||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
struct TokenConflictStatus {
|
||||
does_overlap: bool,
|
||||
does_match_valid_continuation: bool,
|
||||
|
|
@ -12,15 +12,16 @@ struct TokenConflictStatus {
|
|||
matches_same_string: bool,
|
||||
}
|
||||
|
||||
pub(crate) struct TokenConflictMap {
|
||||
pub(crate) struct TokenConflictMap<'a> {
|
||||
n: usize,
|
||||
status_matrix: Vec<TokenConflictStatus>,
|
||||
starting_chars_by_index: Vec<CharacterSet>,
|
||||
following_chars_by_index: Vec<CharacterSet>,
|
||||
grammar: &'a LexicalGrammar,
|
||||
}
|
||||
|
||||
impl TokenConflictMap {
|
||||
pub fn new(grammar: &LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
|
||||
impl<'a> TokenConflictMap<'a> {
|
||||
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<LookaheadSet>) -> Self {
|
||||
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
|
||||
let starting_chars = get_starting_chars(&mut cursor, grammar);
|
||||
let following_chars = get_following_chars(&starting_chars, following_tokens);
|
||||
|
|
@ -40,9 +41,16 @@ impl TokenConflictMap {
|
|||
status_matrix,
|
||||
starting_chars_by_index: starting_chars,
|
||||
following_chars_by_index: following_chars,
|
||||
grammar,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
|
||||
let left = &self.status_matrix[matrix_index(self.n, a, other)];
|
||||
let right = &self.status_matrix[matrix_index(self.n, b, other)];
|
||||
left == right
|
||||
}
|
||||
|
||||
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
|
||||
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
|
||||
}
|
||||
|
|
@ -55,9 +63,28 @@ impl TokenConflictMap {
|
|||
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
|
||||
self.status_matrix[matrix_index(self.n, i, j)].does_overlap
|
||||
}
|
||||
|
||||
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
|
||||
if left.0 > right.0 {
|
||||
return true;
|
||||
} else if left.0 < right.0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
match (
|
||||
grammar.variables[left.1].is_string,
|
||||
grammar.variables[right.1].is_string,
|
||||
) {
|
||||
(true, false) => return true,
|
||||
(false, true) => return false,
|
||||
_ => {}
|
||||
}
|
||||
|
||||
left.0 < right.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for TokenConflictMap {
|
||||
impl<'a> fmt::Debug for TokenConflictMap<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "TokenConflictMap {{\n")?;
|
||||
|
||||
|
|
@ -69,18 +96,22 @@ impl fmt::Debug for TokenConflictMap {
|
|||
|
||||
write!(f, " following_characters: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(f, " {}: {:?},\n", i, self.following_chars_by_index[i])?;
|
||||
write!(
|
||||
f,
|
||||
" {}: {:?},\n",
|
||||
self.grammar.variables[i].name, self.following_chars_by_index[i]
|
||||
)?;
|
||||
}
|
||||
write!(f, " }},\n")?;
|
||||
|
||||
write!(f, " status_matrix: {{\n")?;
|
||||
for i in 0..self.n {
|
||||
write!(f, " {}: {{\n", i)?;
|
||||
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
|
||||
for j in 0..self.n {
|
||||
write!(
|
||||
f,
|
||||
" {}: {:?},\n",
|
||||
j,
|
||||
self.grammar.variables[j].name,
|
||||
self.status_matrix[matrix_index(self.n, i, j)]
|
||||
)?;
|
||||
}
|
||||
|
|
@ -101,7 +132,7 @@ fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<C
|
|||
for variable in &grammar.variables {
|
||||
cursor.reset(vec![variable.start_state]);
|
||||
let mut all_chars = CharacterSet::empty();
|
||||
for (chars, _, _) in cursor.successors() {
|
||||
for (chars, _, _, _) in cursor.successors() {
|
||||
all_chars = all_chars.add(chars);
|
||||
}
|
||||
result.push(all_chars);
|
||||
|
|
@ -162,7 +193,11 @@ fn compute_conflict_status(
|
|||
// Prefer tokens with higher precedence. For tokens with equal precedence,
|
||||
// prefer those listed earlier in the grammar.
|
||||
let winning_id;
|
||||
if prefer_token(grammar, (prev_precedence, prev_id), (precedence, id)) {
|
||||
if TokenConflictMap::prefer_token(
|
||||
grammar,
|
||||
(prev_precedence, prev_id),
|
||||
(precedence, id),
|
||||
) {
|
||||
winning_id = prev_id;
|
||||
} else {
|
||||
winning_id = id;
|
||||
|
|
@ -181,7 +216,7 @@ fn compute_conflict_status(
|
|||
}
|
||||
}
|
||||
|
||||
for (chars, advance_precedence, next_states) in cursor.grouped_successors() {
|
||||
for (chars, advance_precedence, next_states, in_sep) in cursor.grouped_successors() {
|
||||
let mut can_advance = true;
|
||||
if let Some((completed_id, completed_precedence)) = completion {
|
||||
let mut other_id = None;
|
||||
|
|
@ -209,15 +244,13 @@ fn compute_conflict_status(
|
|||
if chars.does_intersect(&following_chars[j]) {
|
||||
result.0.does_match_valid_continuation = true;
|
||||
}
|
||||
if cursor.in_separator() {
|
||||
if in_sep {
|
||||
result.0.does_match_separators = true;
|
||||
}
|
||||
} else {
|
||||
result.1.does_overlap = true;
|
||||
if chars.does_intersect(&following_chars[i]) {
|
||||
result.1.does_match_valid_continuation = true;
|
||||
} else {
|
||||
result.1.does_match_separators = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -231,25 +264,6 @@ fn compute_conflict_status(
|
|||
result
|
||||
}
|
||||
|
||||
fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
|
||||
if left.0 > right.0 {
|
||||
return true;
|
||||
} else if left.0 < right.0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
match (
|
||||
grammar.variables[left.1].is_string,
|
||||
grammar.variables[right.1].is_string,
|
||||
) {
|
||||
(true, false) => return true,
|
||||
(false, true) => return false,
|
||||
_ => {}
|
||||
}
|
||||
|
||||
left.0 < right.0
|
||||
}
|
||||
|
||||
fn variable_ids_for_states<'a>(
|
||||
state_ids: &'a Vec<u32>,
|
||||
grammar: &'a LexicalGrammar,
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ pub(crate) struct SyntaxGrammar {
|
|||
pub word_token: Option<Symbol>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl ProductionStep {
|
||||
pub(crate) fn new(symbol: Symbol) -> Self {
|
||||
Self {
|
||||
|
|
@ -127,14 +128,6 @@ impl Production {
|
|||
/// Returns a copy of the symbol of the first step, or `None` when there are no steps.
pub fn first_symbol(&self) -> Option<Symbol> {
    let first_step = self.steps.first()?;
    Some(first_step.symbol.clone())
}
|
||||
|
||||
pub fn last_precedence(&self) -> i32 {
|
||||
self.steps.last().map(|s| s.precedence).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Returns the associativity of the last step, or `None` when there are no
/// steps or the last step has none.
pub fn last_associativity(&self) -> Option<Associativity> {
    self.steps.last().and_then(|step| step.associativity)
}
|
||||
}
|
||||
|
||||
impl Default for Production {
|
||||
|
|
@ -146,6 +139,7 @@ impl Default for Production {
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl Variable {
|
||||
pub fn named(name: &str, rule: Rule) -> Self {
|
||||
Self {
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ fn main() -> error::Result<()> {
|
|||
)
|
||||
.get_matches();
|
||||
|
||||
if let Some(matches) = matches.subcommand_matches("generate") {
|
||||
if let Some(_) = matches.subcommand_matches("generate") {
|
||||
let mut grammar_path = env::current_dir().expect("Failed to read CWD");
|
||||
grammar_path.push("grammar.js");
|
||||
let grammar_json = load_js_grammar_file(grammar_path);
|
||||
|
|
|
|||
130
src/nfa.rs
130
src/nfa.rs
|
|
@ -40,7 +40,6 @@ impl Default for Nfa {
|
|||
pub struct NfaCursor<'a> {
|
||||
pub(crate) state_ids: Vec<u32>,
|
||||
nfa: &'a Nfa,
|
||||
in_sep: bool,
|
||||
}
|
||||
|
||||
impl CharacterSet {
|
||||
|
|
@ -111,7 +110,7 @@ impl CharacterSet {
|
|||
CharacterSet::Exclude(other_chars) => {
|
||||
chars.retain(|c| other_chars.contains(&c));
|
||||
CharacterSet::Exclude(chars)
|
||||
},
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
@ -311,7 +310,6 @@ impl<'a> NfaCursor<'a> {
|
|||
let mut result = Self {
|
||||
nfa,
|
||||
state_ids: Vec::new(),
|
||||
in_sep: true,
|
||||
};
|
||||
result.add_states(&mut states);
|
||||
result
|
||||
|
|
@ -322,81 +320,59 @@ impl<'a> NfaCursor<'a> {
|
|||
self.add_states(&mut states);
|
||||
}
|
||||
|
||||
pub fn advance(&mut self, c: char) -> bool {
|
||||
let mut result = false;
|
||||
let mut new_state_ids = Vec::new();
|
||||
let mut any_sep_transitions = false;
|
||||
for current_state_id in &self.state_ids {
|
||||
if let NfaState::Advance {
|
||||
chars,
|
||||
state_id,
|
||||
is_sep,
|
||||
..
|
||||
} = &self.nfa.states[*current_state_id as usize]
|
||||
{
|
||||
if chars.contains(c) {
|
||||
if *is_sep {
|
||||
any_sep_transitions = true;
|
||||
}
|
||||
new_state_ids.push(*state_id);
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !any_sep_transitions {
|
||||
self.in_sep = false;
|
||||
}
|
||||
self.state_ids.clear();
|
||||
self.add_states(&mut new_state_ids);
|
||||
result
|
||||
}
|
||||
|
||||
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32)> {
|
||||
pub fn successors(&self) -> impl Iterator<Item = (&CharacterSet, i32, u32, bool)> {
|
||||
self.state_ids.iter().filter_map(move |id| {
|
||||
if let NfaState::Advance {
|
||||
chars,
|
||||
state_id,
|
||||
precedence,
|
||||
..
|
||||
is_sep,
|
||||
} = &self.nfa.states[*id as usize]
|
||||
{
|
||||
Some((chars, *precedence, *state_id))
|
||||
Some((chars, *precedence, *state_id, *is_sep))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>)> {
|
||||
pub fn grouped_successors(&self) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
|
||||
Self::group_successors(self.successors())
|
||||
}
|
||||
|
||||
fn group_successors<'b>(
|
||||
iter: impl Iterator<Item = (&'b CharacterSet, i32, u32)>,
|
||||
) -> Vec<(CharacterSet, i32, Vec<u32>)> {
|
||||
let mut result: Vec<(CharacterSet, i32, Vec<u32>)> = Vec::new();
|
||||
for (chars, prec, state) in iter {
|
||||
iter: impl Iterator<Item = (&'b CharacterSet, i32, u32, bool)>,
|
||||
) -> Vec<(CharacterSet, i32, Vec<u32>, bool)> {
|
||||
let mut result: Vec<(CharacterSet, i32, Vec<u32>, bool)> = Vec::new();
|
||||
for (chars, prec, state, is_sep) in iter {
|
||||
let mut chars = chars.clone();
|
||||
let mut i = 0;
|
||||
while i < result.len() {
|
||||
let intersection = result[i].0.remove_intersection(&mut chars);
|
||||
if !intersection.is_empty() {
|
||||
if result[i].0.is_empty() {
|
||||
result[i].0 = intersection;
|
||||
result[i].1 = max(result[i].1, prec);
|
||||
result[i].2.push(state);
|
||||
} else {
|
||||
if result[i].0 == chars {
|
||||
result[i].1 = max(result[i].1, prec);
|
||||
result[i].2.push(state);
|
||||
result[i].3 |= is_sep;
|
||||
} else {
|
||||
let intersection = result[i].0.remove_intersection(&mut chars);
|
||||
if !intersection.is_empty() {
|
||||
let mut states = result[i].2.clone();
|
||||
let mut precedence = result[i].1;
|
||||
states.push(state);
|
||||
result.insert(i, (intersection, max(precedence, prec), states));
|
||||
result.insert(
|
||||
i,
|
||||
(
|
||||
intersection,
|
||||
max(result[i].1, prec),
|
||||
states,
|
||||
result[i].3 || is_sep,
|
||||
),
|
||||
);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
if !chars.is_empty() {
|
||||
result.push((chars, prec, vec![state]));
|
||||
result.push((chars, prec, vec![state], is_sep));
|
||||
}
|
||||
}
|
||||
result.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
|
|
@ -417,10 +393,6 @@ impl<'a> NfaCursor<'a> {
|
|||
})
|
||||
}
|
||||
|
||||
/// Returns the cursor's current `in_sep` flag.
pub fn in_separator(&self) -> bool {
|
||||
self.in_sep
|
||||
}
|
||||
|
||||
pub fn add_states(&mut self, new_state_ids: &mut Vec<u32>) {
|
||||
let mut i = 0;
|
||||
while i < new_state_ids.len() {
|
||||
|
|
@ -460,26 +432,31 @@ mod tests {
|
|||
let table = [
|
||||
(
|
||||
vec![
|
||||
(CharacterSet::empty().add_range('a', 'f'), 0, 1),
|
||||
(CharacterSet::empty().add_range('d', 'i'), 1, 2),
|
||||
(CharacterSet::empty().add_range('a', 'f'), 0, 1, false),
|
||||
(CharacterSet::empty().add_range('d', 'i'), 1, 2, false),
|
||||
],
|
||||
vec![
|
||||
(CharacterSet::empty().add_range('a', 'c'), 0, vec![1]),
|
||||
(CharacterSet::empty().add_range('d', 'f'), 1, vec![1, 2]),
|
||||
(CharacterSet::empty().add_range('g', 'i'), 1, vec![2]),
|
||||
(CharacterSet::empty().add_range('a', 'c'), 0, vec![1], false),
|
||||
(
|
||||
CharacterSet::empty().add_range('d', 'f'),
|
||||
1,
|
||||
vec![1, 2],
|
||||
false,
|
||||
),
|
||||
(CharacterSet::empty().add_range('g', 'i'), 1, vec![2], false),
|
||||
],
|
||||
),
|
||||
(
|
||||
vec![
|
||||
(CharacterSet::empty().add_range('a', 'z'), 0, 1),
|
||||
(CharacterSet::empty().add_char('d'), 0, 2),
|
||||
(CharacterSet::empty().add_char('i'), 0, 3),
|
||||
(CharacterSet::empty().add_char('f'), 0, 4),
|
||||
(CharacterSet::empty().add_range('a', 'z'), 0, 1, false),
|
||||
(CharacterSet::empty().add_char('d'), 0, 2, false),
|
||||
(CharacterSet::empty().add_char('i'), 0, 3, false),
|
||||
(CharacterSet::empty().add_char('f'), 0, 4, false),
|
||||
],
|
||||
vec![
|
||||
(CharacterSet::empty().add_char('d'), 0, vec![1, 2]),
|
||||
(CharacterSet::empty().add_char('f'), 0, vec![1, 4]),
|
||||
(CharacterSet::empty().add_char('i'), 0, vec![1, 3]),
|
||||
(CharacterSet::empty().add_char('d'), 0, vec![1, 2], false),
|
||||
(CharacterSet::empty().add_char('f'), 0, vec![1, 4], false),
|
||||
(CharacterSet::empty().add_char('i'), 0, vec![1, 3], false),
|
||||
(
|
||||
CharacterSet::empty()
|
||||
.add_range('a', 'c')
|
||||
|
|
@ -488,6 +465,7 @@ mod tests {
|
|||
.add_range('j', 'z'),
|
||||
0,
|
||||
vec![1],
|
||||
false,
|
||||
),
|
||||
],
|
||||
),
|
||||
|
|
@ -495,28 +473,10 @@ mod tests {
|
|||
|
||||
for row in table.iter() {
|
||||
assert_eq!(
|
||||
NfaCursor::group_successors(row.0.iter().map(|(c, p, s)| (c, *p, *s))),
|
||||
NfaCursor::group_successors(row.0.iter().map(|(c, p, s, sep)| (c, *p, *s, *sep))),
|
||||
row.1
|
||||
);
|
||||
}
|
||||
|
||||
// let successors = NfaCursor::group_successors(
|
||||
// [
|
||||
// (&CharacterSet::empty().add_range('a', 'f'), 1),
|
||||
// (&CharacterSet::empty().add_range('d', 'i'), 2),
|
||||
// ]
|
||||
// .iter()
|
||||
// .cloned(),
|
||||
// );
|
||||
//
|
||||
// assert_eq!(
|
||||
// successors,
|
||||
// vec![
|
||||
// (CharacterSet::empty().add_range('a', 'c'), vec![1],),
|
||||
// (CharacterSet::empty().add_range('d', 'f'), vec![1, 2],),
|
||||
// (CharacterSet::empty().add_range('g', 'i'), vec![2],),
|
||||
// ]
|
||||
// );
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ use crate::rules::Rule;
|
|||
use regex_syntax::ast::{
|
||||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
|
||||
};
|
||||
use std::i32;
|
||||
|
||||
struct NfaBuilder {
|
||||
nfa: Nfa,
|
||||
|
|
@ -17,7 +18,7 @@ fn is_string(rule: &Rule) -> bool {
|
|||
match rule {
|
||||
Rule::String(_) => true,
|
||||
Rule::Metadata { rule, .. } => is_string(rule),
|
||||
_ => false
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -346,7 +347,9 @@ impl NfaBuilder {
|
|||
|
||||
fn push_split(&mut self, state_id: u32) {
|
||||
let last_state_id = self.nfa.last_state_id();
|
||||
self.nfa.states.push(NfaState::Split(state_id, last_state_id));
|
||||
self.nfa
|
||||
.states
|
||||
.push(NfaState::Split(state_id, last_state_id));
|
||||
}
|
||||
|
||||
fn add_precedence(&mut self, prec: i32, mut state_ids: Vec<u32>) {
|
||||
|
|
@ -354,12 +357,12 @@ impl NfaBuilder {
|
|||
while i < state_ids.len() {
|
||||
let state_id = state_ids[i];
|
||||
let (left, right) = match &mut self.nfa.states[state_id as usize] {
|
||||
NfaState::Accept {precedence, ..} => {
|
||||
NfaState::Accept { precedence, .. } => {
|
||||
*precedence = prec;
|
||||
return;
|
||||
},
|
||||
}
|
||||
NfaState::Split(left, right) => (*left, *right),
|
||||
_ => return
|
||||
_ => return,
|
||||
};
|
||||
if !state_ids.contains(&left) {
|
||||
state_ids.push(left);
|
||||
|
|
@ -383,7 +386,7 @@ mod tests {
|
|||
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
|
||||
|
||||
let mut result = None;
|
||||
let mut result_precedence = 0;
|
||||
let mut result_precedence = i32::MIN;
|
||||
let mut start_char = 0;
|
||||
let mut end_char = 0;
|
||||
for c in s.chars() {
|
||||
|
|
@ -393,9 +396,14 @@ mod tests {
|
|||
result_precedence = precedence;
|
||||
}
|
||||
}
|
||||
if cursor.advance(c) {
|
||||
if let Some((_, _, next_states, in_sep)) = cursor
|
||||
.grouped_successors()
|
||||
.into_iter()
|
||||
.find(|(chars, prec, _, _)| chars.contains(c) && *prec >= result_precedence)
|
||||
{
|
||||
cursor.reset(next_states);
|
||||
end_char += 1;
|
||||
if cursor.in_separator() {
|
||||
if in_sep {
|
||||
start_char = end_char;
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
|
||||
use crate::error::{Error, Result};
|
||||
use crate::grammars::{ExternalToken, Variable};
|
||||
use crate::grammars::{ExternalToken, Variable, VariableType};
|
||||
use crate::rules::{MetadataParams, Rule, Symbol, SymbolType};
|
||||
use std::collections::HashMap;
|
||||
use std::mem;
|
||||
|
|
@ -240,16 +240,21 @@ impl TokenExtractor {
|
|||
|
||||
let index = self.extracted_variables.len();
|
||||
let variable = if let Some(string_value) = string_value {
|
||||
Variable::anonymous(string_value, rule.clone())
|
||||
Variable {
|
||||
name: string_value.clone(),
|
||||
kind: VariableType::Anonymous,
|
||||
rule: rule.clone()
|
||||
}
|
||||
} else {
|
||||
self.current_variable_token_count += 1;
|
||||
Variable::auxiliary(
|
||||
&format!(
|
||||
Variable {
|
||||
name: format!(
|
||||
"{}_token{}",
|
||||
&self.current_variable_name, self.current_variable_token_count
|
||||
),
|
||||
rule.clone(),
|
||||
)
|
||||
kind: VariableType::Auxiliary,
|
||||
rule: rule.clone(),
|
||||
}
|
||||
};
|
||||
|
||||
self.extracted_variables.push(variable);
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
|
|||
use crate::nfa::CharacterSet;
|
||||
use crate::rules::{Alias, AliasMap, Symbol, SymbolType};
|
||||
use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry};
|
||||
use core::ops::Range;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Write;
|
||||
use std::mem::swap;
|
||||
|
|
@ -12,11 +13,17 @@ macro_rules! add {
|
|||
}}
|
||||
}
|
||||
|
||||
macro_rules! add_line {
|
||||
($this: tt, $($arg: tt)*) => {
|
||||
/// Appends one indentation unit to `$this.buffer` for every level recorded in
/// `$this.indent_level`.
///
/// `$this` must expose a `buffer` that can be written through `write!` and an
/// integer `indent_level`.
macro_rules! add_whitespace {
    ($this: tt) => {{
        // Count down from the current depth, emitting one unit per level.
        let mut levels_left = $this.indent_level;
        while levels_left > 0 {
            write!(&mut $this.buffer, " ").unwrap();
            levels_left -= 1;
        }
    }};
}
|
||||
|
||||
macro_rules! add_line {
|
||||
($this: tt, $($arg: tt)*) => {
|
||||
add_whitespace!($this);
|
||||
$this.buffer.write_fmt(format_args!($($arg)*)).unwrap();
|
||||
$this.buffer += "\n";
|
||||
}
|
||||
|
|
@ -162,7 +169,7 @@ impl Generator {
|
|||
}
|
||||
}
|
||||
|
||||
add_line!(self, "#define LANGUAGE_VERSION {}", 6);
|
||||
add_line!(self, "#define LANGUAGE_VERSION {}", 9);
|
||||
add_line!(
|
||||
self,
|
||||
"#define STATE_COUNT {}",
|
||||
|
|
@ -352,7 +359,7 @@ impl Generator {
|
|||
add_line!(
|
||||
self,
|
||||
"ACCEPT_TOKEN({})",
|
||||
self.symbol_ids[&accept_action.symbol]
|
||||
self.symbol_ids[&Symbol::terminal(accept_action)]
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -360,9 +367,10 @@ impl Generator {
|
|||
for (characters, action) in state.advance_actions {
|
||||
let previous_length = self.buffer.len();
|
||||
|
||||
add_whitespace!(self);
|
||||
add!(self, "if (");
|
||||
if self.add_character_set_condition(&characters, &ruled_out_characters) {
|
||||
add!(self, ")");
|
||||
add!(self, ")\n");
|
||||
indent!(self);
|
||||
if action.in_main_token {
|
||||
add_line!(self, "ADVANCE({});", action.state);
|
||||
|
|
@ -370,7 +378,7 @@ impl Generator {
|
|||
add_line!(self, "SKIP({});", action.state);
|
||||
}
|
||||
if let CharacterSet::Include(chars) = characters {
|
||||
ruled_out_characters.extend(chars.iter());
|
||||
ruled_out_characters.extend(chars.iter().map(|c| *c as u32));
|
||||
}
|
||||
dedent!(self);
|
||||
} else {
|
||||
|
|
@ -384,9 +392,106 @@ impl Generator {
|
|||
fn add_character_set_condition(
|
||||
&mut self,
|
||||
characters: &CharacterSet,
|
||||
ruled_out_characters: &HashSet<char>,
|
||||
ruled_out_characters: &HashSet<u32>,
|
||||
) -> bool {
|
||||
true
|
||||
match characters {
|
||||
CharacterSet::Include(chars) => {
|
||||
let ranges = Self::get_ranges(chars, ruled_out_characters);
|
||||
self.add_character_range_conditions(ranges, false)
|
||||
}
|
||||
CharacterSet::Exclude(chars) => {
|
||||
let ranges = Self::get_ranges(chars, ruled_out_characters);
|
||||
self.add_character_range_conditions(ranges, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add_character_range_conditions(
|
||||
&mut self,
|
||||
ranges: impl Iterator<Item = Range<char>>,
|
||||
is_negated: bool,
|
||||
) -> bool {
|
||||
let line_break = "\n ";
|
||||
let mut did_add = false;
|
||||
for range in ranges {
|
||||
if is_negated {
|
||||
if did_add {
|
||||
add!(self, " &&{}", line_break);
|
||||
}
|
||||
if range.end == range.start {
|
||||
add!(self, "lookahead != ");
|
||||
self.add_character(range.start);
|
||||
} else if range.end as u32 == range.start as u32 + 1 {
|
||||
add!(self, "lookahead != ");
|
||||
self.add_character(range.start);
|
||||
add!(self, " &&{}lookahead != ", line_break);
|
||||
self.add_character(range.end);
|
||||
} else {
|
||||
add!(self, "(lookahead < ");
|
||||
self.add_character(range.start);
|
||||
add!(self, " || ");
|
||||
self.add_character(range.end);
|
||||
add!(self, " < lookahead)");
|
||||
}
|
||||
} else {
|
||||
if did_add {
|
||||
add!(self, " ||{}", line_break);
|
||||
}
|
||||
if range.end == range.start {
|
||||
add!(self, "lookahead == ");
|
||||
self.add_character(range.start);
|
||||
} else if range.end as u32 == range.start as u32 + 1 {
|
||||
add!(self, "lookahead == ");
|
||||
self.add_character(range.start);
|
||||
add!(self, " ||{}lookahead == ", line_break);
|
||||
self.add_character(range.end);
|
||||
} else {
|
||||
add!(self, "(");
|
||||
self.add_character(range.start);
|
||||
add!(self, " <= lookahead && lookahead <= ");
|
||||
self.add_character(range.end);
|
||||
add!(self, ")");
|
||||
}
|
||||
}
|
||||
did_add = true;
|
||||
}
|
||||
did_add
|
||||
}
|
||||
|
||||
/// Groups `chars` into ranges, skipping characters that have already been
/// ruled out.
///
/// Each yielded `Range<char>` is used as an *inclusive* span: a lone
/// character `c` is represented as `c..c`. Two characters from `chars` are
/// merged into the same range when every code point strictly between them is
/// in `ruled_out_characters`, because those code points can no longer occur
/// and so do not need to be excluded explicitly.
///
/// `chars` is expected to be in ascending order; characters contained in
/// `ruled_out_characters` are dropped from the output entirely.
///
/// The end of the input is signalled with an explicit `None` marker rather
/// than a `'\0'` sentinel character. A sentinel character is subject to the
/// `ruled_out_characters` filter, so the final pending range was silently
/// lost whenever code point 0 had been ruled out.
fn get_ranges<'a>(
    chars: &'a Vec<char>,
    ruled_out_characters: &'a HashSet<u32>,
) -> impl Iterator<Item = Range<char>> + 'a {
    let mut prev_range: Option<Range<char>> = None;
    chars
        .iter()
        .map(|c| Some(*c))
        // Trailing marker that flushes whatever range is still pending.
        .chain(Some(None))
        .filter_map(move |next| {
            let c = match next {
                Some(c) => c,
                None => return prev_range.take(),
            };

            if ruled_out_characters.contains(&(c as u32)) {
                return None;
            }

            if let Some(range) = prev_range.clone() {
                // Walk the gap between the pending range and `c`. If any code
                // point in the gap is still possible, the pending range must
                // end here and a new one starts at `c`.
                let mut prev_range_successor = range.end as u32 + 1;
                while prev_range_successor < c as u32 {
                    if !ruled_out_characters.contains(&prev_range_successor) {
                        prev_range = Some(c..c);
                        return Some(range);
                    }
                    prev_range_successor += 1;
                }
                // The whole gap is ruled out: extend the pending range to `c`.
                prev_range = Some(range.start..c);
                None
            } else {
                prev_range = Some(c..c);
                None
            }
        })
}
|
||||
|
||||
fn add_lex_modes_list(&mut self) {
|
||||
|
|
@ -577,13 +682,6 @@ impl Generator {
|
|||
alias_sequence_id,
|
||||
..
|
||||
} => {
|
||||
if !self.symbol_ids.contains_key(&symbol) {
|
||||
eprintln!(
|
||||
"SYMBOL: {:?} {:?}",
|
||||
symbol,
|
||||
self.metadata_for_symbol(symbol)
|
||||
);
|
||||
}
|
||||
add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count);
|
||||
if dynamic_precedence != 0 {
|
||||
add!(self, ", .dynamic_precedence = {}", dynamic_precedence);
|
||||
|
|
@ -785,7 +883,7 @@ impl Generator {
|
|||
{
|
||||
result.push(c);
|
||||
} else {
|
||||
result += match c {
|
||||
let replacement = match c {
|
||||
'~' => "TILDE",
|
||||
'`' => "BQUOTE",
|
||||
'!' => "BANG",
|
||||
|
|
@ -821,7 +919,11 @@ impl Generator {
|
|||
'\r' => "CR",
|
||||
'\t' => "TAB",
|
||||
_ => continue,
|
||||
};
|
||||
if !result.is_empty() && !result.ends_with("_") {
|
||||
result.push('_');
|
||||
}
|
||||
result += replacement;
|
||||
}
|
||||
}
|
||||
result
|
||||
|
|
@ -837,6 +939,21 @@ impl Generator {
|
|||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn add_character(&mut self, c: char) {
|
||||
if c.is_ascii() {
|
||||
match c {
|
||||
'\'' => add!(self, "'\\''"),
|
||||
'\\' => add!(self, "'\\\\'"),
|
||||
'\t' => add!(self, "'\\t'"),
|
||||
'\n' => add!(self, "'\\n'"),
|
||||
'\r' => add!(self, "'\\r'"),
|
||||
_ => add!(self, "'{}'", c),
|
||||
}
|
||||
} else {
|
||||
add!(self, "{}", c as u32)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn render_c_code(
|
||||
|
|
@ -867,3 +984,49 @@ pub(crate) fn render_c_code(
|
|||
}
|
||||
.generate()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks how `Generator::get_ranges` groups characters into ranges, with
    /// and without previously ruled-out characters filling the gaps.
    #[test]
    fn test_get_char_ranges() {
        // Each case is (chars, ruled-out chars, expected ranges).
        let cases: Vec<(Vec<char>, Vec<char>, Vec<Range<char>>)> = vec![
            (vec!['a'], vec![], vec!['a'..'a']),
            (
                vec!['a', 'b', 'c', 'e', 'z'],
                vec![],
                vec!['a'..'c', 'e'..'e', 'z'..'z'],
            ),
            (
                vec!['a', 'b', 'c', 'e', 'h', 'z'],
                vec!['d', 'f', 'g'],
                vec!['a'..'h', 'z'..'z'],
            ),
        ];

        for (chars, ruled_out_chars, expected_ranges) in cases.iter() {
            let ruled_out: HashSet<u32> = ruled_out_chars.iter().map(|c| *c as u32).collect();
            let actual_ranges: Vec<Range<char>> =
                Generator::get_ranges(chars, &ruled_out).collect();
            assert_eq!(actual_ranges, *expected_ranges);
        }
    }
}
|
||||
|
|
|
|||
|
|
@ -120,7 +120,10 @@ impl Rule {
|
|||
pub fn seq(rules: Vec<Rule>) -> Self {
|
||||
Rule::Seq(rules)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl Rule {
|
||||
pub fn terminal(index: usize) -> Self {
|
||||
Rule::Symbol(Symbol::terminal(index))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
use crate::nfa::CharacterSet;
|
||||
use crate::rules::{Alias, Associativity, Symbol};
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
|
||||
pub(crate) type AliasSequenceId = usize;
|
||||
pub(crate) type ParseStateId = usize;
|
||||
|
|
@ -50,21 +49,13 @@ pub(crate) struct ParseTable {
|
|||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct AdvanceAction {
|
||||
pub state: LexStateId,
|
||||
pub precedence: Range<i32>,
|
||||
pub in_main_token: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct AcceptTokenAction {
|
||||
pub symbol: Symbol,
|
||||
pub precedence: i32,
|
||||
pub implicit_precedence: i32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub(crate) struct LexState {
|
||||
pub advance_actions: HashMap<CharacterSet, AdvanceAction>,
|
||||
pub accept_action: Option<AcceptTokenAction>,
|
||||
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
|
||||
pub accept_action: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue