Allow non-terminal extras

This commit is contained in:
Max Brunsfeld 2019-10-21 13:31:49 -07:00
parent 49c632ae90
commit fcaabea0cf
8 changed files with 274 additions and 83 deletions

View file

@ -7,7 +7,7 @@ use crate::generate::grammars::{
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{
FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
ProductionInfo, ProductionInfoId,
};
use core::ops::Range;
@ -16,17 +16,19 @@ use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::u32;
// For conflict reporting, each parse state is associated with an example
// sequence of symbols that could lead to that parse state.
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
/// Information about an auxiliary symbol that was encountered on the way to a
/// parse state. Used together with `SymbolSequence` when reporting conflicts
/// (see the comment above: each parse state is associated with an example
/// sequence of symbols that could lead to it).
#[derive(Clone)]
struct AuxiliarySymbolInfo {
    // The auxiliary (generated) symbol itself.
    auxiliary_symbol: Symbol,
    // The non-terminal symbols within which this auxiliary symbol can occur —
    // presumably used to name the conflict context; confirm against callers.
    parent_symbols: Vec<Symbol>,
}
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
struct ParseStateQueueEntry {
state_id: ParseStateId,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
@ -41,6 +43,7 @@ struct ParseTableBuilder<'a> {
state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
non_terminal_extra_states: Vec<(Symbol, usize)>,
parse_table: ParseTable,
}
@ -52,7 +55,7 @@ impl<'a> ParseTableBuilder<'a> {
.push(ProductionInfo::default());
// Add the error state at index 0.
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default(), false);
// Add the starting state at index 1.
self.add_parse_state(
@ -66,8 +69,40 @@ impl<'a> ParseTableBuilder<'a> {
.iter()
.cloned(),
),
false,
);
// Compute the possible item sets for non-terminal extras.
let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
for extra_non_terminal in self
.syntax_grammar
.extra_tokens
.iter()
.filter(|s| s.is_non_terminal())
{
let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
for production in &variable.productions {
non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap())
.or_insert(ParseItemSet::default())
.insert(
ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
},
&[Symbol::end()].iter().cloned().collect(),
);
}
}
// Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
self.non_terminal_extra_states
.push((terminal, self.parse_table.states.len()));
self.add_parse_state(&Vec::new(), &Vec::new(), item_set, true);
}
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
@ -91,9 +126,15 @@ impl<'a> ParseTableBuilder<'a> {
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
is_non_terminal_extra: bool,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
// If an equivalent item set has already been processed, then return
// the existing parse state index.
Entry::Occupied(o) => *o.get(),
// Otherwise, insert a new parse state and add it to the queue of
// parse states to populate.
Entry::Vacant(v) => {
let core = v.key().core();
let core_count = self.core_ids_by_core.len();
@ -116,6 +157,7 @@ impl<'a> ParseTableBuilder<'a> {
terminal_entries: HashMap::new(),
nonterminal_entries: HashMap::new(),
core_id,
is_non_terminal_extra,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
@ -138,7 +180,12 @@ impl<'a> ParseTableBuilder<'a> {
let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new();
// Each item in the item set contributes either a Shift action or a Reduce
// action in this state.
for (item, lookaheads) in &item_set.entries {
// If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let successor = item.successor();
if next_symbol.is_non_terminal() {
@ -160,7 +207,10 @@ impl<'a> ParseTableBuilder<'a> {
.or_insert_with(|| ParseItemSet::default())
.insert(successor, lookaheads);
}
} else {
}
// If the item is finished, then add a Reduce action to this state based
// on this item.
else {
let action = if item.is_augmented() {
ParseAction::Accept
} else {
@ -179,6 +229,10 @@ impl<'a> ParseTableBuilder<'a> {
.terminal_entries
.entry(lookahead);
let entry = entry.or_insert_with(|| ParseTableEntry::new());
// While inserting Reduce actions, eagerly resolve conflicts related
// to precedence: avoid inserting lower-precedence reductions, and
// clear the action list when inserting higher-precedence reductions.
if entry.actions.is_empty() {
entry.actions.push(action);
} else if action.precedence() > entry.actions[0].precedence() {
@ -193,12 +247,16 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// Having computed the successor item sets for each symbol, add a new
// parse state for each of these item sets, and add a corresponding Shift
// action to this state.
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
@ -226,13 +284,19 @@ impl<'a> ParseTableBuilder<'a> {
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
self.parse_table.states[state_id].is_non_terminal_extra,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, next_state_id);
.insert(symbol, GotoAction::Goto(next_state_id));
}
// For any symbol with multiple actions, perform conflict resolution.
// This will either
// * choose one action over the others using precedence or associativity
// * keep multiple actions if this conflict has been whitelisted in the grammar
// * fail, terminating the parser generation process
for symbol in lookaheads_with_conflicts.iter() {
self.handle_conflict(
&item_set,
@ -243,15 +307,50 @@ impl<'a> ParseTableBuilder<'a> {
)?;
}
// Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id];
for extra_token in &self.syntax_grammar.extra_tokens {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
let is_non_terminal_extra = state.is_non_terminal_extra;
let is_end_of_non_terminal_extra =
is_non_terminal_extra && state.terminal_entries.len() == 1;
// Add actions for the start tokens of each non-terminal extra rule.
// These actions are added to every state except for the states that are
// already within non-terminal extras. Non-terminal extras are not allowed
// to nest within each other.
if !is_non_terminal_extra {
for (terminal, state_id) in &self.non_terminal_extra_states {
state
.terminal_entries
.entry(*terminal)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::Shift {
state: *state_id,
is_repetition: false,
}],
});
}
}
// Add ShiftExtra actions for the terminal extra tokens. These actions
// are added to every state except for those at the ends of non-terminal
// extras.
if !is_end_of_non_terminal_extra {
for extra_token in &self.syntax_grammar.extra_tokens {
if extra_token.is_non_terminal() {
state
.nonterminal_entries
.insert(*extra_token, GotoAction::ShiftExtra);
} else {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
}
}
}
Ok(())
@ -362,8 +461,8 @@ impl<'a> ParseTableBuilder<'a> {
}
}
// If all reduce actions are left associative, remove the SHIFT action.
// If all reduce actions are right associative, remove the REDUCE actions.
// If all Reduce actions are left associative, remove the SHIFT action.
// If all Reduce actions are right associative, remove the REDUCE actions.
match (has_left, has_non, has_right) {
(true, false, false) => {
entry.actions.pop();
@ -774,6 +873,7 @@ pub(crate) fn build_parse_table<'a>(
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: HashMap::new(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),

View file

@ -2,7 +2,9 @@ use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol, TokenSet};
use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry};
use crate::generate::tables::{
GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use log::info;
use std::collections::{HashMap, HashSet};
use std::mem;
@ -101,7 +103,10 @@ impl<'a> Minimizer<'a> {
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
state.nonterminal_entries[symbol]
match state.nonterminal_entries.get(symbol) {
Some(GotoAction::Goto(state_id)) => *state_id,
_ => other_state_id,
}
} else {
other_state_id
}
@ -262,18 +267,24 @@ impl<'a> Minimizer<'a> {
for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
}
}
_ => return true,
}
}
}

View file

@ -93,15 +93,7 @@ pub(super) fn extract_tokens(
let mut extra_tokens = Vec::new();
for rule in grammar.extra_tokens {
if let Rule::Symbol(symbol) = rule {
let new_symbol = symbol_replacer.replace_symbol(symbol);
if new_symbol.is_non_terminal() {
return Error::err(format!(
"Non-token symbol '{}' cannot be used as an extra token",
&variables[new_symbol.index].name
));
} else {
extra_tokens.push(new_symbol);
}
extra_tokens.push(symbol_replacer.replace_symbol(symbol));
} else {
if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
extra_tokens.push(Symbol::terminal(index));
@ -472,28 +464,6 @@ mod test {
);
}
#[test]
fn test_error_on_non_terminal_symbol_extras() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::non_terminal(1)),
Variable::named("rule_1", Rule::non_terminal(2)),
Variable::named("rule_2", Rule::string("x")),
]);
grammar.extra_tokens = vec![Rule::non_terminal(1)];
match extract_tokens(grammar) {
Err(e) => {
assert_eq!(
e.message(),
"Non-token symbol 'rule_1' cannot be used as an extra token"
);
}
_ => {
panic!("Expected an error but got no error");
}
}
}
#[test]
fn test_error_on_external_with_same_name_as_non_terminal() {
let mut grammar = build_grammar(vec![

View file

@ -2,7 +2,8 @@ use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
use super::nfa::CharacterSet;
use super::rules::{Alias, AliasMap, Symbol, SymbolType};
use super::tables::{
AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry,
AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
ParseTableEntry,
};
use core::ops::Range;
use std::cmp;
@ -678,7 +679,12 @@ impl Generator {
add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
indent!(self);
for (i, state) in self.parse_table.states.iter().enumerate() {
if state.external_lex_state_id > 0 {
if state.is_non_terminal_extra
&& state.terminal_entries.len() == 1
&& *state.terminal_entries.iter().next().unwrap().0 == Symbol::end()
{
add_line!(self, "[{}] = {{-1}},", i,);
} else if state.external_lex_state_id > 0 {
add_line!(
self,
"[{}] = {{.lex_state = {}, .external_lex_state = {}}},",
@ -807,12 +813,15 @@ impl Generator {
terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
nonterminal_entries.sort_unstable_by_key(|k| k.0);
for (symbol, state_id) in &nonterminal_entries {
for (symbol, action) in &nonterminal_entries {
add_line!(
self,
"[{}] = STATE({}),",
self.symbol_ids[symbol],
*state_id
match action {
GotoAction::Goto(state) => *state,
GotoAction::ShiftExtra => i,
}
);
}
@ -865,9 +874,15 @@ impl Generator {
.or_default()
.push(**symbol);
}
for (symbol, state_id) in &state.nonterminal_entries {
for (symbol, action) in &state.nonterminal_entries {
let state_id = match action {
GotoAction::Goto(i) => *i,
GotoAction::ShiftExtra => {
self.large_state_count + small_state_indices.len() - 1
}
};
symbols_by_value
.entry((*state_id, SymbolType::NonTerminal))
.entry((state_id, SymbolType::NonTerminal))
.or_default()
.push(*symbol);
}

View file

@ -24,6 +24,12 @@ pub(crate) enum ParseAction {
},
}
/// The entry stored in the non-terminal (goto) portion of a parse state:
/// what to do after a given non-terminal has just been completed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GotoAction {
    /// Transition to the given parse state.
    Goto(ParseStateId),
    /// The completed non-terminal is one of the grammar's `extra` rules;
    /// shift it while remaining in the current state (the code generator
    /// emits the current state's own index for this entry).
    ShiftExtra,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseTableEntry {
pub actions: Vec<ParseAction>,
@ -34,10 +40,11 @@ pub(crate) struct ParseTableEntry {
/// A single state of the generated LR parse table.
pub(crate) struct ParseState {
    // Index of this state within the parse table.
    pub id: ParseStateId,
    // Actions keyed by lookahead terminal symbol.
    pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
    // NOTE(review): diff residue — the next line is the pre-change type of
    // this field; the line after it is the current definition.
    pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
    // Goto actions keyed by completed non-terminal symbol.
    pub nonterminal_entries: HashMap<Symbol, GotoAction>,
    // The lexer state to use when in this parse state.
    pub lex_state_id: usize,
    // The external-scanner state to use, if any (0 means none — TODO confirm).
    pub external_lex_state_id: usize,
    // Identifier of this state's item-set core, used for state deduplication.
    pub core_id: usize,
    // True if this state lies within a non-terminal `extra` rule; such states
    // must not nest further non-terminal extras.
    pub is_non_terminal_extra: bool,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
@ -103,7 +110,13 @@ impl ParseState {
_ => None,
})
})
.chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
.chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
if let GotoAction::Goto(state) = action {
Some(*state)
} else {
None
}
}))
}
pub fn update_referenced_states<F>(&mut self, mut f: F)
@ -121,15 +134,18 @@ impl ParseState {
}
}
}
for (symbol, other_state) in &self.nonterminal_entries {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
for (symbol, action) in &self.nonterminal_entries {
if let GotoAction::Goto(other_state) = action {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
}
}
}
for (symbol, action_index, new_state) in updates {
if symbol.is_non_terminal() {
self.nonterminal_entries.insert(symbol, new_state);
self.nonterminal_entries
.insert(symbol, GotoAction::Goto(new_state));
} else {
let entry = self.terminal_entries.get_mut(&symbol).unwrap();
if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {

View file

@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
uint32_t count,
int dynamic_precedence,
uint16_t production_id,
bool fragile
bool is_fragile,
bool is_extra
) {
uint32_t initial_version_count = ts_stack_version_count(self->stack);
uint32_t removed_version_count = 0;
@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
TSStateId state = ts_stack_state(self->stack, slice_version);
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
if (fragile || pop.size > 1 || initial_version_count > 1) {
if (is_extra) parent.ptr->extra = true;
if (is_fragile || pop.size > 1 || initial_version_count > 1) {
parent.ptr->fragile_left = true;
parent.ptr->fragile_right = true;
parent.ptr->parse_state = TS_TREE_STATE_NONE;
@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
reduction_version = ts_parser__reduce(
self, version, action.symbol, action.count,
action.dynamic_precedence, action.production_id,
true
true, false
);
}
@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
// Otherwise, re-run the lexer.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
if (lookahead.ptr) {
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
}
// When parsing a non-terminal extra, a null lookahead indicates the
// end of the rule. The reduction is stored in the EOF table entry.
// After the reduction, the lexer needs to be run again.
else {
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
}
}
for (;;) {
@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
case TSParseActionTypeReduce: {
bool is_fragile = table_entry.action_count > 1;
bool is_extra = lookahead.ptr == NULL;
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
StackVersion reduction_version = ts_parser__reduce(
self, version, action.params.symbol, action.params.child_count,
action.params.dynamic_precedence, action.params.production_id,
is_fragile
is_fragile, is_extra
);
if (reduction_version != STACK_VERSION_NONE) {
last_reduction_version = reduction_version;
@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
ts_stack_renumber_version(self->stack, last_reduction_version, version);
LOG_STACK();
state = ts_stack_state(self->stack, version);
// At the end of a non-terminal extra rule, the lexer will return a
// null subtree, because the parser needs to perform a fixed reduction
// regardless of the lookahead node. After performing that reduction,
// (and completing the non-terminal extra rule) run the lexer again based
// on the current parse state.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
}
ts_language_table_entry(
self->language,
state,

View file

@ -0,0 +1,22 @@
==============
No extras
==============
a b c d
---
(module)
==============
Extras
==============
a (one) b (two) (three) c d
---
(module
(comment)
(comment)
(comment))

View file

@ -0,0 +1,35 @@
{
"name": "extra_non_terminals",
"extras": [
{"type": "PATTERN", "value": "\\s"},
{"type": "SYMBOL", "name": "comment"}
],
"rules": {
"module": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "a"},
{"type": "STRING", "value": "b"},
{"type": "STRING", "value": "c"},
{"type": "STRING", "value": "d"}
]
},
"comment": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{
"type": "REPEAT",
"content": {
"type": "PATTERN",
"value": "[a-z]+"
}
},
{"type": "STRING", "value": ")"}
]
}
}
}