Merge pull request #469 from tree-sitter/non-terminal-extras
Allow non-terminal extras
commit 6cd82574a3
17 changed files with 304 additions and 113 deletions
@@ -7,7 +7,7 @@ use crate::generate::grammars::{
 use crate::generate::node_types::VariableInfo;
 use crate::generate::rules::{Associativity, Symbol, SymbolType, TokenSet};
 use crate::generate::tables::{
-    FieldLocation, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+    FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
     ProductionInfo, ProductionInfoId,
 };
 use core::ops::Range;
@@ -16,17 +16,19 @@ use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::fmt::Write;
 use std::u32;
 
+// For conflict reporting, each parse state is associated with an example
+// sequence of symbols that could lead to that parse state.
+type SymbolSequence = Vec<Symbol>;
+
+type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
+pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
+
 #[derive(Clone)]
 struct AuxiliarySymbolInfo {
     auxiliary_symbol: Symbol,
     parent_symbols: Vec<Symbol>,
 }
 
-type SymbolSequence = Vec<Symbol>;
-type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
-
-pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
-
 struct ParseStateQueueEntry {
     state_id: ParseStateId,
     preceding_auxiliary_symbols: AuxiliarySymbolSequence,
@@ -41,6 +43,7 @@ struct ParseTableBuilder<'a> {
     state_ids_by_item_set: HashMap<ParseItemSet<'a>, ParseStateId>,
     parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
     parse_state_queue: VecDeque<ParseStateQueueEntry>,
+    non_terminal_extra_states: Vec<(Symbol, usize)>,
     parse_table: ParseTable,
 }
 
@@ -52,7 +55,7 @@ impl<'a> ParseTableBuilder<'a> {
             .push(ProductionInfo::default());
 
         // Add the error state at index 0.
-        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
+        self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default(), false);
 
         // Add the starting state at index 1.
         self.add_parse_state(
@@ -66,8 +69,40 @@ impl<'a> ParseTableBuilder<'a> {
                     .iter()
                     .cloned(),
             ),
+            false,
         );
 
+        // Compute the possible item sets for non-terminal extras.
+        let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
+        for extra_non_terminal in self
+            .syntax_grammar
+            .extra_symbols
+            .iter()
+            .filter(|s| s.is_non_terminal())
+        {
+            let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
+            for production in &variable.productions {
+                non_terminal_extra_item_sets_by_first_terminal
+                    .entry(production.first_symbol().unwrap())
+                    .or_insert(ParseItemSet::default())
+                    .insert(
+                        ParseItem {
+                            variable_index: extra_non_terminal.index as u32,
+                            production,
+                            step_index: 1,
+                        },
+                        &[Symbol::end()].iter().cloned().collect(),
+                    );
+            }
+        }
+
+        // Add a state for each starting terminal of a non-terminal extra rule.
+        for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
+            self.non_terminal_extra_states
+                .push((terminal, self.parse_table.states.len()));
+            self.add_parse_state(&Vec::new(), &Vec::new(), item_set, true);
+        }
+
         while let Some(entry) = self.parse_state_queue.pop_front() {
            let item_set = self
                .item_set_builder
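The builder above precomputes, for every non-terminal listed in the grammar's extras, one item set per distinct starting terminal, so that a dedicated parse state can be created for each entry point. A minimal, self-contained sketch of that grouping pattern, using simplified stand-in types rather than the generator's real `ParseItem`/`ParseItemSet`:

    use std::collections::BTreeMap;

    // Simplified stand-ins for the generator's types (hypothetical).
    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
    struct Symbol(usize);

    struct Production {
        steps: Vec<Symbol>,
    }

    fn main() {
        // Two productions of a non-terminal extra such as `comment`; both
        // begin with the same "(" terminal, so they share one entry state.
        let productions = vec![
            Production { steps: vec![Symbol(10), Symbol(11), Symbol(12)] },
            Production { steps: vec![Symbol(10), Symbol(12)] },
        ];

        // Mirrors `non_terminal_extra_item_sets_by_first_terminal` above:
        // bucket each production under the terminal that can begin it.
        let mut by_first_terminal: BTreeMap<Symbol, Vec<&Production>> = BTreeMap::new();
        for production in &productions {
            let first = *production.steps.first().unwrap();
            by_first_terminal.entry(first).or_insert_with(Vec::new).push(production);
        }

        // One dedicated parse state would be added per key of this map.
        assert_eq!(by_first_terminal.len(), 1);
    }

One plausible reason the real code uses a `BTreeMap` rather than a `HashMap` is its sorted, deterministic iteration order, which keeps the numbering of the generated states stable across runs.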
@@ -91,9 +126,15 @@ impl<'a> ParseTableBuilder<'a> {
         preceding_symbols: &SymbolSequence,
         preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
         item_set: ParseItemSet<'a>,
+        is_non_terminal_extra: bool,
     ) -> ParseStateId {
         match self.state_ids_by_item_set.entry(item_set) {
+            // If an equivalent item set has already been processed, then return
+            // the existing parse state index.
             Entry::Occupied(o) => *o.get(),
+
+            // Otherwise, insert a new parse state and add it to the queue of
+            // parse states to populate.
             Entry::Vacant(v) => {
                 let core = v.key().core();
                 let core_count = self.core_ids_by_core.len();
@@ -116,6 +157,7 @@ impl<'a> ParseTableBuilder<'a> {
                     terminal_entries: HashMap::new(),
                     nonterminal_entries: HashMap::new(),
                     core_id,
+                    is_non_terminal_extra,
                 });
                 self.parse_state_queue.push_back(ParseStateQueueEntry {
                     state_id,
@@ -138,7 +180,12 @@ impl<'a> ParseTableBuilder<'a> {
         let mut non_terminal_successors = BTreeMap::new();
         let mut lookaheads_with_conflicts = TokenSet::new();
 
+        // Each item in the item set contributes to either a Shift action or a
+        // Reduce action in this state.
         for (item, lookaheads) in &item_set.entries {
+            // If the item is unfinished, then this state has a transition for the item's
+            // next symbol. Advance the item to its next step and insert the resulting
+            // item into the successor item set.
             if let Some(next_symbol) = item.symbol() {
                 let successor = item.successor();
                 if next_symbol.is_non_terminal() {
@@ -160,7 +207,10 @@ impl<'a> ParseTableBuilder<'a> {
                         .or_insert_with(|| ParseItemSet::default())
                         .insert(successor, lookaheads);
                 }
-            } else {
+            }
+            // If the item is finished, then add a Reduce action to this state based
+            // on this item.
+            else {
                 let action = if item.is_augmented() {
                     ParseAction::Accept
                 } else {
@@ -179,6 +229,10 @@ impl<'a> ParseTableBuilder<'a> {
                         .terminal_entries
                         .entry(lookahead);
                     let entry = entry.or_insert_with(|| ParseTableEntry::new());
+
+                    // While inserting Reduce actions, eagerly resolve conflicts related
+                    // to precedence: avoid inserting lower-precedence reductions, and
+                    // clear the action list when inserting higher-precedence reductions.
                     if entry.actions.is_empty() {
                         entry.actions.push(action);
                     } else if action.precedence() > entry.actions[0].precedence() {
@@ -193,12 +247,16 @@ impl<'a> ParseTableBuilder<'a> {
                 }
             }
 
+            // Having computed the successor item sets for each symbol, add a new
+            // parse state for each of these item sets, and add a corresponding Shift
+            // action to this state.
             for (symbol, next_item_set) in terminal_successors {
                 preceding_symbols.push(symbol);
                 let next_state_id = self.add_parse_state(
                     &preceding_symbols,
                     &preceding_auxiliary_symbols,
                     next_item_set,
+                    self.parse_table.states[state_id].is_non_terminal_extra,
                 );
                 preceding_symbols.pop();
 
@@ -226,13 +284,19 @@ impl<'a> ParseTableBuilder<'a> {
                     &preceding_symbols,
                     &preceding_auxiliary_symbols,
                     next_item_set,
+                    self.parse_table.states[state_id].is_non_terminal_extra,
                 );
                 preceding_symbols.pop();
                 self.parse_table.states[state_id]
                     .nonterminal_entries
-                    .insert(symbol, next_state_id);
+                    .insert(symbol, GotoAction::Goto(next_state_id));
             }
 
+            // For any symbol with multiple actions, perform conflict resolution.
+            // This will either
+            // * choose one action over the others using precedence or associativity
+            // * keep multiple actions if this conflict has been whitelisted in the grammar
+            // * fail, terminating the parser generation process
             for symbol in lookaheads_with_conflicts.iter() {
                 self.handle_conflict(
                     &item_set,
@@ -243,15 +307,50 @@ impl<'a> ParseTableBuilder<'a> {
             )?;
         }
 
         // Finally, add actions for the grammar's `extra` symbols.
         let state = &mut self.parse_table.states[state_id];
-        for extra_token in &self.syntax_grammar.extra_tokens {
-            state
-                .terminal_entries
-                .entry(*extra_token)
-                .or_insert(ParseTableEntry {
-                    reusable: true,
-                    actions: vec![ParseAction::ShiftExtra],
-                });
-        }
+        let is_non_terminal_extra = state.is_non_terminal_extra;
+        let is_end_of_non_terminal_extra =
+            is_non_terminal_extra && state.terminal_entries.len() == 1;
+
+        // Add actions for the start tokens of each non-terminal extra rule.
+        // These actions are added to every state except for the states that are
+        // already within non-terminal extras. Non-terminal extras are not allowed
+        // to nest within each other.
+        if !is_non_terminal_extra {
+            for (terminal, state_id) in &self.non_terminal_extra_states {
+                state
+                    .terminal_entries
+                    .entry(*terminal)
+                    .or_insert(ParseTableEntry {
+                        reusable: true,
+                        actions: vec![ParseAction::Shift {
+                            state: *state_id,
+                            is_repetition: false,
+                        }],
+                    });
+            }
+        }
+
+        // Add ShiftExtra actions for the terminal extra tokens. These actions
+        // are added to every state except for those at the ends of non-terminal
+        // extras.
+        if !is_end_of_non_terminal_extra {
+            for extra_token in &self.syntax_grammar.extra_symbols {
+                if extra_token.is_non_terminal() {
+                    state
+                        .nonterminal_entries
+                        .insert(*extra_token, GotoAction::ShiftExtra);
+                } else {
+                    state
+                        .terminal_entries
+                        .entry(*extra_token)
+                        .or_insert(ParseTableEntry {
+                            reusable: true,
+                            actions: vec![ParseAction::ShiftExtra],
+                        });
+                }
+            }
+        }
 
         Ok(())
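Two details in the block above are worth unpacking. A state "at the end of a non-terminal extra" is detected by `state.terminal_entries.len() == 1`: the only lookahead left there is the end-of-rule marker, so neither extra tokens nor the start tokens of other extras may be shifted, which is also how nesting of extras is ruled out. And the extras split two ways: a terminal extra becomes a `ShiftExtra` parse action keyed by its token, while a non-terminal extra becomes a `GotoAction::ShiftExtra` in the goto section; `entry(...).or_insert(...)` ensures an action the grammar itself defined for a token is never overwritten. A toy restatement of that dispatch over simplified stand-in types (not the generator's own):

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    enum Symbol { Terminal(usize), NonTerminal(usize) }

    #[derive(Debug, PartialEq)]
    enum GotoAction { Goto(usize), ShiftExtra }

    #[derive(Default, Debug)]
    struct State {
        terminal_entries: HashMap<Symbol, &'static str>, // action lists, simplified
        nonterminal_entries: HashMap<Symbol, GotoAction>,
    }

    // Mirror of the dispatch above: terminal extras become ShiftExtra parse
    // actions; non-terminal extras become ShiftExtra goto actions.
    fn add_extras(state: &mut State, extras: &[Symbol]) {
        for extra in extras {
            match extra {
                Symbol::Terminal(_) => {
                    state.terminal_entries.entry(*extra).or_insert("ShiftExtra");
                }
                Symbol::NonTerminal(_) => {
                    state.nonterminal_entries.insert(*extra, GotoAction::ShiftExtra);
                }
            }
        }
    }

    fn main() {
        let mut state = State::default();
        add_extras(&mut state, &[Symbol::Terminal(3), Symbol::NonTerminal(7)]);
        assert_eq!(state.terminal_entries[&Symbol::Terminal(3)], "ShiftExtra");
        assert_eq!(state.nonterminal_entries[&Symbol::NonTerminal(7)], GotoAction::ShiftExtra);
    }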
@@ -362,8 +461,8 @@ impl<'a> ParseTableBuilder<'a> {
             }
         }
 
-        // If all reduce actions are left associative, remove the SHIFT action.
-        // If all reduce actions are right associative, remove the REDUCE actions.
+        // If all Reduce actions are left associative, remove the SHIFT action.
+        // If all Reduce actions are right associative, remove the REDUCE actions.
         match (has_left, has_non, has_right) {
             (true, false, false) => {
                 entry.actions.pop();
@@ -744,7 +843,7 @@ fn populate_following_tokens(
             }
         }
     }
-    for extra in &grammar.extra_tokens {
+    for extra in &grammar.extra_symbols {
        if extra.is_terminal() {
            for entry in result.iter_mut() {
                entry.insert(*extra);
@@ -774,6 +873,7 @@ pub(crate) fn build_parse_table<'a>(
         lexical_grammar,
         item_set_builder,
         variable_info,
+        non_terminal_extra_states: Vec::new(),
         state_ids_by_item_set: HashMap::new(),
         core_ids_by_core: HashMap::new(),
         parse_state_info_by_id: Vec::new(),
@@ -2,7 +2,9 @@ use super::token_conflicts::TokenConflictMap;
 use crate::generate::dedup::split_state_id_groups;
 use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
 use crate::generate::rules::{AliasMap, Symbol, TokenSet};
-use crate::generate::tables::{ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry};
+use crate::generate::tables::{
+    GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
+};
 use log::info;
 use std::collections::{HashMap, HashSet};
 use std::mem;
@@ -101,7 +103,10 @@ impl<'a> Minimizer<'a> {
             state.update_referenced_states(|other_state_id, state| {
                 if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
                     done = false;
-                    state.nonterminal_entries[symbol]
+                    match state.nonterminal_entries.get(symbol) {
+                        Some(GotoAction::Goto(state_id)) => *state_id,
+                        _ => other_state_id,
+                    }
                 } else {
                     other_state_id
                 }
@@ -262,18 +267,24 @@ impl<'a> Minimizer<'a> {
 
         for (symbol, s1) in &state1.nonterminal_entries {
             if let Some(s2) = state2.nonterminal_entries.get(symbol) {
-                let group1 = group_ids_by_state_id[*s1];
-                let group2 = group_ids_by_state_id[*s2];
-                if group1 != group2 {
-                    info!(
-                        "split states {} {} - successors for {} are split: {} {}",
-                        state1.id,
-                        state2.id,
-                        self.symbol_name(symbol),
-                        s1,
-                        s2,
-                    );
-                    return true;
-                }
+                match (s1, s2) {
+                    (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
+                    (GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
+                        let group1 = group_ids_by_state_id[*s1];
+                        let group2 = group_ids_by_state_id[*s2];
+                        if group1 != group2 {
+                            info!(
+                                "split states {} {} - successors for {} are split: {} {}",
+                                state1.id,
+                                state2.id,
+                                self.symbol_name(symbol),
+                                s1,
+                                s2,
+                            );
+                            return true;
+                        }
+                    }
+                    _ => return true,
+                }
             }
         }
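The rewritten comparison above encodes when two states' goto entries are compatible during table minimization: two `ShiftExtra` entries always agree, two `Goto` entries agree only if their target states currently belong to the same group, and a mixed pair forces a split. A compact sketch of that rule, where the local `GotoAction` is a stand-in and `group_of` plays the role of `group_ids_by_state_id`:

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum GotoAction {
        Goto(usize),
        ShiftExtra,
    }

    // Sketch of the minimizer's compatibility rule for one non-terminal entry.
    fn entries_compatible(a: GotoAction, b: GotoAction, group_of: &[usize]) -> bool {
        match (a, b) {
            // Both states shift the extra and stay put: always compatible.
            (GotoAction::ShiftExtra, GotoAction::ShiftExtra) => true,
            // Two gotos are compatible only if their targets share a group.
            (GotoAction::Goto(s1), GotoAction::Goto(s2)) => group_of[s1] == group_of[s2],
            // A goto paired with a ShiftExtra can never be merged.
            _ => false,
        }
    }

    fn main() {
        let group_of = vec![0, 1, 1];
        assert!(entries_compatible(GotoAction::Goto(1), GotoAction::Goto(2), &group_of));
        assert!(!entries_compatible(GotoAction::Goto(0), GotoAction::ShiftExtra, &group_of));
    }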
@@ -23,7 +23,7 @@ pub(crate) struct Variable {
 pub(crate) struct InputGrammar {
     pub name: String,
     pub variables: Vec<Variable>,
-    pub extra_tokens: Vec<Rule>,
+    pub extra_symbols: Vec<Rule>,
     pub expected_conflicts: Vec<Vec<String>>,
     pub external_tokens: Vec<Rule>,
     pub variables_to_inline: Vec<String>,
@@ -87,7 +87,7 @@ pub(crate) struct ExternalToken {
 #[derive(Debug, Default)]
 pub(crate) struct SyntaxGrammar {
     pub variables: Vec<SyntaxVariable>,
-    pub extra_tokens: Vec<Symbol>,
+    pub extra_symbols: Vec<Symbol>,
     pub expected_conflicts: Vec<Vec<Symbol>>,
     pub external_tokens: Vec<ExternalToken>,
     pub supertype_symbols: Vec<Symbol>,
@@ -689,7 +689,7 @@ mod tests {
     fn test_node_types_simple() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -775,7 +775,7 @@ mod tests {
     fn test_node_types_with_supertypes() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -862,7 +862,7 @@ mod tests {
     fn test_node_types_for_children_without_fields() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -960,7 +960,7 @@ mod tests {
     fn test_node_types_for_aliased_nodes() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -1036,7 +1036,7 @@ mod tests {
     fn test_node_types_with_multiple_valued_fields() {
         let node_types = get_node_types(InputGrammar {
             name: String::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -87,7 +87,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
         })
     }
 
-    let extra_tokens = grammar_json
+    let extra_symbols = grammar_json
         .extras
         .unwrap_or(Vec::new())
         .into_iter()
@@ -107,7 +107,7 @@ pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
         name: grammar_json.name,
         word_token: grammar_json.word,
         variables,
-        extra_tokens,
+        extra_symbols,
         expected_conflicts,
         external_tokens,
         supertype_symbols,
@@ -283,7 +283,7 @@ mod tests {
     fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
         ExtractedSyntaxGrammar {
             variables,
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -146,7 +146,7 @@ mod tests {
                 }],
             },
         ],
-        extra_tokens: Vec::new(),
+        extra_symbols: Vec::new(),
         expected_conflicts: Vec::new(),
         variables_to_inline: Vec::new(),
         supertype_symbols: Vec::new(),
@@ -90,21 +90,13 @@ pub(super) fn extract_tokens(
         .collect();
 
     let mut separators = Vec::new();
-    let mut extra_tokens = Vec::new();
-    for rule in grammar.extra_tokens {
+    let mut extra_symbols = Vec::new();
+    for rule in grammar.extra_symbols {
         if let Rule::Symbol(symbol) = rule {
-            let new_symbol = symbol_replacer.replace_symbol(symbol);
-            if new_symbol.is_non_terminal() {
-                return Error::err(format!(
-                    "Non-token symbol '{}' cannot be used as an extra token",
-                    &variables[new_symbol.index].name
-                ));
-            } else {
-                extra_tokens.push(new_symbol);
-            }
+            extra_symbols.push(symbol_replacer.replace_symbol(symbol));
         } else {
             if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
-                extra_tokens.push(Symbol::terminal(index));
+                extra_symbols.push(Symbol::terminal(index));
             } else {
                 separators.push(rule);
             }
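The simplification above is the heart of the grammar-side change: previously a non-terminal symbol in `extras` was rejected here with an error, whereas now any symbol rule passes straight through, and only non-symbol rules are either matched against existing lexical variables or demoted to separators. A hedged sketch of that classification, with illustrative types that are not the real `Rule`/`Symbol`:

    // Illustrative, simplified types (not the generator's own).
    #[derive(Debug, PartialEq)]
    enum Rule {
        Symbol(usize),
        String(&'static str),
    }

    #[derive(Debug, PartialEq)]
    enum Extra {
        Symbol(usize),           // passed through, terminal or non-terminal
        Separator(&'static str), // anonymous rule, skipped between tokens
    }

    // Mirror of the loop above, minus the lexical-variable lookup: symbol
    // rules are kept as extra symbols, anything else becomes a separator.
    fn classify(rule: Rule) -> Extra {
        match rule {
            Rule::Symbol(index) => Extra::Symbol(index),
            Rule::String(text) => Extra::Separator(text),
        }
    }

    fn main() {
        assert_eq!(classify(Rule::Symbol(1)), Extra::Symbol(1));
        assert_eq!(classify(Rule::String(" ")), Extra::Separator(" "));
    }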
@@ -158,7 +150,7 @@ pub(super) fn extract_tokens(
     ExtractedSyntaxGrammar {
         variables,
         expected_conflicts,
-        extra_tokens,
+        extra_symbols,
         variables_to_inline,
         supertype_symbols,
         external_tokens,
@@ -415,15 +407,15 @@ mod test {
     }
 
     #[test]
-    fn test_extracting_extra_tokens() {
+    fn test_extracting_extra_symbols() {
         let mut grammar = build_grammar(vec![
             Variable::named("rule_0", Rule::string("x")),
             Variable::named("comment", Rule::pattern("//.*")),
         ]);
-        grammar.extra_tokens = vec![Rule::string(" "), Rule::non_terminal(1)];
+        grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
 
         let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
-        assert_eq!(syntax_grammar.extra_tokens, vec![Symbol::terminal(1),]);
+        assert_eq!(syntax_grammar.extra_symbols, vec![Symbol::terminal(1),]);
         assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
     }
 
@@ -472,28 +464,6 @@ mod test {
         );
     }
 
-    #[test]
-    fn test_error_on_non_terminal_symbol_extras() {
-        let mut grammar = build_grammar(vec![
-            Variable::named("rule_0", Rule::non_terminal(1)),
-            Variable::named("rule_1", Rule::non_terminal(2)),
-            Variable::named("rule_2", Rule::string("x")),
-        ]);
-        grammar.extra_tokens = vec![Rule::non_terminal(1)];
-
-        match extract_tokens(grammar) {
-            Err(e) => {
-                assert_eq!(
-                    e.message(),
-                    "Non-token symbol 'rule_1' cannot be used as an extra token"
-                );
-            }
-            _ => {
-                panic!("Expected an error but got no error");
-            }
-        }
-    }
-
     #[test]
     fn test_error_on_external_with_same_name_as_non_terminal() {
         let mut grammar = build_grammar(vec![
@@ -522,7 +492,7 @@ mod test {
     fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
         InternedGrammar {
             variables,
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -199,7 +199,7 @@ unless they are used only as the grammar's start rule.
         }
     }
     Ok(SyntaxGrammar {
-        extra_tokens: grammar.extra_tokens,
+        extra_symbols: grammar.extra_symbols,
         expected_conflicts: grammar.expected_conflicts,
         variables_to_inline: grammar.variables_to_inline,
         external_tokens: grammar.external_tokens,
@@ -30,9 +30,9 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
         external_tokens.push(Variable { name, kind, rule });
     }
 
-    let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len());
-    for extra_token in grammar.extra_tokens.iter() {
-        extra_tokens.push(interner.intern_rule(extra_token)?);
+    let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
+    for extra_token in grammar.extra_symbols.iter() {
+        extra_symbols.push(interner.intern_rule(extra_token)?);
     }
 
     let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
@@ -76,7 +76,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar>
     Ok(InternedGrammar {
         variables,
         external_tokens,
-        extra_tokens,
+        extra_symbols,
         expected_conflicts,
         variables_to_inline,
         supertype_symbols,
@@ -236,7 +236,7 @@ mod tests {
         InputGrammar {
             variables,
             name: "the_language".to_string(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             expected_conflicts: Vec::new(),
             variables_to_inline: Vec::new(),
@@ -21,7 +21,7 @@ use crate::generate::rules::{AliasMap, Rule, Symbol};
 
 pub(crate) struct IntermediateGrammar<T, U> {
     variables: Vec<Variable>,
-    extra_tokens: Vec<T>,
+    extra_symbols: Vec<T>,
     expected_conflicts: Vec<Vec<Symbol>>,
     external_tokens: Vec<U>,
     variables_to_inline: Vec<Symbol>,
@@ -196,7 +196,7 @@ mod tests {
     fn test_basic_inlining() {
         let grammar = SyntaxGrammar {
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -327,7 +327,7 @@ mod tests {
                 Symbol::non_terminal(3),
             ],
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -429,7 +429,7 @@ mod tests {
                 },
             ],
             expected_conflicts: Vec::new(),
-            extra_tokens: Vec::new(),
+            extra_symbols: Vec::new(),
             external_tokens: Vec::new(),
             supertype_symbols: Vec::new(),
             word_token: None,
@@ -2,7 +2,8 @@ use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType
 use super::nfa::CharacterSet;
 use super::rules::{Alias, AliasMap, Symbol, SymbolType};
 use super::tables::{
-    AdvanceAction, FieldLocation, LexState, LexTable, ParseAction, ParseTable, ParseTableEntry,
+    AdvanceAction, FieldLocation, GotoAction, LexState, LexTable, ParseAction, ParseTable,
+    ParseTableEntry,
 };
 use core::ops::Range;
 use std::cmp;
@@ -678,7 +679,12 @@ impl Generator {
         add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{");
         indent!(self);
         for (i, state) in self.parse_table.states.iter().enumerate() {
-            if state.external_lex_state_id > 0 {
+            if state.is_non_terminal_extra
+                && state.terminal_entries.len() == 1
+                && *state.terminal_entries.iter().next().unwrap().0 == Symbol::end()
+            {
+                add_line!(self, "[{}] = {{-1}},", i,);
+            } else if state.external_lex_state_id > 0 {
                 add_line!(
                     self,
                     "[{}] = {{.lex_state = {}, .external_lex_state = {}}},",
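The new first branch above emits a lex mode of `{-1}` for a state that is inside a non-terminal extra and whose only remaining terminal entry is the end-of-input symbol: in such a state there is nothing left to lex, only a fixed reduction to perform. The runtime side of this contract appears later in this diff, where `ts_parser__lex` returns `NULL_SUBTREE` for lex state `(uint16_t)-1`. A toy model of both sides of the contract, with assumed names and simplified types:

    // Rendered as {-1} in the generated C tables (u16::MAX here).
    const NO_LEX_STATE: u16 = u16::MAX;

    // Generator side: pick a lex mode for a state (sketch).
    fn lex_mode(is_non_terminal_extra: bool, only_lookahead_is_eof: bool, normal: u16) -> u16 {
        if is_non_terminal_extra && only_lookahead_is_eof {
            NO_LEX_STATE // the parser must reduce here, not read a token
        } else {
            normal
        }
    }

    // Runtime side: mirrors `if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;`
    fn lex(lex_state: u16) -> Option<&'static str> {
        if lex_state == NO_LEX_STATE {
            None // null subtree: signals the end of a non-terminal extra
        } else {
            Some("token")
        }
    }

    fn main() {
        assert_eq!(lex(lex_mode(true, true, 7)), None);
        assert_eq!(lex(lex_mode(false, false, 7)), Some("token"));
    }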
@@ -807,12 +813,15 @@ impl Generator {
             terminal_entries.sort_unstable_by_key(|e| self.symbol_order.get(e.0));
             nonterminal_entries.sort_unstable_by_key(|k| k.0);
 
-            for (symbol, state_id) in &nonterminal_entries {
+            for (symbol, action) in &nonterminal_entries {
                 add_line!(
                     self,
                     "[{}] = STATE({}),",
                     self.symbol_ids[symbol],
-                    *state_id
+                    match action {
+                        GotoAction::Goto(state) => *state,
+                        GotoAction::ShiftExtra => i,
+                    }
                 );
             }
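One design choice above deserves a note: a `ShiftExtra` goto is rendered as `STATE(i)`, a transition back to the very state being emitted. That is what makes a non-terminal extra transparent to the surrounding parse: once the extra is reduced, the goto leaves the parser exactly where it stood. A minimal sketch of the resolution, with a local stand-in enum:

    #[derive(Clone, Copy)]
    enum GotoAction {
        Goto(usize),
        ShiftExtra,
    }

    // Mirrors the match in the generator: ShiftExtra entries point back at
    // the current state, so reducing a non-terminal extra changes nothing.
    fn rendered_goto_target(action: GotoAction, current_state: usize) -> usize {
        match action {
            GotoAction::Goto(state) => state,
            GotoAction::ShiftExtra => current_state,
        }
    }

    fn main() {
        assert_eq!(rendered_goto_target(GotoAction::Goto(9), 4), 9);
        assert_eq!(rendered_goto_target(GotoAction::ShiftExtra, 4), 4);
    }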
@@ -865,9 +874,15 @@ impl Generator {
                     .or_default()
                     .push(**symbol);
             }
-            for (symbol, state_id) in &state.nonterminal_entries {
+            for (symbol, action) in &state.nonterminal_entries {
+                let state_id = match action {
+                    GotoAction::Goto(i) => *i,
+                    GotoAction::ShiftExtra => {
+                        self.large_state_count + small_state_indices.len() - 1
+                    }
+                };
                 symbols_by_value
-                    .entry((*state_id, SymbolType::NonTerminal))
+                    .entry((state_id, SymbolType::NonTerminal))
                     .or_default()
                     .push(*symbol);
             }
@@ -24,6 +24,12 @@ pub(crate) enum ParseAction {
     },
 }
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum GotoAction {
+    Goto(ParseStateId),
+    ShiftExtra,
+}
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub(crate) struct ParseTableEntry {
     pub actions: Vec<ParseAction>,
@@ -34,10 +40,11 @@ pub(crate) struct ParseTableEntry {
 pub(crate) struct ParseState {
     pub id: ParseStateId,
     pub terminal_entries: HashMap<Symbol, ParseTableEntry>,
-    pub nonterminal_entries: HashMap<Symbol, ParseStateId>,
+    pub nonterminal_entries: HashMap<Symbol, GotoAction>,
     pub lex_state_id: usize,
     pub external_lex_state_id: usize,
     pub core_id: usize,
+    pub is_non_terminal_extra: bool,
 }
 
 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
@@ -103,7 +110,13 @@ impl ParseState {
                     _ => None,
                 })
             })
-            .chain(self.nonterminal_entries.iter().map(|(_, state)| *state))
+            .chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
+                if let GotoAction::Goto(state) = action {
+                    Some(*state)
+                } else {
+                    None
+                }
+            }))
     }
 
     pub fn update_referenced_states<F>(&mut self, mut f: F)
@@ -121,15 +134,18 @@ impl ParseState {
                 }
             }
         }
-        for (symbol, other_state) in &self.nonterminal_entries {
-            let result = f(*other_state, self);
-            if result != *other_state {
-                updates.push((*symbol, 0, result));
+        for (symbol, action) in &self.nonterminal_entries {
+            if let GotoAction::Goto(other_state) = action {
+                let result = f(*other_state, self);
+                if result != *other_state {
+                    updates.push((*symbol, 0, result));
+                }
             }
         }
         for (symbol, action_index, new_state) in updates {
             if symbol.is_non_terminal() {
-                self.nonterminal_entries.insert(symbol, new_state);
+                self.nonterminal_entries
+                    .insert(symbol, GotoAction::Goto(new_state));
             } else {
                 let entry = self.terminal_entries.get_mut(&symbol).unwrap();
                 if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
@@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
   Length start_position = ts_stack_position(self->stack, version);
   Subtree external_token = ts_stack_last_external_token(self->stack, version);
   TSLexMode lex_mode = self->language->lex_modes[parse_state];
+  if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
   const bool *valid_external_tokens = ts_language_enabled_external_tokens(
     self->language,
     lex_mode.external_lex_state
@@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
   uint32_t count,
   int dynamic_precedence,
   uint16_t production_id,
-  bool fragile
+  bool is_fragile,
+  bool is_extra
 ) {
   uint32_t initial_version_count = ts_stack_version_count(self->stack);
   uint32_t removed_version_count = 0;
@@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
 
     TSStateId state = ts_stack_state(self->stack, slice_version);
     TSStateId next_state = ts_language_next_state(self->language, state, symbol);
-    if (fragile || pop.size > 1 || initial_version_count > 1) {
+    if (is_extra) parent.ptr->extra = true;
+    if (is_fragile || pop.size > 1 || initial_version_count > 1) {
       parent.ptr->fragile_left = true;
       parent.ptr->fragile_right = true;
       parent.ptr->parse_state = TS_TREE_STATE_NONE;
@@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
       reduction_version = ts_parser__reduce(
         self, version, action.symbol, action.count,
         action.dynamic_precedence, action.production_id,
-        true
+        true, false
       );
     }
 
@@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
     // Otherwise, re-run the lexer.
     if (!lookahead.ptr) {
       lookahead = ts_parser__lex(self, version, state);
-      ts_parser__set_cached_token(self, position, last_external_token, lookahead);
-      ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+      if (lookahead.ptr) {
+        ts_parser__set_cached_token(self, position, last_external_token, lookahead);
+        ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+      }
+
+      // When parsing a non-terminal extra, a null lookahead indicates the
+      // end of the rule. The reduction is stored in the EOF table entry.
+      // After the reduction, the lexer needs to be run again.
+      else {
+        ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
+      }
     }
 
     for (;;) {
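The comment above describes a small lexer-parser protocol: inside a non-terminal extra, a null lookahead means the rule is finished, and the fixed reduction to perform is stored under the built-in end-of-input symbol. A schematic model of that dispatch, with illustrative names rather than the C implementation:

    #[derive(Debug, PartialEq)]
    enum TableLookup {
        BySymbol(u16), // normal case: use the lookahead token's symbol
        ByEndOfInput,  // null lookahead: the EOF entry holds a fixed reduction
    }

    fn choose_lookup(lookahead: Option<u16>) -> TableLookup {
        match lookahead {
            Some(symbol) => TableLookup::BySymbol(symbol),
            None => TableLookup::ByEndOfInput,
        }
    }

    fn main() {
        assert_eq!(choose_lookup(Some(42)), TableLookup::BySymbol(42));
        assert_eq!(choose_lookup(None), TableLookup::ByEndOfInput);
    }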
@@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
 
       case TSParseActionTypeReduce: {
         bool is_fragile = table_entry.action_count > 1;
+        bool is_extra = lookahead.ptr == NULL;
         LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
         StackVersion reduction_version = ts_parser__reduce(
           self, version, action.params.symbol, action.params.child_count,
           action.params.dynamic_precedence, action.params.production_id,
-          is_fragile
+          is_fragile, is_extra
         );
         if (reduction_version != STACK_VERSION_NONE) {
           last_reduction_version = reduction_version;
@@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
       ts_stack_renumber_version(self->stack, last_reduction_version, version);
       LOG_STACK();
       state = ts_stack_state(self->stack, version);
+
+      // At the end of a non-terminal extra rule, the lexer will return a
+      // null subtree, because the parser needs to perform a fixed reduction
+      // regardless of the lookahead node. After performing that reduction
+      // (and completing the non-terminal extra rule), run the lexer again
+      // based on the current parse state.
+      if (!lookahead.ptr) {
+        lookahead = ts_parser__lex(self, version, state);
+      }
       ts_language_table_entry(
         self->language,
         state,
test/fixtures/test_grammars/extra_non_terminals/corpus.txt (new file, 22 lines)

@@ -0,0 +1,22 @@
+==============
+No extras
+==============
+
+a b c d
+
+---
+
+(module)
+
+==============
+Extras
+==============
+
+a (one) b (two) (three) c d
+
+---
+
+(module
+  (comment)
+  (comment)
+  (comment))
test/fixtures/test_grammars/extra_non_terminals/grammar.json (new file, 35 lines)

@@ -0,0 +1,35 @@
+{
+  "name": "extra_non_terminals",
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"},
+    {"type": "SYMBOL", "name": "comment"}
+  ],
+
+  "rules": {
+    "module": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "a"},
+        {"type": "STRING", "value": "b"},
+        {"type": "STRING", "value": "c"},
+        {"type": "STRING", "value": "d"}
+      ]
+    },
+
+    "comment": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "("},
+        {
+          "type": "REPEAT",
+          "content": {
+            "type": "PATTERN",
+            "value": "[a-z]+"
+          }
+        },
+        {"type": "STRING", "value": ")"}
+      ]
+    }
+  }
+}