fix(generate): return error when single state transitions have

indirectly recursive cycles.

This can cause infinite loops in the parser near EOF.

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
(cherry picked from commit 310c0b86a7)
This commit is contained in:
Will Lillis 2025-09-02 22:02:31 -04:00 committed by Amaan Qureshi
parent 8e2b5ad2a4
commit 14c4d2f8ca
3 changed files with 114 additions and 1 deletions

View file

@ -8,7 +8,7 @@ mod process_inlines;
use std::{
cmp::Ordering,
collections::{hash_map, HashMap, HashSet},
collections::{hash_map, BTreeSet, HashMap, HashSet},
mem,
};
@ -16,6 +16,7 @@ use anyhow::Result;
pub use expand_tokens::ExpandTokensError;
pub use extract_tokens::ExtractTokensError;
pub use flatten_grammar::FlattenGrammarError;
use indexmap::IndexMap;
pub use intern_symbols::InternSymbolsError;
pub use process_inlines::ProcessInlinesError;
use serde::Serialize;
@ -80,6 +81,7 @@ pub type PrepareGrammarResult<T> = Result<T, PrepareGrammarError>;
#[error(transparent)]
pub enum PrepareGrammarError {
ValidatePrecedences(#[from] ValidatePrecedenceError),
ValidateIndirectRecursion(#[from] IndirectRecursionError),
InternSymbols(#[from] InternSymbolsError),
ExtractTokens(#[from] ExtractTokensError),
FlattenGrammar(#[from] FlattenGrammarError),
@ -96,6 +98,22 @@ pub enum ValidatePrecedenceError {
Ordering(#[from] ConflictingPrecedenceOrderingError),
}
#[derive(Debug, Error, Serialize)]
pub struct IndirectRecursionError(pub Vec<String>);
impl std::fmt::Display for IndirectRecursionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Grammar contains an indirectly recursive rule: ")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, " -> ")?;
}
write!(f, "{symbol}")?;
}
Ok(())
}
}
#[derive(Debug, Error, Serialize)]
pub struct UndeclaredPrecedenceError {
pub precedence: String,
@ -141,6 +159,7 @@ pub fn prepare_grammar(
AliasMap,
)> {
validate_precedences(input_grammar)?;
validate_indirect_recursion(input_grammar)?;
let interned_grammar = intern_symbols(input_grammar)?;
let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
@ -152,6 +171,83 @@ pub fn prepare_grammar(
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
}
/// Check for indirect recursion cycles in the grammar that can cause infinite loops while
/// parsing. An indirect recursion cycle occurs when a non-terminal can derive itself through
/// a chain of single-symbol productions (e.g., A -> B, B -> A).
fn validate_indirect_recursion(grammar: &InputGrammar) -> Result<(), IndirectRecursionError> {
let mut epsilon_transitions: IndexMap<&str, BTreeSet<String>> = IndexMap::new();
for variable in &grammar.variables {
let productions = get_single_symbol_productions(&variable.rule);
// Filter out rules that *directly* reference themselves, as this doesn't
// cause a parsing loop.
let filtered: BTreeSet<String> = productions
.into_iter()
.filter(|s| s != &variable.name)
.collect();
epsilon_transitions.insert(variable.name.as_str(), filtered);
}
for start_symbol in epsilon_transitions.keys() {
let mut visited = BTreeSet::new();
let mut path = Vec::new();
if let Some((start_idx, end_idx)) =
get_cycle(start_symbol, &epsilon_transitions, &mut visited, &mut path)
{
let cycle_symbols = path[start_idx..=end_idx]
.iter()
.map(|s| (*s).to_string())
.collect();
return Err(IndirectRecursionError(cycle_symbols));
}
}
Ok(())
}
fn get_single_symbol_productions(rule: &Rule) -> BTreeSet<String> {
match rule {
Rule::NamedSymbol(name) => BTreeSet::from([name.clone()]),
Rule::Choice(choices) => choices
.iter()
.flat_map(get_single_symbol_productions)
.collect(),
Rule::Metadata { rule, .. } => get_single_symbol_productions(rule),
_ => BTreeSet::new(),
}
}
/// Perform a depth-first search to detect cycles in single state transitions.
fn get_cycle<'a>(
current: &'a str,
transitions: &'a IndexMap<&'a str, BTreeSet<String>>,
visited: &mut BTreeSet<&'a str>,
path: &mut Vec<&'a str>,
) -> Option<(usize, usize)> {
if let Some(first_idx) = path.iter().position(|s| *s == current) {
path.push(current);
return Some((first_idx, path.len() - 1));
}
if visited.contains(current) {
return None;
}
path.push(current);
visited.insert(current);
if let Some(next_symbols) = transitions.get(current) {
for next in next_symbols {
if let Some(cycle) = get_cycle(next, transitions, visited, path) {
return Some(cycle);
}
}
}
path.pop();
None
}
/// Check that all of the named precedences used in the grammar are declared
/// within the `precedences` lists, and also that there are no conflicting
/// precedence orderings declared in those lists.

View file

@ -0,0 +1 @@
Grammar contains an indirectly recursive rule: type_expression -> _expression -> identifier_expression -> type_expression

View file

@ -0,0 +1,16 @@
module.exports = grammar({
name: 'indirect_recursive_in_single_symbol_transitions',
rules: {
source_file: $ => repeat($._statement),
_statement: $ => seq($.initialization_part, $.type_expression),
type_expression: $ => choice('int', $._expression),
initialization_part: $ => seq('=', $._expression),
_expression: $ => choice($.identifier_expression, $.type_expression),
identifier_expression: $ => choice(/[a-zA-Z_][a-zA-Z0-9_]*/, $.type_expression),
}
});