From 310c0b86a7149089fe6861a89fbf5cf65ea3ada3 Mon Sep 17 00:00:00 2001 From: Will Lillis Date: Tue, 2 Sep 2025 22:02:31 -0400 Subject: [PATCH] fix(generate): return error when single state transitions have indirectly recursive cycles. This can cause infinite loops in the parser near EOF. Co-authored-by: Amaan Qureshi --- crates/generate/src/prepare_grammar.rs | 98 ++++++++++++++++++- .../expected_error.txt | 1 + .../grammar.js | 16 +++ 3 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 test/fixtures/test_grammars/indirect_recursion_in_transitions/expected_error.txt create mode 100644 test/fixtures/test_grammars/indirect_recursion_in_transitions/grammar.js diff --git a/crates/generate/src/prepare_grammar.rs b/crates/generate/src/prepare_grammar.rs index 2fe7f8a5..8c35c741 100644 --- a/crates/generate/src/prepare_grammar.rs +++ b/crates/generate/src/prepare_grammar.rs @@ -8,7 +8,7 @@ mod process_inlines; use std::{ cmp::Ordering, - collections::{hash_map, HashMap, HashSet}, + collections::{hash_map, BTreeSet, HashMap, HashSet}, mem, }; @@ -16,6 +16,7 @@ use anyhow::Result; pub use expand_tokens::ExpandTokensError; pub use extract_tokens::ExtractTokensError; pub use flatten_grammar::FlattenGrammarError; +use indexmap::IndexMap; pub use intern_symbols::InternSymbolsError; pub use process_inlines::ProcessInlinesError; use serde::Serialize; @@ -80,6 +81,7 @@ pub type PrepareGrammarResult = Result; #[error(transparent)] pub enum PrepareGrammarError { ValidatePrecedences(#[from] ValidatePrecedenceError), + ValidateIndirectRecursion(#[from] IndirectRecursionError), InternSymbols(#[from] InternSymbolsError), ExtractTokens(#[from] ExtractTokensError), FlattenGrammar(#[from] FlattenGrammarError), @@ -96,6 +98,22 @@ pub enum ValidatePrecedenceError { Ordering(#[from] ConflictingPrecedenceOrderingError), } +#[derive(Debug, Error, Serialize)] +pub struct IndirectRecursionError(pub Vec); + +impl std::fmt::Display for IndirectRecursionError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Grammar contains an indirectly recursive rule: ")?; + for (i, symbol) in self.0.iter().enumerate() { + if i > 0 { + write!(f, " -> ")?; + } + write!(f, "{symbol}")?; + } + Ok(()) + } +} + #[derive(Debug, Error, Serialize)] pub struct UndeclaredPrecedenceError { pub precedence: String, @@ -141,6 +159,7 @@ pub fn prepare_grammar( AliasMap, )> { validate_precedences(input_grammar)?; + validate_indirect_recursion(input_grammar)?; let interned_grammar = intern_symbols(input_grammar)?; let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?; @@ -152,6 +171,83 @@ pub fn prepare_grammar( Ok((syntax_grammar, lexical_grammar, inlines, default_aliases)) } +/// Check for indirect recursion cycles in the grammar that can cause infinite loops while +/// parsing. An indirect recursion cycle occurs when a non-terminal can derive itself through +/// a chain of single-symbol productions (e.g., A -> B, B -> A). +fn validate_indirect_recursion(grammar: &InputGrammar) -> Result<(), IndirectRecursionError> { + let mut epsilon_transitions: IndexMap<&str, BTreeSet> = IndexMap::new(); + + for variable in &grammar.variables { + let productions = get_single_symbol_productions(&variable.rule); + // Filter out rules that *directly* reference themselves, as this doesn't + // cause a parsing loop. + let filtered: BTreeSet = productions + .into_iter() + .filter(|s| s != &variable.name) + .collect(); + epsilon_transitions.insert(variable.name.as_str(), filtered); + } + + for start_symbol in epsilon_transitions.keys() { + let mut visited = BTreeSet::new(); + let mut path = Vec::new(); + if let Some((start_idx, end_idx)) = + get_cycle(start_symbol, &epsilon_transitions, &mut visited, &mut path) + { + let cycle_symbols = path[start_idx..=end_idx] + .iter() + .map(|s| (*s).to_string()) + .collect(); + return Err(IndirectRecursionError(cycle_symbols)); + } + } + + Ok(()) +} + +fn get_single_symbol_productions(rule: &Rule) -> BTreeSet { + match rule { + Rule::NamedSymbol(name) => BTreeSet::from([name.clone()]), + Rule::Choice(choices) => choices + .iter() + .flat_map(get_single_symbol_productions) + .collect(), + Rule::Metadata { rule, .. } => get_single_symbol_productions(rule), + _ => BTreeSet::new(), + } +} + +/// Perform a depth-first search to detect cycles in single state transitions. +fn get_cycle<'a>( + current: &'a str, + transitions: &'a IndexMap<&'a str, BTreeSet>, + visited: &mut BTreeSet<&'a str>, + path: &mut Vec<&'a str>, +) -> Option<(usize, usize)> { + if let Some(first_idx) = path.iter().position(|s| *s == current) { + path.push(current); + return Some((first_idx, path.len() - 1)); + } + + if visited.contains(current) { + return None; + } + + path.push(current); + visited.insert(current); + + if let Some(next_symbols) = transitions.get(current) { + for next in next_symbols { + if let Some(cycle) = get_cycle(next, transitions, visited, path) { + return Some(cycle); + } + } + } + + path.pop(); + None +} + /// Check that all of the named precedences used in the grammar are declared /// within the `precedences` lists, and also that there are no conflicting /// precedence orderings declared in those lists. diff --git a/test/fixtures/test_grammars/indirect_recursion_in_transitions/expected_error.txt b/test/fixtures/test_grammars/indirect_recursion_in_transitions/expected_error.txt new file mode 100644 index 00000000..4f244a6c --- /dev/null +++ b/test/fixtures/test_grammars/indirect_recursion_in_transitions/expected_error.txt @@ -0,0 +1 @@ +Grammar contains an indirectly recursive rule: type_expression -> _expression -> identifier_expression -> type_expression \ No newline at end of file diff --git a/test/fixtures/test_grammars/indirect_recursion_in_transitions/grammar.js b/test/fixtures/test_grammars/indirect_recursion_in_transitions/grammar.js new file mode 100644 index 00000000..65ff7b45 --- /dev/null +++ b/test/fixtures/test_grammars/indirect_recursion_in_transitions/grammar.js @@ -0,0 +1,16 @@ +module.exports = grammar({ + name: 'indirect_recursive_in_single_symbol_transitions', + rules: { + source_file: $ => repeat($._statement), + + _statement: $ => seq($.initialization_part, $.type_expression), + + type_expression: $ => choice('int', $._expression), + + initialization_part: $ => seq('=', $._expression), + + _expression: $ => choice($.identifier_expression, $.type_expression), + + identifier_expression: $ => choice(/[a-zA-Z_][a-zA-Z0-9_]*/, $.type_expression), + } +});