tree-sitter/cli/generate/src/parse_grammar.rs
Will Lillis 253003ccf8 fix(generate): warn users when extra rule can lead to parser hang
When a *named* rule in the extras is able to match the empty string,
parsing can hang in certain situations (i.e. near EOF).

(cherry picked from commit ac171eb280)
2025-08-29 23:32:57 -04:00

455 lines
14 KiB
Rust

use std::collections::HashSet;
use anyhow::Result;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use thiserror::Error;
use super::{
grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
rules::{Precedence, Rule},
};
use crate::grammars::ReservedWordContext;
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum RuleJSON {
ALIAS {
content: Box<RuleJSON>,
named: bool,
value: String,
},
BLANK,
STRING {
value: String,
},
PATTERN {
value: String,
flags: Option<String>,
},
SYMBOL {
name: String,
},
CHOICE {
members: Vec<RuleJSON>,
},
FIELD {
name: String,
content: Box<RuleJSON>,
},
SEQ {
members: Vec<RuleJSON>,
},
REPEAT {
content: Box<RuleJSON>,
},
REPEAT1 {
content: Box<RuleJSON>,
},
PREC_DYNAMIC {
value: i32,
content: Box<RuleJSON>,
},
PREC_LEFT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC_RIGHT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
TOKEN {
content: Box<RuleJSON>,
},
IMMEDIATE_TOKEN {
content: Box<RuleJSON>,
},
RESERVED {
context_name: String,
content: Box<RuleJSON>,
},
}
#[derive(Deserialize)]
#[serde(untagged)]
enum PrecedenceValueJSON {
Integer(i32),
Name(String),
}
#[derive(Deserialize)]
pub struct GrammarJSON {
pub name: String,
rules: Map<String, Value>,
#[serde(default)]
precedences: Vec<Vec<RuleJSON>>,
#[serde(default)]
conflicts: Vec<Vec<String>>,
#[serde(default)]
externals: Vec<RuleJSON>,
#[serde(default)]
extras: Vec<RuleJSON>,
#[serde(default)]
inline: Vec<String>,
#[serde(default)]
supertypes: Vec<String>,
#[serde(default)]
word: Option<String>,
#[serde(default)]
reserved: Map<String, Value>,
}
pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;
#[derive(Debug, Error, Serialize)]
pub enum ParseGrammarError {
#[error("{0}")]
Serialization(String),
#[error("Rules in the `extras` array must not contain empty strings")]
InvalidExtra,
#[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
Unexpected,
#[error("Reserved word sets must be arrays")]
InvalidReservedWordSet,
#[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
UnexpectedRule(String),
}
impl From<serde_json::Error> for ParseGrammarError {
fn from(value: serde_json::Error) -> Self {
Self::Serialization(value.to_string())
}
}
/// Check if a rule is referenced by another rule.
///
/// This function is used to determine if a variable is used in a given rule,
/// and `is_other` indicates if the rule is an external, and if it is,
/// to not assume that a named symbol that is equal to itself means it's being referenced.
///
/// For example, if we have an external rule **and** a normal rule both called `foo`,
/// `foo` should not be thought of as directly used unless it's used within another rule.
fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
match rule {
Rule::NamedSymbol(name) => name == target && !is_external,
Rule::Choice(rules) | Rule::Seq(rules) => {
rules.iter().any(|r| rule_is_referenced(r, target, false))
}
Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
rule_is_referenced(rule, target, is_external)
}
Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
}
}
fn variable_is_used(
grammar_rules: &[(String, Rule)],
extras: &[Rule],
externals: &[Rule],
target_name: &str,
in_progress: &mut HashSet<String>,
) -> bool {
let root = &grammar_rules.first().unwrap().0;
if target_name == root {
return true;
}
if extras
.iter()
.any(|rule| rule_is_referenced(rule, target_name, false))
{
return true;
}
if externals
.iter()
.any(|rule| rule_is_referenced(rule, target_name, true))
{
return true;
}
in_progress.insert(target_name.to_string());
let result = grammar_rules
.iter()
.filter(|(key, _)| *key != target_name)
.any(|(name, rule)| {
if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
return false;
}
variable_is_used(grammar_rules, extras, externals, name, in_progress)
});
in_progress.remove(target_name);
result
}
pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
let mut extra_symbols =
grammar_json
.extras
.into_iter()
.try_fold(Vec::<Rule>::new(), |mut acc, item| {
let rule = parse_rule(item, false)?;
if let Rule::String(ref value) = rule {
if value.is_empty() {
Err(ParseGrammarError::InvalidExtra)?;
}
}
acc.push(rule);
ParseGrammarResult::Ok(acc)
})?;
let mut external_tokens = grammar_json
.externals
.into_iter()
.map(|e| parse_rule(e, false))
.collect::<ParseGrammarResult<Vec<_>>>()?;
let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
for list in grammar_json.precedences {
let mut ordering = Vec::with_capacity(list.len());
for entry in list {
ordering.push(match entry {
RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
_ => Err(ParseGrammarError::Unexpected)?,
});
}
precedence_orderings.push(ordering);
}
let mut variables = Vec::with_capacity(grammar_json.rules.len());
let rules = grammar_json
.rules
.into_iter()
.map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
.collect::<ParseGrammarResult<Vec<_>>>()?;
let mut in_progress = HashSet::new();
for (name, rule) in &rules {
if grammar_json.word.as_ref().is_none_or(|w| w != name)
&& !variable_is_used(
&rules,
&extra_symbols,
&external_tokens,
name,
&mut in_progress,
)
{
grammar_json.conflicts.retain(|r| !r.contains(name));
grammar_json.supertypes.retain(|r| r != name);
grammar_json.inline.retain(|r| r != name);
extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
external_tokens.retain(|r| !rule_is_referenced(r, name, true));
precedence_orderings.retain(|r| {
!r.iter().any(|e| {
let PrecedenceEntry::Symbol(s) = e else {
return false;
};
s == name
})
});
continue;
}
if extra_symbols
.iter()
.any(|r| rule_is_referenced(r, name, false))
{
let inner_rule = if let Rule::Metadata { rule, .. } = rule {
rule
} else {
rule
};
let matches_empty = match inner_rule {
Rule::String(rule_str) => rule_str.is_empty(),
Rule::Pattern(ref value, _) => Regex::new(value)
.map(|reg| reg.is_match(""))
.unwrap_or(false),
_ => false,
};
if matches_empty {
eprintln!("Warning: Named extra rule `{name}` matches the empty string. Inline this to avoid infinite loops while parsing.");
}
}
variables.push(Variable {
name: name.clone(),
kind: VariableType::Named,
rule: rule.clone(),
});
}
let reserved_words = grammar_json
.reserved
.into_iter()
.map(|(name, rule_values)| {
let Value::Array(rule_values) = rule_values else {
Err(ParseGrammarError::InvalidReservedWordSet)?
};
let mut reserved_words = Vec::with_capacity(rule_values.len());
for value in rule_values {
reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
}
Ok(ReservedWordContext {
name,
reserved_words,
})
})
.collect::<ParseGrammarResult<Vec<_>>>()?;
Ok(InputGrammar {
name: grammar_json.name,
word_token: grammar_json.word,
expected_conflicts: grammar_json.conflicts,
supertype_symbols: grammar_json.supertypes,
variables_to_inline: grammar_json.inline,
precedence_orderings,
variables,
extra_symbols,
external_tokens,
reserved_words,
})
}
fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
match json {
RuleJSON::ALIAS {
content,
value,
named,
} => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
RuleJSON::BLANK => Ok(Rule::Blank),
RuleJSON::STRING { value } => Ok(Rule::String(value)),
RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
value,
flags.map_or(String::new(), |f| {
f.matches(|c| {
if c == 'i' {
true
} else {
// silently ignore unicode flags
if c != 'u' && c != 'v' {
eprintln!("Warning: unsupported flag {c}");
}
false
}
})
.collect()
}),
)),
RuleJSON::SYMBOL { name } => {
if is_token {
Err(ParseGrammarError::UnexpectedRule(name))?
} else {
Ok(Rule::NamedSymbol(name))
}
}
RuleJSON::CHOICE { members } => members
.into_iter()
.map(|m| parse_rule(m, is_token))
.collect::<ParseGrammarResult<Vec<_>>>()
.map(Rule::choice),
RuleJSON::FIELD { content, name } => {
parse_rule(*content, is_token).map(|r| Rule::field(name, r))
}
RuleJSON::SEQ { members } => members
.into_iter()
.map(|m| parse_rule(m, is_token))
.collect::<ParseGrammarResult<Vec<_>>>()
.map(Rule::seq),
RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
RuleJSON::REPEAT { content } => {
parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
}
RuleJSON::PREC { value, content } => {
parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
}
RuleJSON::PREC_LEFT { value, content } => {
parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
}
RuleJSON::PREC_RIGHT { value, content } => {
parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
}
RuleJSON::PREC_DYNAMIC { value, content } => {
parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
}
RuleJSON::RESERVED {
content,
context_name,
} => parse_rule(*content, is_token).map(|r| Rule::Reserved {
rule: Box::new(r),
context_name,
}),
RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
RuleJSON::IMMEDIATE_TOKEN { content } => {
parse_rule(*content, is_token).map(Rule::immediate_token)
}
}
}
impl From<PrecedenceValueJSON> for Precedence {
fn from(val: PrecedenceValueJSON) -> Self {
match val {
PrecedenceValueJSON::Integer(i) => Self::Integer(i),
PrecedenceValueJSON::Name(i) => Self::Name(i),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_grammar() {
let grammar = parse_grammar(
r#"{
"name": "my_lang",
"rules": {
"file": {
"type": "REPEAT1",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "STRING",
"value": "foo"
}
}
}"#,
)
.unwrap();
assert_eq!(grammar.name, "my_lang");
assert_eq!(
grammar.variables,
vec![
Variable {
name: "file".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
},
Variable {
name: "statement".to_string(),
kind: VariableType::Named,
rule: Rule::String("foo".to_string())
},
]
);
}
}