use std::collections::HashSet;

use anyhow::Result;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use thiserror::Error;

use super::{
    grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
    rules::{Precedence, Rule},
};
use crate::grammars::ReservedWordContext;

#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum RuleJSON {
    ALIAS {
        content: Box<RuleJSON>,
        named: bool,
        value: String,
    },
    BLANK,
    STRING {
        value: String,
    },
    PATTERN {
        value: String,
        flags: Option<String>,
    },
    SYMBOL {
        name: String,
    },
    CHOICE {
        members: Vec<RuleJSON>,
    },
    FIELD {
        name: String,
        content: Box<RuleJSON>,
    },
    SEQ {
        members: Vec<RuleJSON>,
    },
    REPEAT {
        content: Box<RuleJSON>,
    },
    REPEAT1 {
        content: Box<RuleJSON>,
    },
    PREC_DYNAMIC {
        value: i32,
        content: Box<RuleJSON>,
    },
    PREC_LEFT {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    PREC_RIGHT {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    PREC {
        value: PrecedenceValueJSON,
        content: Box<RuleJSON>,
    },
    TOKEN {
        content: Box<RuleJSON>,
    },
    IMMEDIATE_TOKEN {
        content: Box<RuleJSON>,
    },
    RESERVED {
        context_name: String,
        content: Box<RuleJSON>,
    },
}

#[derive(Deserialize)]
#[serde(untagged)]
enum PrecedenceValueJSON {
    Integer(i32),
    Name(String),
}

#[derive(Deserialize)]
pub struct GrammarJSON {
    pub name: String,
    rules: Map<String, Value>,
    #[serde(default)]
    precedences: Vec<Vec<RuleJSON>>,
    #[serde(default)]
    conflicts: Vec<Vec<String>>,
    #[serde(default)]
    externals: Vec<RuleJSON>,
    #[serde(default)]
    extras: Vec<RuleJSON>,
    #[serde(default)]
    inline: Vec<String>,
    #[serde(default)]
    supertypes: Vec<String>,
    #[serde(default)]
    word: Option<String>,
    #[serde(default)]
    reserved: Map<String, Value>,
}

pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;

#[derive(Debug, Error, Serialize)]
pub enum ParseGrammarError {
    #[error("{0}")]
    Serialization(String),
    #[error("Rules in the `extras` array must not contain empty strings")]
    InvalidExtra,
    #[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
    Unexpected,
    #[error("Reserved word sets must be arrays")]
    InvalidReservedWordSet,
    #[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
    UnexpectedRule(String),
}

impl From<serde_json::Error> for ParseGrammarError {
    fn from(value: serde_json::Error) -> Self {
        Self::Serialization(value.to_string())
    }
}

/// Check whether a rule is referenced by another rule.
///
/// This function is used to determine whether a variable is used in a given rule.
/// `is_external` indicates whether the rule being inspected is an external token;
/// if it is, a named symbol with the same name as the target is not counted as a
/// reference to itself.
///
/// For example, if we have an external rule **and** a normal rule both called `foo`,
/// `foo` should not be considered used unless it is referenced within another rule.
fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
    match rule {
        Rule::NamedSymbol(name) => name == target && !is_external,
        Rule::Choice(rules) | Rule::Seq(rules) => {
            rules.iter().any(|r| rule_is_referenced(r, target, false))
        }
        Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
            rule_is_referenced(rule, target, is_external)
        }
        Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
        Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
    }
}

/// Check whether a grammar variable is used, i.e. whether it is the root rule or is
/// (transitively) referenced from the root, from `extras`, or from `externals`.
fn variable_is_used(
    grammar_rules: &[(String, Rule)],
    extras: &[Rule],
    externals: &[Rule],
    target_name: &str,
    in_progress: &mut HashSet<String>,
) -> bool {
    let root = &grammar_rules.first().unwrap().0;
    if target_name == root {
        return true;
    }

    if extras
        .iter()
        .any(|rule| rule_is_referenced(rule, target_name, false))
    {
        return true;
    }

    if externals
        .iter()
        .any(|rule| rule_is_referenced(rule, target_name, true))
    {
        return true;
    }

    // Track the rules currently being visited so that cycles don't recurse forever.
    in_progress.insert(target_name.to_string());
    let result = grammar_rules
        .iter()
        .filter(|(key, _)| *key != target_name)
        .any(|(name, rule)| {
            if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
                return false;
            }
            variable_is_used(grammar_rules, extras, externals, name, in_progress)
        });
    in_progress.remove(target_name);

    result
}

pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
    let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;

    let mut extra_symbols = grammar_json
        .extras
        .into_iter()
        .try_fold(Vec::<Rule>::new(), |mut acc, item| {
            let rule = parse_rule(item, false)?;
            if let Rule::String(ref value) = rule {
                if value.is_empty() {
                    Err(ParseGrammarError::InvalidExtra)?;
                }
            }
            acc.push(rule);
            ParseGrammarResult::Ok(acc)
        })?;

    let mut external_tokens = grammar_json
        .externals
        .into_iter()
        .map(|e| parse_rule(e, false))
        .collect::<ParseGrammarResult<Vec<_>>>()?;

    let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
    for list in grammar_json.precedences {
        let mut ordering = Vec::with_capacity(list.len());
        for entry in list {
            ordering.push(match entry {
                RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
                RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
                _ => Err(ParseGrammarError::Unexpected)?,
            });
        }
        precedence_orderings.push(ordering);
    }

    let mut variables = Vec::with_capacity(grammar_json.rules.len());

    let rules = grammar_json
        .rules
        .into_iter()
        .map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
        .collect::<ParseGrammarResult<Vec<_>>>()?;

    let mut in_progress = HashSet::new();
    for (name, rule) in &rules {
        if grammar_json.word.as_ref().is_none_or(|w| w != name)
            && !variable_is_used(
                &rules,
                &extra_symbols,
                &external_tokens,
                name,
                &mut in_progress,
            )
        {
            // The rule is never used, so drop it along with every reference to it.
            grammar_json.conflicts.retain(|r| !r.contains(name));
            grammar_json.supertypes.retain(|r| r != name);
            grammar_json.inline.retain(|r| r != name);
            extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
            external_tokens.retain(|r| !rule_is_referenced(r, name, true));
            precedence_orderings.retain(|r| {
                !r.iter().any(|e| {
                    let PrecedenceEntry::Symbol(s) = e else {
                        return false;
                    };
                    s == name
                })
            });
            continue;
        }

        if extra_symbols
            .iter()
            .any(|r| rule_is_referenced(r, name, false))
        {
            let inner_rule = if let Rule::Metadata { rule, .. } = rule {
                rule
            } else {
                rule
            };

            let matches_empty = match inner_rule {
                Rule::String(rule_str) => rule_str.is_empty(),
                Rule::Pattern(ref value, _) => Regex::new(value)
                    .map(|reg| reg.is_match(""))
                    .unwrap_or(false),
                _ => false,
            };

            if matches_empty {
                eprintln!("Warning: Named extra rule `{name}` matches the empty string. Inline this to avoid infinite loops while parsing.");
            }
        }

        variables.push(Variable {
            name: name.clone(),
            kind: VariableType::Named,
            rule: rule.clone(),
        });
    }

    let reserved_words = grammar_json
        .reserved
        .into_iter()
        .map(|(name, rule_values)| {
            let Value::Array(rule_values) = rule_values else {
                Err(ParseGrammarError::InvalidReservedWordSet)?
            };
            let mut reserved_words = Vec::with_capacity(rule_values.len());
            for value in rule_values {
                reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
            }
            Ok(ReservedWordContext {
                name,
                reserved_words,
            })
        })
        .collect::<ParseGrammarResult<Vec<_>>>()?;

    Ok(InputGrammar {
        name: grammar_json.name,
        word_token: grammar_json.word,
        expected_conflicts: grammar_json.conflicts,
        supertype_symbols: grammar_json.supertypes,
        variables_to_inline: grammar_json.inline,
        precedence_orderings,
        variables,
        extra_symbols,
        external_tokens,
        reserved_words,
    })
}

fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
    match json {
        RuleJSON::ALIAS {
            content,
            value,
            named,
        } => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
        RuleJSON::BLANK => Ok(Rule::Blank),
        RuleJSON::STRING { value } => Ok(Rule::String(value)),
        RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
            value,
            flags.map_or(String::new(), |f| {
                f.matches(|c| {
                    if c == 'i' {
                        true
                    } else {
                        // silently ignore unicode flags
                        if c != 'u' && c != 'v' {
                            eprintln!("Warning: unsupported flag {c}");
                        }
                        false
                    }
                })
                .collect()
            }),
        )),
        RuleJSON::SYMBOL { name } => {
            if is_token {
                Err(ParseGrammarError::UnexpectedRule(name))?
            } else {
                Ok(Rule::NamedSymbol(name))
            }
        }
        RuleJSON::CHOICE { members } => members
            .into_iter()
            .map(|m| parse_rule(m, is_token))
            .collect::<ParseGrammarResult<Vec<_>>>()
            .map(Rule::choice),
        RuleJSON::FIELD { content, name } => {
            parse_rule(*content, is_token).map(|r| Rule::field(name, r))
        }
        RuleJSON::SEQ { members } => members
            .into_iter()
            .map(|m| parse_rule(m, is_token))
            .collect::<ParseGrammarResult<Vec<_>>>()
            .map(Rule::seq),
        RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
        RuleJSON::REPEAT { content } => {
            parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
        }
        RuleJSON::PREC { value, content } => {
            parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
        }
        RuleJSON::PREC_LEFT { value, content } => {
            parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
        }
        RuleJSON::PREC_RIGHT { value, content } => {
            parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
        }
        RuleJSON::PREC_DYNAMIC { value, content } => {
            parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
        }
        RuleJSON::RESERVED {
            content,
            context_name,
        } => parse_rule(*content, is_token).map(|r| Rule::Reserved {
            rule: Box::new(r),
            context_name,
        }),
        RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
        RuleJSON::IMMEDIATE_TOKEN { content } => {
            parse_rule(*content, is_token).map(Rule::immediate_token)
        }
    }
}

impl From<PrecedenceValueJSON> for Precedence {
    fn from(val: PrecedenceValueJSON) -> Self {
        match val {
            PrecedenceValueJSON::Integer(i) => Self::Integer(i),
            PrecedenceValueJSON::Name(i) => Self::Name(i),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_grammar() {
        let grammar = parse_grammar(
            r#"{
                "name": "my_lang",
                "rules": {
                    "file": {
                        "type": "REPEAT1",
                        "content": {
                            "type": "SYMBOL",
                            "name": "statement"
                        }
                    },
                    "statement": {
                        "type": "STRING",
                        "value": "foo"
                    }
                }
            }"#,
        )
        .unwrap();

        assert_eq!(grammar.name, "my_lang");
        assert_eq!(
            grammar.variables,
            vec![
                Variable {
                    name: "file".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
                },
                Variable {
                    name: "statement".to_string(),
                    kind: VariableType::Named,
                    rule: Rule::String("foo".to_string())
                },
            ]
        );
    }
}
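
// An illustrative extra test, added as a sketch rather than taken from the original
// source: it exercises the unused-rule pruning in `parse_grammar`, which skips any
// rule that is not the first (root) rule, is not the `word` token, and is not
// referenced from the root, `extras`, or `externals`. The module name, test name,
// grammar JSON, and the rule name `zombie` below are hypothetical.
#[cfg(test)]
mod pruning_tests {
    use super::*;

    #[test]
    fn unused_rule_is_dropped() {
        let grammar = parse_grammar(
            r#"{
                "name": "my_lang",
                "rules": {
                    "file": { "type": "SYMBOL", "name": "statement" },
                    "statement": { "type": "STRING", "value": "foo" },
                    "zombie": { "type": "STRING", "value": "bar" }
                }
            }"#,
        )
        .unwrap();

        // `zombie` is unreachable from `file`, so it should not survive parsing.
        assert!(grammar.variables.iter().all(|v| v.name != "zombie"));
        assert_eq!(grammar.variables.len(), 2);
    }
}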