When a *named* rule in the extras is able to match the empty string,
parsing can hang in certain situations (i.e. near EOF).
(cherry picked from commit ac171eb280)
455 lines
14 KiB
Rust
455 lines
14 KiB
Rust
use std::collections::HashSet;
|
|
|
|
use anyhow::Result;
|
|
use regex::Regex;
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::{Map, Value};
|
|
use thiserror::Error;
|
|
|
|
use super::{
|
|
grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
|
|
rules::{Precedence, Rule},
|
|
};
|
|
use crate::grammars::ReservedWordContext;
|
|
|
|
#[derive(Deserialize)]
|
|
#[serde(tag = "type")]
|
|
#[allow(non_camel_case_types)]
|
|
#[allow(clippy::upper_case_acronyms)]
|
|
enum RuleJSON {
|
|
ALIAS {
|
|
content: Box<RuleJSON>,
|
|
named: bool,
|
|
value: String,
|
|
},
|
|
BLANK,
|
|
STRING {
|
|
value: String,
|
|
},
|
|
PATTERN {
|
|
value: String,
|
|
flags: Option<String>,
|
|
},
|
|
SYMBOL {
|
|
name: String,
|
|
},
|
|
CHOICE {
|
|
members: Vec<RuleJSON>,
|
|
},
|
|
FIELD {
|
|
name: String,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
SEQ {
|
|
members: Vec<RuleJSON>,
|
|
},
|
|
REPEAT {
|
|
content: Box<RuleJSON>,
|
|
},
|
|
REPEAT1 {
|
|
content: Box<RuleJSON>,
|
|
},
|
|
PREC_DYNAMIC {
|
|
value: i32,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
PREC_LEFT {
|
|
value: PrecedenceValueJSON,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
PREC_RIGHT {
|
|
value: PrecedenceValueJSON,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
PREC {
|
|
value: PrecedenceValueJSON,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
TOKEN {
|
|
content: Box<RuleJSON>,
|
|
},
|
|
IMMEDIATE_TOKEN {
|
|
content: Box<RuleJSON>,
|
|
},
|
|
RESERVED {
|
|
context_name: String,
|
|
content: Box<RuleJSON>,
|
|
},
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
#[serde(untagged)]
|
|
enum PrecedenceValueJSON {
|
|
Integer(i32),
|
|
Name(String),
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
pub struct GrammarJSON {
|
|
pub name: String,
|
|
rules: Map<String, Value>,
|
|
#[serde(default)]
|
|
precedences: Vec<Vec<RuleJSON>>,
|
|
#[serde(default)]
|
|
conflicts: Vec<Vec<String>>,
|
|
#[serde(default)]
|
|
externals: Vec<RuleJSON>,
|
|
#[serde(default)]
|
|
extras: Vec<RuleJSON>,
|
|
#[serde(default)]
|
|
inline: Vec<String>,
|
|
#[serde(default)]
|
|
supertypes: Vec<String>,
|
|
#[serde(default)]
|
|
word: Option<String>,
|
|
#[serde(default)]
|
|
reserved: Map<String, Value>,
|
|
}
|
|
|
|
pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;
|
|
|
|
#[derive(Debug, Error, Serialize)]
|
|
pub enum ParseGrammarError {
|
|
#[error("{0}")]
|
|
Serialization(String),
|
|
#[error("Rules in the `extras` array must not contain empty strings")]
|
|
InvalidExtra,
|
|
#[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
|
|
Unexpected,
|
|
#[error("Reserved word sets must be arrays")]
|
|
InvalidReservedWordSet,
|
|
#[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
|
|
UnexpectedRule(String),
|
|
}
|
|
|
|
impl From<serde_json::Error> for ParseGrammarError {
|
|
fn from(value: serde_json::Error) -> Self {
|
|
Self::Serialization(value.to_string())
|
|
}
|
|
}
|
|
|
|
/// Check if a rule is referenced by another rule.
|
|
///
|
|
/// This function is used to determine if a variable is used in a given rule,
|
|
/// and `is_other` indicates if the rule is an external, and if it is,
|
|
/// to not assume that a named symbol that is equal to itself means it's being referenced.
|
|
///
|
|
/// For example, if we have an external rule **and** a normal rule both called `foo`,
|
|
/// `foo` should not be thought of as directly used unless it's used within another rule.
|
|
fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
|
|
match rule {
|
|
Rule::NamedSymbol(name) => name == target && !is_external,
|
|
Rule::Choice(rules) | Rule::Seq(rules) => {
|
|
rules.iter().any(|r| rule_is_referenced(r, target, false))
|
|
}
|
|
Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
|
|
rule_is_referenced(rule, target, is_external)
|
|
}
|
|
Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
|
|
Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
|
|
}
|
|
}
|
|
|
|
fn variable_is_used(
|
|
grammar_rules: &[(String, Rule)],
|
|
extras: &[Rule],
|
|
externals: &[Rule],
|
|
target_name: &str,
|
|
in_progress: &mut HashSet<String>,
|
|
) -> bool {
|
|
let root = &grammar_rules.first().unwrap().0;
|
|
if target_name == root {
|
|
return true;
|
|
}
|
|
|
|
if extras
|
|
.iter()
|
|
.any(|rule| rule_is_referenced(rule, target_name, false))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
if externals
|
|
.iter()
|
|
.any(|rule| rule_is_referenced(rule, target_name, true))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
in_progress.insert(target_name.to_string());
|
|
let result = grammar_rules
|
|
.iter()
|
|
.filter(|(key, _)| *key != target_name)
|
|
.any(|(name, rule)| {
|
|
if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
|
|
return false;
|
|
}
|
|
variable_is_used(grammar_rules, extras, externals, name, in_progress)
|
|
});
|
|
in_progress.remove(target_name);
|
|
|
|
result
|
|
}
|
|
|
|
pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
|
|
let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
|
|
|
|
let mut extra_symbols =
|
|
grammar_json
|
|
.extras
|
|
.into_iter()
|
|
.try_fold(Vec::<Rule>::new(), |mut acc, item| {
|
|
let rule = parse_rule(item, false)?;
|
|
if let Rule::String(ref value) = rule {
|
|
if value.is_empty() {
|
|
Err(ParseGrammarError::InvalidExtra)?;
|
|
}
|
|
}
|
|
acc.push(rule);
|
|
ParseGrammarResult::Ok(acc)
|
|
})?;
|
|
|
|
let mut external_tokens = grammar_json
|
|
.externals
|
|
.into_iter()
|
|
.map(|e| parse_rule(e, false))
|
|
.collect::<ParseGrammarResult<Vec<_>>>()?;
|
|
|
|
let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
|
|
for list in grammar_json.precedences {
|
|
let mut ordering = Vec::with_capacity(list.len());
|
|
for entry in list {
|
|
ordering.push(match entry {
|
|
RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
|
|
RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
|
|
_ => Err(ParseGrammarError::Unexpected)?,
|
|
});
|
|
}
|
|
precedence_orderings.push(ordering);
|
|
}
|
|
|
|
let mut variables = Vec::with_capacity(grammar_json.rules.len());
|
|
|
|
let rules = grammar_json
|
|
.rules
|
|
.into_iter()
|
|
.map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
|
|
.collect::<ParseGrammarResult<Vec<_>>>()?;
|
|
|
|
let mut in_progress = HashSet::new();
|
|
|
|
for (name, rule) in &rules {
|
|
if grammar_json.word.as_ref().is_none_or(|w| w != name)
|
|
&& !variable_is_used(
|
|
&rules,
|
|
&extra_symbols,
|
|
&external_tokens,
|
|
name,
|
|
&mut in_progress,
|
|
)
|
|
{
|
|
grammar_json.conflicts.retain(|r| !r.contains(name));
|
|
grammar_json.supertypes.retain(|r| r != name);
|
|
grammar_json.inline.retain(|r| r != name);
|
|
extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
|
|
external_tokens.retain(|r| !rule_is_referenced(r, name, true));
|
|
precedence_orderings.retain(|r| {
|
|
!r.iter().any(|e| {
|
|
let PrecedenceEntry::Symbol(s) = e else {
|
|
return false;
|
|
};
|
|
s == name
|
|
})
|
|
});
|
|
continue;
|
|
}
|
|
|
|
if extra_symbols
|
|
.iter()
|
|
.any(|r| rule_is_referenced(r, name, false))
|
|
{
|
|
let inner_rule = if let Rule::Metadata { rule, .. } = rule {
|
|
rule
|
|
} else {
|
|
rule
|
|
};
|
|
let matches_empty = match inner_rule {
|
|
Rule::String(rule_str) => rule_str.is_empty(),
|
|
Rule::Pattern(ref value, _) => Regex::new(value)
|
|
.map(|reg| reg.is_match(""))
|
|
.unwrap_or(false),
|
|
_ => false,
|
|
};
|
|
if matches_empty {
|
|
eprintln!("Warning: Named extra rule `{name}` matches the empty string. Inline this to avoid infinite loops while parsing.");
|
|
}
|
|
}
|
|
variables.push(Variable {
|
|
name: name.clone(),
|
|
kind: VariableType::Named,
|
|
rule: rule.clone(),
|
|
});
|
|
}
|
|
|
|
let reserved_words = grammar_json
|
|
.reserved
|
|
.into_iter()
|
|
.map(|(name, rule_values)| {
|
|
let Value::Array(rule_values) = rule_values else {
|
|
Err(ParseGrammarError::InvalidReservedWordSet)?
|
|
};
|
|
|
|
let mut reserved_words = Vec::with_capacity(rule_values.len());
|
|
for value in rule_values {
|
|
reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
|
|
}
|
|
Ok(ReservedWordContext {
|
|
name,
|
|
reserved_words,
|
|
})
|
|
})
|
|
.collect::<ParseGrammarResult<Vec<_>>>()?;
|
|
|
|
Ok(InputGrammar {
|
|
name: grammar_json.name,
|
|
word_token: grammar_json.word,
|
|
expected_conflicts: grammar_json.conflicts,
|
|
supertype_symbols: grammar_json.supertypes,
|
|
variables_to_inline: grammar_json.inline,
|
|
precedence_orderings,
|
|
variables,
|
|
extra_symbols,
|
|
external_tokens,
|
|
reserved_words,
|
|
})
|
|
}
|
|
|
|
fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
|
|
match json {
|
|
RuleJSON::ALIAS {
|
|
content,
|
|
value,
|
|
named,
|
|
} => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
|
|
RuleJSON::BLANK => Ok(Rule::Blank),
|
|
RuleJSON::STRING { value } => Ok(Rule::String(value)),
|
|
RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
|
|
value,
|
|
flags.map_or(String::new(), |f| {
|
|
f.matches(|c| {
|
|
if c == 'i' {
|
|
true
|
|
} else {
|
|
// silently ignore unicode flags
|
|
if c != 'u' && c != 'v' {
|
|
eprintln!("Warning: unsupported flag {c}");
|
|
}
|
|
false
|
|
}
|
|
})
|
|
.collect()
|
|
}),
|
|
)),
|
|
RuleJSON::SYMBOL { name } => {
|
|
if is_token {
|
|
Err(ParseGrammarError::UnexpectedRule(name))?
|
|
} else {
|
|
Ok(Rule::NamedSymbol(name))
|
|
}
|
|
}
|
|
RuleJSON::CHOICE { members } => members
|
|
.into_iter()
|
|
.map(|m| parse_rule(m, is_token))
|
|
.collect::<ParseGrammarResult<Vec<_>>>()
|
|
.map(Rule::choice),
|
|
RuleJSON::FIELD { content, name } => {
|
|
parse_rule(*content, is_token).map(|r| Rule::field(name, r))
|
|
}
|
|
RuleJSON::SEQ { members } => members
|
|
.into_iter()
|
|
.map(|m| parse_rule(m, is_token))
|
|
.collect::<ParseGrammarResult<Vec<_>>>()
|
|
.map(Rule::seq),
|
|
RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
|
|
RuleJSON::REPEAT { content } => {
|
|
parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
|
|
}
|
|
RuleJSON::PREC { value, content } => {
|
|
parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
|
|
}
|
|
RuleJSON::PREC_LEFT { value, content } => {
|
|
parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
|
|
}
|
|
RuleJSON::PREC_RIGHT { value, content } => {
|
|
parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
|
|
}
|
|
RuleJSON::PREC_DYNAMIC { value, content } => {
|
|
parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
|
|
}
|
|
RuleJSON::RESERVED {
|
|
content,
|
|
context_name,
|
|
} => parse_rule(*content, is_token).map(|r| Rule::Reserved {
|
|
rule: Box::new(r),
|
|
context_name,
|
|
}),
|
|
RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
|
|
RuleJSON::IMMEDIATE_TOKEN { content } => {
|
|
parse_rule(*content, is_token).map(Rule::immediate_token)
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<PrecedenceValueJSON> for Precedence {
|
|
fn from(val: PrecedenceValueJSON) -> Self {
|
|
match val {
|
|
PrecedenceValueJSON::Integer(i) => Self::Integer(i),
|
|
PrecedenceValueJSON::Name(i) => Self::Name(i),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-rule grammar is parsed into two named variables, in document
    /// order, with `REPEAT1` mapped to `Rule::repeat`.
    #[test]
    fn test_parse_grammar() {
        let source = r#"{
            "name": "my_lang",
            "rules": {
                "file": {
                    "type": "REPEAT1",
                    "content": {
                        "type": "SYMBOL",
                        "name": "statement"
                    }
                },
                "statement": {
                    "type": "STRING",
                    "value": "foo"
                }
            }
        }"#;

        let grammar = parse_grammar(source).unwrap();

        let file_rule = Variable {
            name: String::from("file"),
            kind: VariableType::Named,
            rule: Rule::repeat(Rule::NamedSymbol(String::from("statement"))),
        };
        let statement_rule = Variable {
            name: String::from("statement"),
            kind: VariableType::Named,
            rule: Rule::String(String::from("foo")),
        };

        assert_eq!(grammar.name, "my_lang");
        assert_eq!(grammar.variables, vec![file_rule, statement_rule]);
    }
}
|