From a2d760e42694b9077e61bc0d5f48dfd5a4325baf Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 27 Oct 2020 15:46:09 -0700 Subject: [PATCH] Ensure nodes are aliased consistently within syntax error nodes Co-Authored-By: Rick Winfrey --- .../extract_default_aliases.rs | 293 ++++++++++++++++++ .../prepare_grammar/extract_simple_aliases.rs | 223 ------------- cli/src/generate/prepare_grammar/mod.rs | 8 +- cli/src/generate/render.rs | 18 +- cli/src/tests/query_test.rs | 24 ++ 5 files changed, 330 insertions(+), 236 deletions(-) create mode 100644 cli/src/generate/prepare_grammar/extract_default_aliases.rs delete mode 100644 cli/src/generate/prepare_grammar/extract_simple_aliases.rs diff --git a/cli/src/generate/prepare_grammar/extract_default_aliases.rs b/cli/src/generate/prepare_grammar/extract_default_aliases.rs new file mode 100644 index 00000000..3e08e3ad --- /dev/null +++ b/cli/src/generate/prepare_grammar/extract_default_aliases.rs @@ -0,0 +1,293 @@ +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; + +#[derive(Clone, Default)] +struct SymbolStatus { + aliases: Vec<(Alias, usize)>, + appears_unaliased: bool, +} + +// Update the grammar by finding symbols that always are aliased, and for each such symbol, +// promoting one of its aliases to a "default alias", which is applied globally instead +// of in a context-specific way. +// +// This has two benefits: +// * It reduces the overhead of storing production-specific alias info in the parse table. +// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation +// ensures that the children of an `ERROR` node have symbols that are consistent with the +// way that they would appear in a valid syntax tree. +pub(super) fn extract_default_aliases( + syntax_grammar: &mut SyntaxGrammar, + lexical_grammar: &LexicalGrammar, +) -> AliasMap { + let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; + let mut non_terminal_status_list = + vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = + vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + + // For each grammar symbol, find all of the aliases under which the symbol appears, + // and determine whether or not the symbol ever appears *unaliased*. + for variable in syntax_grammar.variables.iter() { + for production in variable.productions.iter() { + for step in production.steps.iter() { + let mut status = match step.symbol.kind { + SymbolType::External => &mut external_status_list[step.symbol.index], + SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index], + SymbolType::Terminal => &mut terminal_status_list[step.symbol.index], + SymbolType::End => panic!("Unexpected end token"), + }; + + // Default aliases don't work for inlined variables. + if syntax_grammar.variables_to_inline.contains(&step.symbol) { + continue; + } + + if let Some(alias) = &step.alias { + if let Some(count_for_alias) = status + .aliases + .iter_mut() + .find_map(|(a, count)| if a == alias { Some(count) } else { None }) + { + *count_for_alias += 1; + } else { + status.aliases.push((alias.clone(), 1)); + } + } else { + status.appears_unaliased = true; + } + } + } + } + + let symbols_with_statuses = (terminal_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::terminal(i), status))) + .chain( + non_terminal_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::non_terminal(i), status)), + ) + .chain( + external_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::external(i), status)), + ); + + // For each symbol that always appears aliased, find the alias the occurs most often, + // and designate that alias as the symbol's "default alias". Store all of these + // default aliases in a map that will be returned. + let mut result = AliasMap::new(); + for (symbol, status) in symbols_with_statuses { + if status.appears_unaliased { + status.aliases.clear(); + } else { + if let Some(default_entry) = status + .aliases + .iter() + .enumerate() + .max_by_key(|(i, (_, count))| (count, -(*i as i64))) + .map(|(_, entry)| entry.clone()) + { + status.aliases.clear(); + status.aliases.push(default_entry.clone()); + result.insert(symbol, default_entry.0); + } + } + } + + // Wherever a symbol is aliased as its default alias, remove the usage of the alias, + // because it will now be redundant. + let mut alias_positions_to_clear = Vec::new(); + for variable in syntax_grammar.variables.iter_mut() { + alias_positions_to_clear.clear(); + + for (i, production) in variable.productions.iter().enumerate() { + for (j, step) in production.steps.iter().enumerate() { + let status = match step.symbol.kind { + SymbolType::External => &mut external_status_list[step.symbol.index], + SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index], + SymbolType::Terminal => &mut terminal_status_list[step.symbol.index], + SymbolType::End => panic!("Unexpected end token"), + }; + + // If this step is aliased as the symbol's default alias, then remove that alias. + if step.alias.is_some() + && step.alias.as_ref() == status.aliases.get(0).map(|t| &t.0) + { + let mut other_productions_must_use_this_alias_at_this_index = false; + for (other_i, other_production) in variable.productions.iter().enumerate() { + if other_i != i + && other_production.steps.len() > j + && other_production.steps[j].alias == step.alias + && result.get(&other_production.steps[j].symbol) != step.alias.as_ref() + { + other_productions_must_use_this_alias_at_this_index = true; + break; + } + } + + if !other_productions_must_use_this_alias_at_this_index { + alias_positions_to_clear.push((i, j)); + } + } + } + } + + for (production_index, step_index) in &alias_positions_to_clear { + variable.productions[*production_index].steps[*step_index].alias = None; + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::generate::grammars::{ + LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType, + }; + use crate::generate::nfa::Nfa; + + #[test] + fn test_extract_simple_aliases() { + let mut syntax_grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true), + ], + }], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + // Token 1 is aliased within rule `v1` above, but not here. + ProductionStep::new(Symbol::terminal(1)), + // Token 2 is aliased differently here than in `v1`. The alias from + // `v1` should be promoted to the default alias, because `v1` appears + // first in the grammar. + ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true), + // Token 3 is also aliased differently here than in `v1`. In this case, + // this alias should be promoted to the default alias, because it is + // used a greater number of times (twice). + ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true), + ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true), + ], + }], + }, + ], + extra_symbols: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + supertype_symbols: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), + variables: vec![ + LexicalVariable { + name: "t0".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t1".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t2".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t3".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + ], + }; + + let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar); + assert_eq!(default_aliases.len(), 3); + + assert_eq!( + default_aliases.get(&Symbol::terminal(0)), + Some(&Alias { + value: "a1".to_string(), + is_named: true, + }) + ); + assert_eq!( + default_aliases.get(&Symbol::terminal(2)), + Some(&Alias { + value: "a3".to_string(), + is_named: true, + }) + ); + assert_eq!( + default_aliases.get(&Symbol::terminal(3)), + Some(&Alias { + value: "a6".to_string(), + is_named: true, + }) + ); + assert_eq!(default_aliases.get(&Symbol::terminal(1)), None); + + assert_eq!( + syntax_grammar.variables, + vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)), + ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true), + ], + },], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)), + ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true), + ProductionStep::new(Symbol::terminal(3)), + ProductionStep::new(Symbol::terminal(3)), + ], + },], + }, + ] + ); + } +} diff --git a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs deleted file mode 100644 index 6da009d5..00000000 --- a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs +++ /dev/null @@ -1,223 +0,0 @@ -use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; - -#[derive(Clone, Default)] -struct SymbolStatus { - alias: Option, - conflicting: bool, -} - -pub(super) fn extract_simple_aliases( - syntax_grammar: &mut SyntaxGrammar, - lexical_grammar: &LexicalGrammar, -) -> AliasMap { - // Determine which symbols in the grammars are *always* aliased to a single name. - let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; - let mut non_terminal_status_list = - vec![SymbolStatus::default(); syntax_grammar.variables.len()]; - let mut external_status_list = - vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; - for variable in syntax_grammar.variables.iter() { - for production in variable.productions.iter() { - for step in production.steps.iter() { - let mut status = match step.symbol { - Symbol { - kind: SymbolType::External, - index, - } => &mut external_status_list[index], - Symbol { - kind: SymbolType::NonTerminal, - index, - } => &mut non_terminal_status_list[index], - Symbol { - kind: SymbolType::Terminal, - index, - } => &mut terminal_status_list[index], - Symbol { - kind: SymbolType::End, - .. - } => panic!("Unexpected end token"), - }; - - if step.alias.is_none() { - status.alias = None; - status.conflicting = true; - } - - if !status.conflicting { - if status.alias.is_none() { - status.alias = step.alias.clone(); - } else if status.alias != step.alias { - status.alias = None; - status.conflicting = true; - } - } - } - } - } - - // Remove the aliases for those symbols. - for variable in syntax_grammar.variables.iter_mut() { - for production in variable.productions.iter_mut() { - for step in production.steps.iter_mut() { - let status = match step.symbol { - Symbol { - kind: SymbolType::External, - index, - } => &external_status_list[index], - Symbol { - kind: SymbolType::NonTerminal, - index, - } => &non_terminal_status_list[index], - Symbol { - kind: SymbolType::Terminal, - index, - } => &terminal_status_list[index], - Symbol { - kind: SymbolType::End, - .. - } => panic!("Unexpected end token"), - }; - - if status.alias.is_some() { - step.alias = None; - } - } - } - } - - // Populate a map of the symbols to their aliases. - let mut result = AliasMap::new(); - for (i, status) in terminal_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::terminal(i), alias); - } - } - for (i, status) in non_terminal_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::non_terminal(i), alias); - } - } - for (i, status) in external_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::external(i), alias); - } - } - result -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::generate::grammars::{ - LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType, - }; - use crate::generate::nfa::Nfa; - - #[test] - fn test_extract_simple_aliases() { - let mut syntax_grammar = SyntaxGrammar { - variables: vec![ - SyntaxVariable { - name: "v1".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), - ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), - ], - }], - }, - SyntaxVariable { - name: "v2".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - // Token 0 is always aliased as "a1". - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - // Token 1 is aliased above, but not here. - ProductionStep::new(Symbol::terminal(1)), - // Token 2 is aliased differently than above. - ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), - ], - }], - }, - ], - extra_symbols: Vec::new(), - expected_conflicts: Vec::new(), - variables_to_inline: Vec::new(), - supertype_symbols: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let lexical_grammar = LexicalGrammar { - nfa: Nfa::new(), - variables: vec![ - LexicalVariable { - name: "t1".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - LexicalVariable { - name: "t2".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - LexicalVariable { - name: "t3".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - ], - }; - - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); - assert_eq!(simple_aliases.len(), 1); - assert_eq!( - simple_aliases[&Symbol::terminal(0)], - Alias { - value: "a1".to_string(), - is_named: true, - } - ); - - assert_eq!( - syntax_grammar.variables, - vec![ - SyntaxVariable { - name: "v1".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - // 'Simple' alias removed - ProductionStep::new(Symbol::terminal(0)), - // Other aliases unchanged - ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), - ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), - ], - },], - }, - SyntaxVariable { - name: "v2".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(0)), - ProductionStep::new(Symbol::terminal(1)), - ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), - ], - },], - }, - ] - ); - } -} diff --git a/cli/src/generate/prepare_grammar/mod.rs b/cli/src/generate/prepare_grammar/mod.rs index 029483d3..8b094c56 100644 --- a/cli/src/generate/prepare_grammar/mod.rs +++ b/cli/src/generate/prepare_grammar/mod.rs @@ -1,6 +1,6 @@ mod expand_repeats; mod expand_tokens; -mod extract_simple_aliases; +mod extract_default_aliases; mod extract_tokens; mod flatten_grammar; mod intern_symbols; @@ -8,7 +8,7 @@ mod process_inlines; use self::expand_repeats::expand_repeats; pub(crate) use self::expand_tokens::expand_tokens; -use self::extract_simple_aliases::extract_simple_aliases; +use self::extract_default_aliases::extract_default_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; use self::intern_symbols::intern_symbols; @@ -52,7 +52,7 @@ pub(crate) fn prepare_grammar( let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; let lexical_grammar = expand_tokens(lexical_grammar)?; - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); + let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar); let inlines = process_inlines(&syntax_grammar); - Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases)) + Ok((syntax_grammar, lexical_grammar, inlines, default_aliases)) } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index f7f788d0..e1e75ee1 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -65,7 +65,7 @@ struct Generator { keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, - simple_aliases: AliasMap, + default_aliases: AliasMap, symbol_order: HashMap, symbol_ids: HashMap, alias_ids: HashMap, @@ -198,10 +198,10 @@ impl Generator { // public-facing symbol. If one of the symbols is not aliased, choose that one // to be the public-facing symbol. Otherwise, pick the symbol with the lowest // numeric value. - if let Some(alias) = self.simple_aliases.get(symbol) { + if let Some(alias) = self.default_aliases.get(symbol) { let kind = alias.kind(); for other_symbol in &self.parse_table.symbols { - if let Some(other_alias) = self.simple_aliases.get(other_symbol) { + if let Some(other_alias) = self.default_aliases.get(other_symbol) { if other_symbol < mapping && other_alias == alias { mapping = other_symbol; } @@ -361,7 +361,7 @@ impl Generator { indent!(self); for symbol in self.parse_table.symbols.iter() { let name = self.sanitize_string( - self.simple_aliases + self.default_aliases .get(symbol) .map(|alias| alias.value.as_str()) .unwrap_or(self.metadata_for_symbol(*symbol).0), @@ -444,7 +444,7 @@ impl Generator { for symbol in &self.parse_table.symbols { add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); indent!(self); - if let Some(Alias { is_named, .. }) = self.simple_aliases.get(symbol) { + if let Some(Alias { is_named, .. }) = self.default_aliases.get(symbol) { add_line!(self, ".visible = true,"); add_line!(self, ".named = {},", is_named); } else { @@ -525,7 +525,7 @@ impl Generator { for step in &production.steps { if let Some(alias) = &step.alias { if step.symbol.is_non_terminal() - && !self.simple_aliases.contains_key(&step.symbol) + && Some(alias) != self.default_aliases.get(&step.symbol) { if self.symbol_ids.contains_key(&step.symbol) { let alias_ids = @@ -1545,7 +1545,7 @@ impl Generator { /// for keyword capture, if any. /// * `syntax_grammar` - The syntax grammar extracted from the language's grammar /// * `lexical_grammar` - The lexical grammar extracted from the language's grammar -/// * `simple_aliases` - A map describing the global rename rules that should apply. +/// * `default_aliases` - A map describing the global rename rules that should apply. /// the keys are symbols that are *always* aliased in the same way, and the values /// are the aliases that are applied to those symbols. /// * `next_abi` - A boolean indicating whether to opt into the new, unstable parse @@ -1558,7 +1558,7 @@ pub(crate) fn render_c_code( keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, - simple_aliases: AliasMap, + default_aliases: AliasMap, next_abi: bool, ) -> String { Generator { @@ -1572,7 +1572,7 @@ pub(crate) fn render_c_code( keyword_capture_token, syntax_grammar, lexical_grammar, - simple_aliases, + default_aliases, symbol_ids: HashMap::new(), symbol_order: HashMap::new(), alias_ids: HashMap::new(), diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 1f7ddaff..067bb6f9 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -367,6 +367,30 @@ fn test_query_errors_on_impossible_patterns() { }); } +#[test] +fn test_query_verifies_possible_patterns_with_aliased_parent_nodes() { + allocations::record(|| { + let ruby = get_language("ruby"); + + Query::new(ruby, "(destructured_parameter (identifier))").unwrap(); + + assert_eq!( + Query::new(ruby, "(destructured_parameter (string))",), + Err(QueryError { + kind: QueryErrorKind::Structure, + row: 0, + offset: 24, + column: 24, + message: [ + "(destructured_parameter (string))", // + " ^", + ] + .join("\n") + }) + ); + }); +} + #[test] fn test_query_matches_with_simple_pattern() { allocations::record(|| {