From 0688a5edd387e01ca7c83f9bbf2fb732852d2f5d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Dec 2018 22:11:52 -0800 Subject: [PATCH] Implement extract_tokens --- src/build_tables/mod.rs | 2 +- src/grammars.rs | 83 +++-- src/parse_grammar.rs | 12 +- src/prepare_grammar/expand_repeats.rs | 36 +- src/prepare_grammar/extract_tokens.rs | 492 +++++++++++++++++++++++++- src/prepare_grammar/intern_symbols.rs | 38 +- src/prepare_grammar/mod.rs | 8 +- src/render/mod.rs | 2 +- src/rules.rs | 52 +-- src/tables.rs | 24 +- 10 files changed, 621 insertions(+), 128 deletions(-) diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index c5dd5b54..c3518428 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -25,7 +25,7 @@ struct ParseTableBuilder<'a> { parse_table: ParseTable, } -pub fn build_tables( +pub(crate) fn build_tables( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, simple_aliases: &AliasMap diff --git a/src/grammars.rs b/src/grammars.rs index 6f5b772e..62910637 100644 --- a/src/grammars.rs +++ b/src/grammars.rs @@ -1,7 +1,7 @@ use crate::rules::{Associativity, Alias, Rule, Symbol}; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum VariableType { +pub(crate) enum VariableType { Hidden, Auxiliary, Anonymous, @@ -11,16 +11,16 @@ pub enum VariableType { // Input grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct InputVariable { +pub(crate) struct Variable { pub name: String, pub kind: VariableType, pub rule: Rule, } -#[derive(PartialEq, Eq)] -pub struct InputGrammar { +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct InputGrammar { pub name: String, - pub variables: Vec, + pub variables: Vec, pub extra_tokens: Vec, pub expected_conflicts: Vec>, pub external_tokens: Vec, @@ -30,60 +30,53 @@ pub struct InputGrammar { // Extracted lexical grammar -#[derive(PartialEq, Eq)] -pub struct LexicalVariable { - name: String, - kind: VariableType, - rule: Rule, - is_string: bool, -} - -pub struct LexicalGrammar { - 
variables: Vec, - separators: Vec, +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct LexicalGrammar { + pub variables: Vec, + pub separators: Vec, } // Extracted syntax grammar #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ProductionStep { - symbol: Symbol, - precedence: i32, - associativity: Option, - alias: Option, - is_excluded: bool, +pub(crate) struct ProductionStep { + pub symbol: Symbol, + pub precedence: i32, + pub associativity: Option, + pub alias: Option, + pub is_excluded: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct Production { - steps: Vec, - dynamic_precedence: i32, +pub(crate) struct Production { + pub steps: Vec, + pub dynamic_precedence: i32, } -#[derive(Clone, PartialEq, Eq)] -pub struct SyntaxVariable { - name: String, - kind: VariableType, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct SyntaxVariable { + pub name: String, + pub kind: VariableType, } -#[derive(Clone, PartialEq, Eq)] -pub struct ExternalToken { - name: String, - kind: VariableType, - corresponding_internal_token: Symbol, +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct ExternalToken { + pub name: String, + pub kind: VariableType, + pub corresponding_internal_token: Option, } -pub struct SyntaxGrammar { - variables: Vec, - extra_tokens: Vec, - expected_conflicts: Vec>, - external_tokens: Vec, - variables_to_inline: Vec, - word_token: Symbol, +#[derive(Debug)] +pub(crate) struct SyntaxGrammar { + pub variables: Vec, + pub extra_tokens: Vec, + pub expected_conflicts: Vec>, + pub external_tokens: Vec, + pub variables_to_inline: Vec, + pub word_token: Symbol, } -#[cfg(test)] -impl InputVariable { +impl Variable { pub fn named(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Named, rule } } @@ -95,4 +88,8 @@ impl InputVariable { pub fn hidden(name: &str, rule: Rule) -> Self { Self { name: name.to_string(), kind: VariableType::Hidden, rule } } + + pub fn anonymous(name: &str, rule: Rule) -> Self { + Self { 
name: name.to_string(), kind: VariableType::Anonymous, rule } + } } diff --git a/src/parse_grammar.rs b/src/parse_grammar.rs index 4c21e5ba..0f1f5008 100644 --- a/src/parse_grammar.rs +++ b/src/parse_grammar.rs @@ -1,13 +1,13 @@ use serde_json::{Map, Value}; use crate::error::Result; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use crate::rules::Rule; use std::collections::HashMap; #[derive(Deserialize)] #[serde(tag = "type")] #[allow(non_camel_case_types)] -pub enum RuleJSON { +enum RuleJSON { BLANK, STRING { value: String, @@ -58,12 +58,12 @@ struct GrammarJSON { word: Option, } -pub fn parse_grammar(input: &str) -> Result { +pub(crate) fn parse_grammar(input: &str) -> Result { let grammar_json: GrammarJSON = serde_json::from_str(&input)?; let mut variables = Vec::with_capacity(grammar_json.rules.len()); for (name, value) in grammar_json.rules { - variables.push(InputVariable { + variables.push(Variable { name: name.to_owned(), kind: VariableType::Named, rule: parse_rule(serde_json::from_value(value)?), @@ -138,12 +138,12 @@ mod tests { assert_eq!(grammar.name, "my_lang"); assert_eq!(grammar.variables, vec![ - InputVariable { + Variable { name: "file".to_string(), kind: VariableType::Named, rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())) }, - InputVariable { + Variable { name: "statement".to_string(), kind: VariableType::Named, rule: Rule::String("foo".to_string()) diff --git a/src/prepare_grammar/expand_repeats.rs b/src/prepare_grammar/expand_repeats.rs index 69db150c..dcb8f916 100644 --- a/src/prepare_grammar/expand_repeats.rs +++ b/src/prepare_grammar/expand_repeats.rs @@ -1,5 +1,5 @@ use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputVariable, VariableType}; +use crate::grammars::{Variable, VariableType}; use std::collections::HashMap; use std::mem; use std::rc::Rc; @@ -9,12 +9,12 @@ struct Expander { variable_name: String, repeat_count_in_variable: 
usize, preceding_symbol_count: usize, - auxiliary_variables: Vec, + auxiliary_variables: Vec, existing_repeats: HashMap } impl Expander { - fn expand_variable(&mut self, variable: &mut InputVariable) { + fn expand_variable(&mut self, variable: &mut Variable) { self.variable_name.clear(); self.variable_name.push_str(&variable.name); self.repeat_count_in_variable = 0; @@ -48,7 +48,7 @@ impl Expander { let repeat_symbol = Symbol::non_terminal(self.preceding_symbol_count + self.auxiliary_variables.len()); let rc_symbol = Rc::new(Rule::Symbol(repeat_symbol)); self.existing_repeats.insert(inner_rule.clone(), repeat_symbol); - self.auxiliary_variables.push(InputVariable { + self.auxiliary_variables.push(Variable { name: rule_name, kind: VariableType::Auxiliary, rule: Rule::Choice { @@ -100,7 +100,7 @@ mod tests { fn test_basic_repeat_expansion() { // Repeats nested inside of sequences and choices are expanded. let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::repeat(Rule::terminal(11)), @@ -111,7 +111,7 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::choice(vec![ Rule::non_terminal(1), @@ -119,14 +119,14 @@ mod tests { ]), Rule::terminal(13), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(11), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -140,11 +140,11 @@ mod tests { fn test_repeat_deduplication() { // Terminal 4 appears inside of a repeat in three different places. 
let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::repeat(Rule::terminal(4)) ]), Rule::seq(vec![ Rule::terminal(2), Rule::repeat(Rule::terminal(4)) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::repeat(Rule::terminal(4)), ])), @@ -152,15 +152,15 @@ mod tests { // Only one auxiliary rule is created for repeating terminal 4. assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::choice(vec![ + Variable::named("rule0", Rule::choice(vec![ Rule::seq(vec![ Rule::terminal(1), Rule::non_terminal(2) ]), Rule::seq(vec![ Rule::terminal(2), Rule::non_terminal(2) ]), ])), - InputVariable::named("rule1", Rule::seq(vec![ + Variable::named("rule1", Rule::seq(vec![ Rule::terminal(3), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), Rule::non_terminal(2), @@ -173,7 +173,7 @@ mod tests { #[test] fn test_expansion_of_nested_repeats() { let grammar = expand_repeats(build_grammar(vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::repeat(Rule::seq(vec![ Rule::terminal(11), @@ -183,18 +183,18 @@ mod tests { ])); assert_eq!(grammar.variables, vec![ - InputVariable::named("rule0", Rule::seq(vec![ + Variable::named("rule0", Rule::seq(vec![ Rule::terminal(10), Rule::non_terminal(2), ])), - InputVariable::auxiliary("rule0_repeat1", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat1", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(1), Rule::non_terminal(1), ]), Rule::terminal(12), ])), - InputVariable::auxiliary("rule0_repeat2", Rule::choice(vec![ + Variable::auxiliary("rule0_repeat2", Rule::choice(vec![ Rule::seq(vec![ Rule::non_terminal(2), 
Rule::non_terminal(2), @@ -207,7 +207,7 @@ mod tests { ]); } - fn build_grammar(variables: Vec) -> ExtractedGrammar { + fn build_grammar(variables: Vec) -> ExtractedGrammar { ExtractedGrammar { variables, extra_tokens: Vec::new(), diff --git a/src/prepare_grammar/extract_tokens.rs b/src/prepare_grammar/extract_tokens.rs index 660d3819..ee90b3c8 100644 --- a/src/prepare_grammar/extract_tokens.rs +++ b/src/prepare_grammar/extract_tokens.rs @@ -1,7 +1,491 @@ -use crate::error::Result; -use crate::grammars::LexicalGrammar; +use std::collections::HashMap; +use std::rc::Rc; +use std::mem; +use crate::error::{Error, Result}; +use crate::rules::{Rule, MetadataParams, Symbol, SymbolType}; +use crate::grammars::{Variable, VariableType, LexicalGrammar, ExternalToken}; use super::{InternedGrammar, ExtractedGrammar}; -pub(super) fn extract_tokens(grammar: InternedGrammar) -> Result<(ExtractedGrammar, LexicalGrammar)> { - unimplemented!(); +pub(super) fn extract_tokens( + mut grammar: InternedGrammar +) -> Result<(ExtractedGrammar, LexicalGrammar)> { + let mut extractor = TokenExtractor { + current_variable_name: String::new(), + current_variable_token_count: 0, + extracted_variables: Vec::new(), + extracted_usage_counts: Vec::new(), + }; + + for mut variable in grammar.variables.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + for mut variable in grammar.external_tokens.iter_mut() { + extractor.extract_tokens_in_variable(&mut variable); + } + + let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len()); + for variable in extractor.extracted_variables { + lexical_variables.push(Variable { + name: variable.name, + kind: variable.kind, + rule: variable.rule, + }); + } + + // If a variable's entire rule was extracted as a token and that token didn't + // appear within any other rule, then remove that variable from the syntax + // grammar, giving its name to the token in the lexical grammar. 
Any symbols + // that pointed to that variable will need to be updated to point to the + // variable in the lexical grammar. Symbols that pointed to later variables + // will need to have their indices decremented. + let mut variables = Vec::new(); + let mut symbol_replacer = SymbolReplacer { replacements: HashMap::new() }; + for (i, variable) in grammar.variables.into_iter().enumerate() { + if let Rule::Symbol(Symbol { kind: SymbolType::Terminal, index }) = variable.rule { + if i > 0 && extractor.extracted_usage_counts[index] == 1 { + let mut lexical_variable = &mut lexical_variables[index]; + lexical_variable.kind = variable.kind; + lexical_variable.name = variable.name; + symbol_replacer.replacements.insert(i, index); + continue; + } + } + variables.push(variable); + } + + for variable in variables.iter_mut() { + variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule); + } + + let expected_conflicts = grammar.expected_conflicts + .into_iter() + .map(|conflict| + conflict + .iter() + .map(|symbol| symbol_replacer.replace_symbol(*symbol)) + .collect() + ).collect(); + + let variables_to_inline = grammar.variables_to_inline + .into_iter() + .map(|symbol| symbol_replacer.replace_symbol(symbol)) + .collect(); + + let mut separators = Vec::new(); + let mut extra_tokens = Vec::new(); + for rule in grammar.extra_tokens { + if let Rule::Symbol(symbol) = rule { + let new_symbol = symbol_replacer.replace_symbol(symbol); + if new_symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-token symbol '{}' cannot be used as an extra token", + &variables[new_symbol.index].name + ))); + } else { + extra_tokens.push(new_symbol); + } + } else { + if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) { + extra_tokens.push(Symbol::terminal(index)); + } else { + separators.push(rule); + } + } + } + + let mut external_tokens = Vec::new(); + for external_token in grammar.external_tokens { + let rule = 
symbol_replacer.replace_symbols_in_rule(&external_token.rule); + if let Rule::Symbol(symbol) = rule { + if symbol.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Rule '{}' cannot be used as both an external token and a non-terminal rule", + &variables[symbol.index].name, + ))); + } + + if symbol.is_external() { + external_tokens.push(ExternalToken { + name: external_token.name, + kind: external_token.kind, + corresponding_internal_token: None, + }) + } else { + external_tokens.push(ExternalToken { + name: lexical_variables[symbol.index].name.clone(), + kind: external_token.kind, + corresponding_internal_token: Some(symbol), + }) + } + } else { + return Err(Error::GrammarError(format!( + "Non-symbol rules cannot be used as external tokens" + ))); + } + } + + let mut word_token = None; + if let Some(token) = grammar.word_token { + let token = symbol_replacer.replace_symbol(token); + if token.is_non_terminal() { + return Err(Error::GrammarError(format!( + "Non-terminal symbol '{}' cannot be used as the word token", + &variables[token.index].name + ))); + } + word_token = Some(token); + } + + Ok(( + ExtractedGrammar { + variables, + expected_conflicts, + extra_tokens, + variables_to_inline, + external_tokens, + word_token, + }, + LexicalGrammar { + variables: lexical_variables, + separators, + } + )) +} + +struct TokenExtractor { + current_variable_name: String, + current_variable_token_count: usize, + extracted_variables: Vec, + extracted_usage_counts: Vec, +} + +struct SymbolReplacer { + replacements: HashMap +} + +impl TokenExtractor { + fn extract_tokens_in_variable(&mut self, variable: &mut Variable) { + self.current_variable_name.clear(); + self.current_variable_name.push_str(&variable.name); + self.current_variable_token_count = 0; + let mut rule = Rule::Blank; + mem::swap(&mut rule, &mut variable.rule); + variable.rule = self.extract_tokens_in_rule(&rule); + } + + fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule { + match input { + 
Rule::String(name) => self.extract_token(input, Some(name)).into(), + Rule::Pattern(..) => self.extract_token(input, None).into(), + Rule::Metadata { params, rule } => { + if params.is_token { + let mut params = params.clone(); + params.is_token = false; + + let mut string_value = None; + if let Rule::String(value) = rule.as_ref() { + string_value = Some(value); + } + + let rule_to_extract = if params == MetadataParams::default() { + rule.as_ref() + } else { + input + }; + + self.extract_token(rule_to_extract, string_value).into() + } else { + Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.extract_tokens_in_rule((&rule).clone())) + } + } + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.extract_tokens_in_rule(content)) + ), + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.extract_tokens_in_rule(left)), + right: Rc::new(self.extract_tokens_in_rule(right)), + }, + Rule::Choice { elements } => Rule::Choice { + elements: elements.iter().map(|e| self.extract_tokens_in_rule(e)).collect() + }, + _ => input.clone() + } + } + + fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol { + for (i, variable) in self.extracted_variables.iter_mut().enumerate() { + if variable.rule == *rule { + self.extracted_usage_counts[i] += 1; + return Symbol::terminal(i) + } + } + + let index = self.extracted_variables.len(); + let variable = if let Some(string_value) = string_value { + Variable::anonymous(string_value, rule.clone()) + } else { + self.current_variable_token_count += 1; + Variable::auxiliary( + &format!( + "{}_token{}", + &self.current_variable_name, + self.current_variable_token_count + ), + rule.clone() + ) + }; + + self.extracted_variables.push(variable); + self.extracted_usage_counts.push(1); + Symbol::terminal(index) + } +} + +impl SymbolReplacer { + fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule { + match rule { + Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(), + Rule::Choice { 
elements } => Rule::Choice { + elements: elements.iter().map(|e| self.replace_symbols_in_rule(e)).collect() + }, + Rule::Seq { left, right } => Rule::Seq { + left: Rc::new(self.replace_symbols_in_rule(left)), + right: Rc::new(self.replace_symbols_in_rule(right)), + }, + Rule::Repeat(content) => Rule::Repeat( + Rc::new(self.replace_symbols_in_rule(content)) + ), + Rule::Metadata { rule, params } => Rule::Metadata { + params: params.clone(), + rule: Rc::new(self.replace_symbols_in_rule(rule)), + }, + _ => rule.clone() + } + } + + fn replace_symbol(&self, symbol: Symbol) -> Symbol { + if !symbol.is_non_terminal() { + return symbol + } + + if let Some(replacement) = self.replacements.get(&symbol.index) { + return Symbol::terminal(*replacement); + } + + let mut adjusted_index = symbol.index; + for (replaced_index, _) in self.replacements.iter() { + if *replaced_index < symbol.index { + adjusted_index -= 1; + } + } + + return Symbol::non_terminal(adjusted_index); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_extraction() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + Rule::string("a"), + Rule::pattern("b"), + Rule::choice(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + Rule::token(Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))) + ]) + ]))), + Variable::named("rule_1", Rule::pattern("e")), + Variable::named("rule_2", Rule::pattern("b")), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::Blank, + ])), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::repeat(Rule::seq(vec![ + // The string "a" was replaced by a symbol referencing the lexical grammar + Rule::terminal(0), + + // The pattern "b" was replaced by a symbol referencing the lexical grammar + Rule::terminal(1), + Rule::choice(vec![ + // The symbol referencing `rule_1` was replaced by a 
 symbol referencing + // the lexical grammar. + Rule::terminal(3), + + // The symbol referencing `rule_2` had its index decremented because + // `rule_1` was moved to the lexical grammar. + Rule::non_terminal(1), + + // The rule wrapped in `token` was replaced by a symbol referencing + // the lexical grammar. + Rule::terminal(2), + ]) + ]))), + + // The pattern "e" was only used in one place: as the definition of `rule_1`, + // so that rule was moved to the lexical grammar. The pattern "b" appeared in + // two places, so it was not moved into the lexical grammar. + Variable::named("rule_2", Rule::terminal(1)), + Variable::named("rule_3", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::Blank, + ])), + ]); + + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("a", Rule::string("a")), + Variable::auxiliary("rule_0_token1", Rule::pattern("b")), + Variable::auxiliary("rule_0_token2", Rule::repeat(Rule::choice(vec![ + Rule::string("c"), + Rule::string("d"), + ]))), + Variable::named("rule_1", Rule::pattern("e")), + ]); + } + + #[test] + fn test_start_rule_is_token() { + let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![ + Variable::named("rule_0", Rule::string("hello")), + ])).unwrap(); + + assert_eq!(syntax_grammar.variables, vec![ + Variable::named("rule_0", Rule::terminal(0)), + ]); + assert_eq!(lexical_grammar.variables, vec![ + Variable::anonymous("hello", Rule::string("hello")), + ]) + } + + #[test] + fn test_extracting_extra_tokens() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::string("x")), + Variable::named("comment", Rule::pattern("//.*")), + ]); + grammar.extra_tokens = vec![ + Rule::string(" "), + Rule::non_terminal(1), + ]; + + let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap(); + assert_eq!(syntax_grammar.extra_tokens, vec![ + Symbol::terminal(1), + ]); + assert_eq!(lexical_grammar.separators, vec![ + Rule::string(" "), + ]); + } + + #[test] + fn 
test_extract_externals() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::external(0), + Rule::string("a"), + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::string("b")), + Variable::named("rule_2", Rule::string("c")), + ]); + grammar.external_tokens = vec![ + Variable::named("external_0", Rule::external(0)), + Variable::anonymous("a", Rule::string("a")), + Variable::named("rule_2", Rule::non_terminal(2)), + ]; + + let (syntax_grammar, _) = extract_tokens(grammar).unwrap(); + + assert_eq!(syntax_grammar.external_tokens, vec![ + ExternalToken { + name: "external_0".to_string(), + kind: VariableType::Named, + corresponding_internal_token: None, + }, + ExternalToken { + name: "a".to_string(), + kind: VariableType::Anonymous, + corresponding_internal_token: Some(Symbol::terminal(0)), + }, + ExternalToken { + name: "rule_2".to_string(), + kind: VariableType::Named, + corresponding_internal_token: Some(Symbol::terminal(2)), + }, + ]); + } + + #[test] + fn test_error_on_non_terminal_symbol_extras() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::non_terminal(1)), + Variable::named("rule_1", Rule::non_terminal(2)), + Variable::named("rule_2", Rule::string("x")), + ]); + grammar.extra_tokens = vec![ + Rule::non_terminal(1), + ]; + + match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Non-token symbol 'rule_1' cannot be used as an extra token"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + #[test] + fn test_error_on_external_with_same_name_as_non_terminal() { + let mut grammar = build_grammar(vec![ + Variable::named("rule_0", Rule::seq(vec![ + Rule::non_terminal(1), + Rule::non_terminal(2), + ])), + Variable::named("rule_1", Rule::seq(vec![ + Rule::non_terminal(2), + Rule::non_terminal(2), + ])), + Variable::named("rule_2", Rule::string("a")), + ]); + grammar.external_tokens = vec![ + 
Variable::named("rule_1", Rule::non_terminal(1)), + ]; + + match extract_tokens(grammar) { + Err(Error::GrammarError(s)) => { + assert_eq!(s, "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule"); + }, + _ => { + panic!("Expected an error but got no error"); + } + } + } + + fn build_grammar(variables: Vec) -> InternedGrammar { + InternedGrammar { + variables, + extra_tokens: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + } + } } diff --git a/src/prepare_grammar/intern_symbols.rs b/src/prepare_grammar/intern_symbols.rs index 00a5c330..e4cf7ff1 100644 --- a/src/prepare_grammar/intern_symbols.rs +++ b/src/prepare_grammar/intern_symbols.rs @@ -1,6 +1,6 @@ use crate::error::{Error, Result}; use crate::rules::{Rule, Symbol}; -use crate::grammars::{InputGrammar, InputVariable, VariableType}; +use crate::grammars::{InputGrammar, Variable, VariableType}; use std::rc::Rc; use super::InternedGrammar; @@ -13,7 +13,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result let mut variables = Vec::with_capacity(grammar.variables.len()); for variable in grammar.variables.iter() { - variables.push(InputVariable { + variables.push(Variable { name: variable.name.clone(), kind: variable_type_for_name(&variable.name), rule: interner.intern_rule(&variable.rule)?, @@ -28,7 +28,7 @@ pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result } else { (String::new(), VariableType::Anonymous) }; - external_tokens.push(InputVariable { name, kind, rule }); + external_tokens.push(Variable { name, kind, rule }); } let mut extra_tokens = Vec::with_capacity(grammar.extra_tokens.len()); @@ -154,21 +154,21 @@ mod tests { #[test] fn test_basic_repeat_expansion() { let grammar = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::named("y"), Rule::named("_z"), ])), - InputVariable::named("y", 
Rule::named("_z")), - InputVariable::named("_z", Rule::string("a")), + Variable::named("y", Rule::named("_z")), + Variable::named("_z", Rule::string("a")), ])).unwrap(); assert_eq!(grammar.variables, vec![ - InputVariable::named("x", Rule::choice(vec![ + Variable::named("x", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), ])), - InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::hidden("_z", Rule::string("a")), + Variable::named("y", Rule::non_terminal(2)), + Variable::hidden("_z", Rule::string("a")), ]); } @@ -177,13 +177,13 @@ mod tests { // Variable `y` is both an internal and an external token. // Variable `z` is just an external token. let mut input_grammar = build_grammar(vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::named("x"), Rule::named("y"), Rule::named("z"), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); input_grammar.external_tokens.extend(vec![ Rule::named("y"), @@ -195,26 +195,26 @@ mod tests { // Variable `y` is referred to by its internal index. // Variable `z` is referred to by its external index. assert_eq!(grammar.variables, vec![ - InputVariable::named("w", Rule::choice(vec![ + Variable::named("w", Rule::choice(vec![ Rule::non_terminal(1), Rule::non_terminal(2), Rule::external(1), ])), - InputVariable::named("x", Rule::string("a")), - InputVariable::named("y", Rule::string("b")), + Variable::named("x", Rule::string("a")), + Variable::named("y", Rule::string("b")), ]); // The external token for `y` refers back to its internal index. 
assert_eq!(grammar.external_tokens, vec![ - InputVariable::named("y", Rule::non_terminal(2)), - InputVariable::named("z", Rule::external(1)), + Variable::named("y", Rule::non_terminal(2)), + Variable::named("z", Rule::external(1)), ]); } #[test] fn test_grammar_with_undefined_symbols() { let result = intern_symbols(&build_grammar(vec![ - InputVariable::named("x", Rule::named("y")), + Variable::named("x", Rule::named("y")), ])); match result { @@ -223,7 +223,7 @@ mod tests { } } - fn build_grammar(variables: Vec) -> InputGrammar { + fn build_grammar(variables: Vec) -> InputGrammar { InputGrammar { variables, name: "the_language".to_string(), diff --git a/src/prepare_grammar/mod.rs b/src/prepare_grammar/mod.rs index 0788edca..b860807a 100644 --- a/src/prepare_grammar/mod.rs +++ b/src/prepare_grammar/mod.rs @@ -6,7 +6,7 @@ mod normalize_rules; mod extract_simple_aliases; use crate::rules::{AliasMap, Rule, Symbol}; -use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, InputVariable, ExternalToken}; +use crate::grammars::{InputGrammar, SyntaxGrammar, LexicalGrammar, Variable, ExternalToken}; use crate::error::Result; use self::intern_symbols::intern_symbols; use self::extract_tokens::extract_tokens; @@ -16,7 +16,7 @@ use self::normalize_rules::normalize_rules; use self::extract_simple_aliases::extract_simple_aliases; pub(self) struct IntermediateGrammar { - variables: Vec, + variables: Vec, extra_tokens: Vec, expected_conflicts: Vec>, external_tokens: Vec, @@ -24,10 +24,10 @@ pub(self) struct IntermediateGrammar { word_token: Option, } -pub(self) type InternedGrammar = IntermediateGrammar; +pub(self) type InternedGrammar = IntermediateGrammar; pub(self) type ExtractedGrammar = IntermediateGrammar; -pub fn prepare_grammar( +pub(crate) fn prepare_grammar( input_grammar: &InputGrammar ) -> Result<(SyntaxGrammar, LexicalGrammar, AliasMap)> { let interned_grammar = intern_symbols(input_grammar)?; diff --git a/src/render/mod.rs b/src/render/mod.rs index 
85ce1f32..5bd11a34 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,7 +2,7 @@ use crate::rules::{Symbol, AliasMap}; use crate::grammars::{SyntaxGrammar, LexicalGrammar}; use crate::tables::{ParseTable, LexTable}; -pub fn render_c_code( +pub(crate) fn render_c_code( name: &str, parse_table: ParseTable, main_lex_table: LexTable, diff --git a/src/rules.rs b/src/rules.rs index 3cccca0d..5c3b65fd 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -2,47 +2,47 @@ use std::rc::Rc; use std::collections::HashMap; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum SymbolType { +pub(crate) enum SymbolType { External, Terminal, NonTerminal, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Associativity { +pub(crate) enum Associativity { Left, Right } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub struct Alias { - value: String, - is_named: bool, +pub(crate) struct Alias { + pub value: String, + pub is_named: bool, } -pub type AliasMap = HashMap; +pub(crate) type AliasMap = HashMap; #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] -pub struct MetadataParams { - precedence: Option, - dynamic_precedence: i32, - associativity: Option, - is_token: bool, - is_string: bool, - is_active: bool, - is_main_token: bool, - is_excluded: bool, - alias: Option, +pub(crate) struct MetadataParams { + pub precedence: Option, + pub dynamic_precedence: i32, + pub associativity: Option, + pub is_token: bool, + pub is_string: bool, + pub is_active: bool, + pub is_main_token: bool, + pub is_excluded: bool, + pub alias: Option, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct Symbol { - kind: SymbolType, - index: usize, +pub(crate) struct Symbol { + pub kind: SymbolType, + pub index: usize, } #[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub enum Rule { +pub(crate) enum Rule { Blank, CharacterSet(Vec), String(String), @@ -153,9 +153,21 @@ impl Rule { pub fn string(value: &'static str) -> Self { Rule::String(value.to_string()) } + + pub fn 
pattern(value: &'static str) -> Self { + Rule::Pattern(value.to_string()) + } } impl Symbol { + pub fn is_non_terminal(&self) -> bool { + return self.kind == SymbolType::NonTerminal + } + + pub fn is_external(&self) -> bool { + return self.kind == SymbolType::External + } + pub fn non_terminal(index: usize) -> Self { Symbol { kind: SymbolType::NonTerminal, index } } diff --git a/src/tables.rs b/src/tables.rs index 10b1e41d..de66253c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -2,12 +2,12 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; -pub type AliasSequenceId = usize; -pub type ParseStateId = usize; -pub type LexStateId = usize; +pub(crate) type AliasSequenceId = usize; +pub(crate) type ParseStateId = usize; +pub(crate) type LexStateId = usize; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseActionType { +pub(crate) enum ParseActionType { Error, Shift, Reduce, @@ -16,7 +16,7 @@ pub enum ParseActionType { } #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ParseAction { +pub(crate) enum ParseAction { Accept, Error, Shift(ParseStateId), @@ -34,44 +34,44 @@ pub enum ParseAction { } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseTableEntry { +pub(crate) struct ParseTableEntry { actions: Vec, reusable: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseState { +pub(crate) struct ParseState { terminal_entries: HashMap, nonterminal_entries: HashMap } #[derive(Debug, PartialEq, Eq)] -pub struct ParseTable { +pub(crate) struct ParseTable { states: Vec, alias_sequences: Vec>, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AdvanceAction { +pub(crate) struct AdvanceAction { state: LexStateId, precedence: Range, in_main_token: bool, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct AcceptTokenAction { +pub(crate) struct AcceptTokenAction { symbol: Symbol, precedence: i32, implicit_precedence: i32, } #[derive(Clone, Debug, PartialEq, Eq)] -pub struct LexState { 
+pub(crate) struct LexState { advance_actions: HashMap, accept_action: Option, } #[derive(Debug, PartialEq, Eq)] -pub struct LexTable { +pub(crate) struct LexTable { states: Vec, }