From 5258ee2e6ad3f202e43f98a093c82da1143a27fa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 23 Dec 2018 10:16:03 -0800 Subject: [PATCH] Implement more C code generation --- src/build_tables/item.rs | 60 +- src/build_tables/item_set_builder.rs | 27 +- src/build_tables/lex_table_builder.rs | 24 + src/build_tables/mod.rs | 61 ++- src/render/mod.rs | 761 ++++++++++++++++++++++++-- src/tables.rs | 12 +- 6 files changed, 840 insertions(+), 105 deletions(-) create mode 100644 src/build_tables/lex_table_builder.rs diff --git a/src/build_tables/item.rs b/src/build_tables/item.rs index 49ab4f27..28723d24 100644 --- a/src/build_tables/item.rs +++ b/src/build_tables/item.rs @@ -2,7 +2,7 @@ use crate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar} use crate::rules::Associativity; use crate::rules::{Symbol, SymbolType}; use smallbitvec::SmallBitVec; -use std::collections::{HashMap, BTreeMap}; +use std::collections::BTreeMap; use std::fmt; use std::hash::{Hash, Hasher}; use std::u32; @@ -178,7 +178,11 @@ impl<'a> ParseItem<'a> { } pub fn prev_step(&self) -> Option<&'a ProductionStep> { - self.production.steps.get(self.step_index as usize - 1) + if self.step_index > 0 { + Some(&self.production.steps[self.step_index as usize - 1]) + } else { + None + } } pub fn is_done(&self) -> bool { @@ -355,43 +359,49 @@ impl<'a> PartialEq for ParseItem<'a> { } } -impl<'a> PartialOrd for ParseItem<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - if let Some(o) = self.variable_index.partial_cmp(&other.variable_index) { - return Some(o); +impl<'a> Ord for ParseItem<'a> { + fn cmp(&self, other: &Self) -> Ordering { + let o = self.variable_index.cmp(&other.variable_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.step_index.partial_cmp(&other.step_index) { - return Some(o); + let o = self.step_index.cmp(&other.step_index); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.dynamic_precedence.partial_cmp(&other.production.dynamic_precedence) { - return Some(o); + let o = self.production.dynamic_precedence.cmp(&other.production.dynamic_precedence); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.production.steps.len().partial_cmp(&other.production.steps.len()) { - return Some(o); + let o = self.production.steps.len().cmp(&other.production.steps.len()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.precedence().partial_cmp(&other.precedence()) { - return Some(o); + let o = self.precedence().cmp(&other.precedence()); + if o != Ordering::Equal { + return o; } - if let Some(o) = self.associativity().partial_cmp(&other.associativity()) { - return Some(o); + let o = self.associativity().cmp(&other.associativity()); + if o != Ordering::Equal { + return o; } for (i, step) in self.production.steps.iter().enumerate() { - let cmp = if i < self.step_index as usize { - step.alias.partial_cmp(&other.production.steps[i].alias) + let o = if i < self.step_index as usize { + step.alias.cmp(&other.production.steps[i].alias) } else { - step.partial_cmp(&other.production.steps[i]) + step.cmp(&other.production.steps[i]) }; - if let Some(o) = cmp { - return Some(o); + if o != Ordering::Equal { + return o; } } - return None; + return Ordering::Equal; } } -impl<'a> Ord for ParseItem<'a> { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap_or(Ordering::Equal) +impl<'a> PartialOrd for ParseItem<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) } } diff --git a/src/build_tables/item_set_builder.rs b/src/build_tables/item_set_builder.rs index 52ee0a45..d7883988 100644 --- a/src/build_tables/item_set_builder.rs +++ b/src/build_tables/item_set_builder.rs @@ -198,15 +198,16 @@ impl<'a> ParseItemSetBuilder<'a> { if syntax_grammar.variables_to_inline.contains(&non_terminal) { continue; } - for (production_index, production) in variable.productions.iter().enumerate() { + for production in &variable.productions { let item = ParseItem { variable_index, production, step_index: 0, }; - // let step_id = item.as_step_id(syntax_grammar, inlines); - if let Some(inlined_productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(inlined_productions) = + inlines.inlined_productions(item.production, item.step_index) + { for production in inlined_productions { find_or_push( additions_for_non_terminal, @@ -244,16 +245,21 @@ impl<'a> ParseItemSetBuilder<'a> { ) -> ParseItemSet<'a> { let mut result = ParseItemSet::default(); for (item, lookaheads) in &item_set.entries { - if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) { + if let Some(productions) = inlines.inlined_productions(item.production, item.step_index) + { for production in productions { - self.add_item(&mut result, ParseItem { - variable_index: item.variable_index, - production, - step_index: item.step_index, - }, lookaheads, grammar); + self.add_item( + &mut result, + ParseItem { + variable_index: item.variable_index, + production, + step_index: item.step_index, + }, + lookaheads, + ); } } else { - self.add_item(&mut result, *item, lookaheads, grammar); + self.add_item(&mut result, *item, lookaheads); } } result @@ -268,7 +274,6 @@ impl<'a> ParseItemSetBuilder<'a> { set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &LookaheadSet, - grammar: &SyntaxGrammar, ) { if let Some(step) = item.step() { if step.symbol.is_non_terminal() { diff --git a/src/build_tables/lex_table_builder.rs b/src/build_tables/lex_table_builder.rs new file mode 100644 index 00000000..86d1578b --- /dev/null +++ b/src/build_tables/lex_table_builder.rs @@ -0,0 +1,24 @@ +use crate::rules::Symbol; +use crate::tables::LexTable; +use crate::grammars::{SyntaxGrammar, LexicalGrammar}; + +pub(crate) struct LexTableBuilder<'a> { + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + table: LexTable, +} + +impl<'a> LexTableBuilder<'a> { + pub fn new( + syntax_grammar: &'a SyntaxGrammar, + lexical_grammar: &'a LexicalGrammar, + ) -> Self { + Self { + syntax_grammar, lexical_grammar, table: LexTable::default() + } + } + + pub fn build(self) -> (LexTable, LexTable, Option) { + (LexTable::default(), LexTable::default(), None) + } +} diff --git a/src/build_tables/mod.rs b/src/build_tables/mod.rs index 27951453..fc17ce7f 100644 --- a/src/build_tables/mod.rs +++ b/src/build_tables/mod.rs @@ -1,10 +1,13 @@ mod item; mod item_set_builder; +mod lex_table_builder; use self::item::{LookaheadSet, ParseItem, ParseItemSet}; use self::item_set_builder::ParseItemSetBuilder; +use self::lex_table_builder::LexTableBuilder; use crate::error::{Error, Result}; use crate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::rules::Alias; use crate::rules::{AliasMap, Associativity, Symbol, SymbolType}; use crate::tables::{ AliasSequenceId, LexTable, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry, @@ -43,7 +46,7 @@ struct ParseTableBuilder<'a> { impl<'a> ParseTableBuilder<'a> { fn build(mut self) -> Result<(ParseTable, LexTable, LexTable, Option)> { - // Ensure that the empty rename sequence has index 0. + // Ensure that the empty alias sequence has index 0. self.parse_table.alias_sequences.push(Vec::new()); // Ensure that the error state has index 0. @@ -61,9 +64,18 @@ impl<'a> ParseTableBuilder<'a> { ); self.process_part_state_queue()?; + + let lex_table_builder = LexTableBuilder::new(self.syntax_grammar, self.lexical_grammar); + self.populate_used_symbols(); - Err(Error::grammar("oh no")) + let (main_lex_table, keyword_lex_table, keyword_capture_token) = lex_table_builder.build(); + Ok(( + self.parse_table, + main_lex_table, + keyword_lex_table, + keyword_capture_token, + )) } fn add_parse_state( @@ -82,6 +94,7 @@ impl<'a> ParseTableBuilder<'a> { let state_id = self.parse_table.states.len(); self.item_sets_by_state_id.push(v.key().clone()); self.parse_table.states.push(ParseState { + lex_state_id: 0, terminal_entries: HashMap::new(), nonterminal_entries: HashMap::new(), }); @@ -98,12 +111,16 @@ impl<'a> ParseTableBuilder<'a> { fn process_part_state_queue(&mut self) -> Result<()> { while let Some(entry) = self.parse_state_queue.pop_front() { - println!( - "ITEM SET {}:\n{}", - entry.state_id, - self.item_sets_by_state_id[entry.state_id] - .display_with(&self.syntax_grammar, &self.lexical_grammar,) - ); + let debug = false; + + if debug { + println!( + "ITEM SET {}:\n{}", + entry.state_id, + self.item_sets_by_state_id[entry.state_id] + .display_with(&self.syntax_grammar, &self.lexical_grammar,) + ); + } let item_set = self.item_set_builder.transitive_closure( &self.item_sets_by_state_id[entry.state_id], @@ -111,11 +128,12 @@ impl<'a> ParseTableBuilder<'a> { self.inlines, ); - // println!("TRANSITIVE CLOSURE:"); - // for item in item_set.entries.keys() { - // println!("{}", item.display_with(&self.syntax_grammar, &self.lexical_grammar, &self.item_set_builder.inlines)); - // } - // println!(""); + if debug { + println!( + "TRANSITIVE CLOSURE:\n{}", + item_set.display_with(&self.syntax_grammar, &self.lexical_grammar) + ); + } self.add_actions( entry.preceding_symbols, @@ -249,6 +267,17 @@ impl<'a> ParseTableBuilder<'a> { )?; } + let state = &mut self.parse_table.states[state_id]; + for extra_token in &self.syntax_grammar.extra_tokens { + state + .terminal_entries + .entry(*extra_token) + .or_insert(ParseTableEntry { + reusable: true, + actions: vec![ParseAction::ShiftExtra], + }); + } + Ok(()) } @@ -514,6 +543,7 @@ impl<'a> ParseTableBuilder<'a> { non_terminal_usages[symbol.index] = true; } } + self.parse_table.symbols.push(Symbol::end()); for (i, value) in terminal_usages.into_iter().enumerate() { if value { self.parse_table.symbols.push(Symbol::terminal(i)); @@ -532,12 +562,15 @@ impl<'a> ParseTableBuilder<'a> { } fn get_alias_sequence_id(&mut self, item: &ParseItem) -> AliasSequenceId { - let alias_sequence = item + let mut alias_sequence: Vec> = item .production .steps .iter() .map(|s| s.alias.clone()) .collect(); + while alias_sequence.last() == Some(&None) { + alias_sequence.pop(); + } if let Some(index) = self .parse_table .alias_sequences diff --git a/src/render/mod.rs b/src/render/mod.rs index 2ca610a6..fc4cdafb 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,8 +1,16 @@ -use crate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType}; +use crate::nfa::CharacterSet; use crate::rules::{Alias, AliasMap, Symbol, SymbolType}; -use crate::tables::{LexTable, ParseTable, ParseTableEntry}; +use crate::tables::{LexState, LexTable, ParseAction, ParseTable, ParseTableEntry}; use std::collections::{HashMap, HashSet}; use std::fmt::Write; +use std::mem::swap; + +macro_rules! add { + ($this: tt, $($arg: tt)*) => {{ + $this.buffer.write_fmt(format_args!($($arg)*)).unwrap(); + }} +} macro_rules! add_line { ($this: tt, $($arg: tt)*) => { @@ -14,10 +22,21 @@ macro_rules! add_line { } } +macro_rules! indent { + ($this: tt) => { + $this.indent_level += 1; + }; +} + +macro_rules! dedent { + ($this: tt) => { + $this.indent_level -= 1; + }; +} + struct Generator { buffer: String, indent_level: usize, - language_name: String, parse_table: ParseTable, main_lex_table: LexTable, @@ -27,9 +46,9 @@ struct Generator { lexical_grammar: LexicalGrammar, simple_aliases: AliasMap, symbol_ids: HashMap, - parse_table_entries: Vec<(usize, ParseTableEntry)>, - next_parse_action_list_index: usize, - unique_aliases: HashSet, + alias_ids: HashMap, + external_scanner_states: Vec>, + alias_map: HashMap>, } impl Generator { @@ -39,6 +58,30 @@ impl Generator { self.add_stats(); self.add_symbol_enum(); self.add_symbol_names_list(); + self.add_symbol_metadata_list(); + self.add_alias_sequences(); + + let mut main_lex_table = LexTable::default(); + swap(&mut main_lex_table, &mut self.main_lex_table); + self.add_lex_function("ts_lex", main_lex_table); + + if self.keyword_capture_token.is_some() { + let mut keyword_lex_table = LexTable::default(); + swap(&mut keyword_lex_table, &mut self.keyword_lex_table); + self.add_lex_function("ts_lex_keywords", keyword_lex_table); + } + + self.add_lex_modes_list(); + + if !self.syntax_grammar.external_tokens.is_empty() { + self.add_external_token_enum(); + self.add_external_scanner_symbol_map(); + self.add_external_scanner_states_list(); + } + + self.add_parse_table(); + self.add_parser_export(); + self.buffer } @@ -50,7 +93,10 @@ impl Generator { fn add_pragmas(&mut self) { add_line!(self, "#if defined(__GNUC__) || defined(__clang__)"); add_line!(self, "#pragma GCC diagnostic push"); - add_line!(self, "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); + add_line!( + self, + "#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"" + ); add_line!(self, "#endif"); add_line!(self, ""); @@ -67,81 +113,639 @@ impl Generator { } fn add_stats(&mut self) { - let mut token_count = 0; - - for symbol in &self.parse_table.symbols { - if symbol.is_terminal() { - token_count += 1; - } else if symbol.is_external() { - let external_token = &self.syntax_grammar.external_tokens[symbol.index]; - if external_token.corresponding_internal_token.is_none() { - token_count += 1; + let token_count = self + .parse_table + .symbols + .iter() + .filter(|symbol| { + if symbol.is_terminal() { + true + } else if symbol.is_external() { + self.syntax_grammar.external_tokens[symbol.index] + .corresponding_internal_token + .is_none() + } else { + false } - } + }) + .count(); + + let mut symbol_identifiers = HashSet::new(); + for i in 0..self.parse_table.symbols.len() { + self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); } for alias_sequence in &self.parse_table.alias_sequences { for entry in alias_sequence { if let Some(alias) = entry { - self.unique_aliases.insert(alias.clone()); + let alias_kind = if alias.is_named { + VariableType::Named + } else { + VariableType::Anonymous + }; + let matching_symbol = self.parse_table.symbols.iter().cloned().find(|symbol| { + let (name, kind) = self.metadata_for_symbol(*symbol); + name == alias.value && kind == alias_kind + }); + let alias_id = if let Some(symbol) = matching_symbol { + self.symbol_ids[&symbol].clone() + } else if alias.is_named { + format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + }; + self.alias_ids.entry(alias.clone()).or_insert(alias_id); + self.alias_map + .entry(alias.clone()) + .or_insert(matching_symbol); } } } - let mut symbol_id_values = HashSet::new(); - for i in 0..self.parse_table.symbols.len() { - self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_id_values); - } - add_line!(self, "#define LANGUAGE_VERSION {}", 6); - add_line!(self, "#define STATE_COUNT {}", self.parse_table.states.len()); - add_line!(self, "#define SYMBOL_COUNT {}", self.parse_table.symbols.len()); - add_line!(self, "#define ALIAS_COUNT {}", self.unique_aliases.len()); + add_line!( + self, + "#define STATE_COUNT {}", + self.parse_table.states.len() + ); + add_line!( + self, + "#define SYMBOL_COUNT {}", + self.parse_table.symbols.len() + ); + add_line!( + self, + "#define ALIAS_COUNT {}", + self.alias_map.iter().filter(|e| e.1.is_none()).count() + ); add_line!(self, "#define TOKEN_COUNT {}", token_count); - add_line!(self, "#define EXTERNAL_TOKEN_COUNT {}", self.syntax_grammar.external_tokens.len()); - // add_line!(self, "#define MAX_ALIAS_SEQUENCE_LENGTH {}\n", self.parse_table.max_alias_sequence_length); + add_line!( + self, + "#define EXTERNAL_TOKEN_COUNT {}", + self.syntax_grammar.external_tokens.len() + ); + if let Some(max_alias_sequence_length) = self + .parse_table + .alias_sequences + .iter() + .map(|seq| seq.len()) + .max() + { + add_line!( + self, + "#define MAX_ALIAS_SEQUENCE_LENGTH {}", + max_alias_sequence_length + ); + } add_line!(self, ""); } fn add_symbol_enum(&mut self) { add_line!(self, "enum {{"); - self.indent(); - for i in 0..self.parse_table.symbols.len() { - let symbol = self.parse_table.symbols[i]; - if symbol != Symbol::end() { - add_line!(self, "{} = {}", self.symbol_ids[&symbol], i); + indent!(self); + let mut i = 1; + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!(self, "{} = {},", self.symbol_ids[&symbol], i); + i += 1; } } - self.dedent(); + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!(self, "{} = {},", self.alias_ids[&alias], i); + } + i += 1; + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } fn add_symbol_names_list(&mut self) { add_line!(self, "static const char *ts_symbol_names[] = {{"); - self.indent(); - self.dedent(); + indent!(self); + for symbol in self.parse_table.symbols.iter() { + if *symbol != Symbol::end() { + add_line!( + self, + "[{}] = \"{}\",", + self.symbol_ids[&symbol], + self.sanitize_string(self.metadata_for_symbol(*symbol).0) + ); + } + } + for (alias, symbol) in &self.alias_map { + if symbol.is_none() { + add_line!( + self, + "[{}] = \"{}\",", + self.alias_ids[&alias], + self.sanitize_string(&alias.value) + ); + } + } + dedent!(self); add_line!(self, "}};"); add_line!(self, ""); } - fn assign_symbol_id(&mut self, symbol: Symbol, used_ids: &mut HashSet) { + fn add_symbol_metadata_list(&mut self) { + add_line!( + self, + "static const TSSymbolMetadata ts_symbol_metadata[] = {{" + ); + indent!(self); + for symbol in &self.parse_table.symbols { + add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); + indent!(self); + match self.metadata_for_symbol(*symbol).1 { + VariableType::Named => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = true,"); + } + VariableType::Anonymous => { + add_line!(self, ".visible = true,"); + add_line!(self, ".named = false,"); + } + VariableType::Hidden => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = true,"); + } + VariableType::Auxiliary => { + add_line!(self, ".visible = false,"); + add_line!(self, ".named = false,"); + } + } + dedent!(self); + add_line!(self, "}},"); + } + for (alias, matching_symbol) in &self.alias_map { + if matching_symbol.is_none() { + add_line!(self, "[{}] = {{", self.alias_ids[&alias]); + indent!(self); + add_line!(self, ".visible = true,"); + add_line!(self, ".named = {},", alias.is_named); + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_alias_sequences(&mut self) { + add_line!( + self, + "static TSSymbol ts_alias_sequences[{}][MAX_ALIAS_SEQUENCE_LENGTH] = {{", + self.parse_table.alias_sequences.len() + ); + indent!(self); + for (i, sequence) in self.parse_table.alias_sequences.iter().enumerate().skip(1) { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (j, alias) in sequence.iter().enumerate() { + if let Some(alias) = alias { + add_line!(self, "[{}] = {},", j, self.alias_ids[&alias]); + } + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_lex_function(&mut self, name: &str, lex_table: LexTable) { + add_line!( + self, + "static bool {}(TSLexer *lexer, TSStateId state) {{", + name + ); + indent!(self); + add_line!(self, "START_LEXER();"); + add_line!(self, "switch (state) {{"); + indent!(self); + + for (i, state) in lex_table.states.into_iter().enumerate() { + add_line!(self, "case {}:", i); + indent!(self); + self.add_lex_state(state); + dedent!(self); + } + + add_line!(self, "default:"); + indent!(self); + add_line!(self, "return false;"); + dedent!(self); + + dedent!(self); + add_line!(self, "}}"); + dedent!(self); + add_line!(self, "}}"); + add_line!(self, ""); + } + + fn add_lex_state(&mut self, state: LexState) { + if let Some(accept_action) = state.accept_action { + add_line!( + self, + "ACCEPT_TOKEN({})", + self.symbol_ids[&accept_action.symbol] + ); + } + + let mut ruled_out_characters = HashSet::new(); + for (characters, action) in state.advance_actions { + let previous_length = self.buffer.len(); + + add!(self, "if ("); + if self.add_character_set_condition(&characters, &ruled_out_characters) { + add!(self, ")"); + indent!(self); + if action.in_main_token { + add_line!(self, "ADVANCE({});", action.state); + } else { + add_line!(self, "SKIP({});", action.state); + } + if let CharacterSet::Include(chars) = characters { + ruled_out_characters.extend(chars.iter()); + } + dedent!(self); + } else { + self.buffer.truncate(previous_length); + } + } + + add_line!(self, "END_STATE();"); + } + + fn add_character_set_condition( + &mut self, + characters: &CharacterSet, + ruled_out_characters: &HashSet, + ) -> bool { + true + } + + fn add_lex_modes_list(&mut self) { + self.get_external_scanner_state_id(HashSet::new()); + + let mut external_tokens_by_corresponding_internal_token = HashMap::new(); + for (i, external_token) in self.syntax_grammar.external_tokens.iter().enumerate() { + if let Some(symbol) = external_token.corresponding_internal_token { + external_tokens_by_corresponding_internal_token.insert(symbol.index, i); + } + } + + add_line!(self, "static TSLexMode ts_lex_modes[STATE_COUNT] = {{"); + indent!(self); + for i in 0..self.parse_table.states.len() { + let mut external_tokens = HashSet::new(); + for token in self.parse_table.states[i].terminal_entries.keys() { + if token.is_external() { + external_tokens.insert(token.index); + } else if token.is_terminal() { + if let Some(external_index) = + external_tokens_by_corresponding_internal_token.get(&token.index) + { + external_tokens.insert(*external_index); + } + } + } + + let external_state_id = self.get_external_scanner_state_id(external_tokens); + let state = &self.parse_table.states[i]; + if external_state_id > 0 { + add_line!( + self, + "[{}] = {{.lex_state = {}, .external_lex_state = {}}},", + i, + state.lex_state_id, + external_state_id + ); + } else { + add_line!(self, "[{}] = {{.lex_state = {}}},", i, state.lex_state_id); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_token_enum(&mut self) { + add_line!(self, "enum {{"); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "{} = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + i + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_symbol_map(&mut self) { + add_line!( + self, + "static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {{" + ); + indent!(self); + for i in 0..self.syntax_grammar.external_tokens.len() { + add_line!( + self, + "[{}] = {},", + self.external_token_id(&self.syntax_grammar.external_tokens[i]), + self.symbol_ids[&Symbol::external(i)], + ); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_external_scanner_states_list(&mut self) { + add_line!( + self, + "static bool ts_external_scanner_states[{}][EXTERNAL_TOKEN_COUNT] = {{", + self.external_scanner_states.len(), + ); + indent!(self); + for i in 0..self.external_scanner_states.len() { + if !self.external_scanner_states[i].is_empty() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for token_index in &self.external_scanner_states[i] { + add_line!( + self, + "[{}] = true,", + self.external_token_id(&self.syntax_grammar.external_tokens[*token_index]) + ); + } + dedent!(self); + add_line!(self, "}},"); + } + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parse_table(&mut self) { + let mut parse_table_entries = Vec::new(); + let mut next_parse_action_list_index = 0; + + self.get_parse_action_list_id( + &ParseTableEntry { + actions: Vec::new(), + reusable: false, + }, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + + add_line!( + self, + "static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {{" + ); + indent!(self); + for (i, state) in self.parse_table.states.iter().enumerate() { + add_line!(self, "[{}] = {{", i); + indent!(self); + for (symbol, state_id) in &state.nonterminal_entries { + add_line!(self, "[{}] = STATE({}),", self.symbol_ids[symbol], state_id); + } + for (symbol, entry) in &state.terminal_entries { + let entry_id = self.get_parse_action_list_id( + entry, + &mut parse_table_entries, + &mut next_parse_action_list_index, + ); + add_line!( + self, + "[{}] = ACTIONS({}),", + self.symbol_ids[symbol], + entry_id + ); + } + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + + self.add_parse_action_list(parse_table_entries); + } + + fn add_parse_action_list(&mut self, parse_table_entries: Vec<(usize, ParseTableEntry)>) { + add_line!(self, "static TSParseActionEntry ts_parse_actions[] = {{"); + indent!(self); + for (i, entry) in parse_table_entries { + add!( + self, + " [{}] = {{.count = {}, .reusable = {}}},", + i, + entry.actions.len(), + entry.reusable + ); + for action in entry.actions { + add!(self, " "); + match action { + ParseAction::Accept => add!(self, " ACCEPT_INPUT()"), + ParseAction::Recover => add!(self, "RECOVER()"), + ParseAction::ShiftExtra => add!(self, "SHIFT_EXTRA()"), + ParseAction::Shift { + state, + is_repetition, + } => { + if is_repetition { + add!(self, "SHIFT_REPEAT({})", state); + } else { + add!(self, "SHIFT({})", state); + } + } + ParseAction::Reduce { + symbol, + child_count, + dynamic_precedence, + alias_sequence_id, + .. + } => { + if !self.symbol_ids.contains_key(&symbol) { + eprintln!( + "SYMBOL: {:?} {:?}", + symbol, + self.metadata_for_symbol(symbol) + ); + } + add!(self, "REDUCE({}, {}", self.symbol_ids[&symbol], child_count); + if dynamic_precedence != 0 { + add!(self, ", .dynamic_precedence = {}", dynamic_precedence); + } + if alias_sequence_id != 0 { + add!(self, ", .alias_sequence_id = {}", alias_sequence_id); + } + add!(self, ")"); + } + } + add!(self, ",") + } + add!(self, "\n"); + } + dedent!(self); + add_line!(self, "}};"); + add_line!(self, ""); + } + + fn add_parser_export(&mut self) { + let language_function_name = format!("tree_sitter_{}", self.language_name); + let external_scanner_name = format!("{}_external_scanner", language_function_name); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, "void *{}_create();", external_scanner_name); + add_line!(self, "void {}_destroy(void *);", external_scanner_name); + add_line!( + self, + "bool {}_scan(void *, TSLexer *, const bool *);", + external_scanner_name + ); + add_line!( + self, + "unsigned {}_serialize(void *, char *);", + external_scanner_name + ); + add_line!( + self, + "void {}_deserialize(void *, const char *, unsigned);", + external_scanner_name + ); + add_line!(self, ""); + } + + add_line!(self, "#ifdef _WIN32"); + add_line!(self, "#define extern __declspec(dllexport)"); + add_line!(self, "#endif"); + add_line!(self, ""); + + add_line!( + self, + "extern const TSLanguage *{}() {{", + language_function_name + ); + indent!(self); + add_line!(self, "static TSLanguage language = {{"); + indent!(self); + add_line!(self, ".version = LANGUAGE_VERSION,"); + add_line!(self, ".symbol_count = SYMBOL_COUNT,"); + add_line!(self, ".alias_count = ALIAS_COUNT,"); + add_line!(self, ".token_count = TOKEN_COUNT,"); + add_line!(self, ".symbol_metadata = ts_symbol_metadata,"); + add_line!( + self, + ".parse_table = (const unsigned short *)ts_parse_table," + ); + add_line!(self, ".parse_actions = ts_parse_actions,"); + add_line!(self, ".lex_modes = ts_lex_modes,"); + add_line!(self, ".symbol_names = ts_symbol_names,"); + add_line!( + self, + ".alias_sequences = (const TSSymbol *)ts_alias_sequences," + ); + + add_line!( + self, + ".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH," + ); + add_line!(self, ".lex_fn = ts_lex,"); + + if let Some(keyword_capture_token) = self.keyword_capture_token { + add_line!(self, ".keyword_lex_fn = ts_lex_keywords,"); + add_line!( + self, + ".keyword_capture_token = {},", + self.symbol_ids[&keyword_capture_token] + ); + } + + add_line!(self, ".external_token_count = EXTERNAL_TOKEN_COUNT,"); + + if !self.syntax_grammar.external_tokens.is_empty() { + add_line!(self, ".external_scanner = {{"); + indent!(self); + add_line!(self, "(const bool *)ts_external_scanner_states,"); + add_line!(self, "ts_external_scanner_symbol_map,"); + add_line!(self, "{}_create,", external_scanner_name); + add_line!(self, "{}_destroy,", external_scanner_name); + add_line!(self, "{}_scan,", external_scanner_name); + add_line!(self, "{}_serialize,", external_scanner_name); + add_line!(self, "{}_deserialize,", external_scanner_name); + dedent!(self); + add_line!(self, "}},"); + } + dedent!(self); + + add_line!(self, "}};"); + add_line!(self, "return &language;"); + dedent!(self); + add_line!(self, "}}"); + } + + fn get_parse_action_list_id( + &self, + entry: &ParseTableEntry, + parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + next_parse_action_list_index: &mut usize, + ) -> usize { + if let Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { + return *index; + } + + let result = *next_parse_action_list_index; + parse_table_entries.push((result, entry.clone())); + *next_parse_action_list_index += 1 + entry.actions.len(); + result + } + + fn get_external_scanner_state_id(&mut self, external_tokens: HashSet) -> usize { + self.external_scanner_states + .iter() + .position(|tokens| *tokens == external_tokens) + .unwrap_or_else(|| { + self.external_scanner_states.push(external_tokens); + self.external_scanner_states.len() - 1 + }) + } + + fn external_token_id(&self, token: &ExternalToken) -> String { + format!( + "ts_external_token_{}", + self.sanitize_identifier(&token.name) + ) + } + + fn assign_symbol_id(&mut self, symbol: Symbol, used_identifiers: &mut HashSet) { let mut id; if symbol == Symbol::end() { id = "ts_builtin_sym_end".to_string(); } else { let (name, kind) = self.metadata_for_symbol(symbol); id = match kind { - VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_name(name)), - VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_name(name)), + VariableType::Auxiliary => format!("aux_sym_{}", self.sanitize_identifier(name)), + VariableType::Anonymous => format!("anon_sym_{}", self.sanitize_identifier(name)), VariableType::Hidden | VariableType::Named => { - format!("sym_{}", self.sanitize_name(name)) + format!("sym_{}", self.sanitize_identifier(name)) } }; let mut suffix_number = 1; let mut suffix = String::new(); - while used_ids.contains(&id) { + while used_identifiers.contains(&id) { id.drain(id.len() - suffix.len()..); suffix_number += 1; suffix = suffix_number.to_string(); @@ -149,7 +753,7 @@ impl Generator { } } - used_ids.insert(id.clone()); + used_identifiers.insert(id.clone()); self.symbol_ids.insert(symbol, id); } @@ -171,16 +775,67 @@ impl Generator { } } - fn sanitize_name(&self, name: &str) -> String { - name.to_string() + fn sanitize_identifier(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + { + result.push(c); + } else { + result += match c { + '~' => "TILDE", + '`' => "BQUOTE", + '!' => "BANG", + '@' => "AT", + '#' => "POUND", + '$' => "DOLLAR", + '%' => "PERCENT", + '^' => "CARET", + '&' => "AMP", + '*' => "STAR", + '(' => "LPAREN", + ')' => "RPAREN", + '-' => "DASH", + '+' => "PLUS", + '=' => "EQ", + '{' => "LBRACE", + '}' => "RBRACE", + '[' => "LBRACK", + ']' => "RBRACK", + '\\' => "BSLASH", + '|' => "PIPE", + ':' => "COLON", + ';' => "SEMI", + '"' => "DQUOTE", + '\'' => "SQUOTE", + '<' => "LT", + '>' => "GT", + ',' => "COMMA", + '.' => "DOT", + '?' => "QMARK", + '/' => "SLASH", + '\n' => "LF", + '\r' => "CR", + '\t' => "TAB", + _ => continue, + } + } + } + result } - fn indent(&mut self) { - self.indent_level += 1; - } - - fn dedent(&mut self) { - self.indent_level -= 1; + fn sanitize_string(&self, name: &str) -> String { + let mut result = String::with_capacity(name.len()); + for c in name.chars() { + if ['\\', '\n', '\r', '\"'].contains(&c) { + result.push('\\'); + } + result.push(c); + } + result } } @@ -206,9 +861,9 @@ pub(crate) fn render_c_code( lexical_grammar, simple_aliases, symbol_ids: HashMap::new(), - parse_table_entries: Vec::new(), - next_parse_action_list_index: 0, - unique_aliases: HashSet::new(), + alias_ids: HashMap::new(), + external_scanner_states: Vec::new(), + alias_map: HashMap::new(), } .generate() } diff --git a/src/tables.rs b/src/tables.rs index 9100b81e..01cecb49 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::ops::Range; use crate::rules::{Associativity, Symbol, Alias}; +use crate::nfa::CharacterSet; pub(crate) type AliasSequenceId = usize; pub(crate) type ParseStateId = usize; @@ -34,7 +35,8 @@ pub(crate) struct ParseTableEntry { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct ParseState { pub terminal_entries: HashMap, - pub nonterminal_entries: HashMap + pub nonterminal_entries: HashMap, + pub lex_state_id: usize, } #[derive(Debug, PartialEq, Eq)] @@ -60,7 +62,7 @@ pub(crate) struct AcceptTokenAction { #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct LexState { - pub advance_actions: HashMap, + pub advance_actions: HashMap, pub accept_action: Option, } @@ -78,6 +80,12 @@ impl ParseTableEntry { } } +impl Default for LexTable { + fn default() -> Self { + LexTable { states: Vec::new() } + } +} + impl ParseAction { pub fn precedence(&self) -> i32 { if let ParseAction::Reduce { precedence, .. } = self {