diff --git a/Cargo.lock b/Cargo.lock index ea918eb6..cd411095 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -824,7 +824,7 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.17.0" +version = "0.17.1" dependencies = [ "cc", "regex", @@ -832,7 +832,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.17.1" +version = "0.17.3" dependencies = [ "ansi_term", "atty", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 21a8fa0f..48dbbff7 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-cli" description = "CLI tool for developing, testing, and using Tree-sitter parsers" -version = "0.17.1" +version = "0.17.3" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" diff --git a/cli/npm/package.json b/cli/npm/package.json index 42f75c98..4c6dfe90 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.17.1", + "version": "0.17.3", "author": "Max Brunsfeld", "license": "MIT", "repository": { diff --git a/cli/src/generate/node_types.rs b/cli/src/generate/node_types.rs index 7962c7f3..bc5a836f 100644 --- a/cli/src/generate/node_types.rs +++ b/cli/src/generate/node_types.rs @@ -146,7 +146,7 @@ impl ChildQuantity { pub(crate) fn get_variable_info( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - simple_aliases: &AliasMap, + default_aliases: &AliasMap, ) -> Result> { let child_type_is_visible = |t: &ChildType| { variable_type_for_child_type(t, syntax_grammar, lexical_grammar) >= VariableType::Anonymous @@ -185,7 +185,7 @@ pub(crate) fn get_variable_info( let child_symbol = step.symbol; let child_type = if let Some(alias) = &step.alias { ChildType::Aliased(alias.clone()) - } else if let Some(alias) = simple_aliases.get(&step.symbol) { + } else if let Some(alias) = default_aliases.get(&step.symbol) { ChildType::Aliased(alias.clone()) } else { ChildType::Normal(child_symbol) @@ -358,7 +358,7 @@ pub(crate) fn 
get_variable_info( pub(crate) fn generate_node_types_json( syntax_grammar: &SyntaxGrammar, lexical_grammar: &LexicalGrammar, - simple_aliases: &AliasMap, + default_aliases: &AliasMap, variable_info: &Vec, ) -> Vec { let mut node_types_json = BTreeMap::new(); @@ -369,7 +369,7 @@ pub(crate) fn generate_node_types_json( named: alias.is_named, }, ChildType::Normal(symbol) => { - if let Some(alias) = simple_aliases.get(&symbol) { + if let Some(alias) = default_aliases.get(&symbol) { NodeTypeJSON { kind: alias.value.clone(), named: alias.is_named, @@ -417,22 +417,33 @@ pub(crate) fn generate_node_types_json( }; let mut aliases_by_symbol = HashMap::new(); - for (symbol, alias) in simple_aliases { + for (symbol, alias) in default_aliases { aliases_by_symbol.insert(*symbol, { let mut aliases = HashSet::new(); aliases.insert(Some(alias.clone())); aliases }); } + for extra_symbol in &syntax_grammar.extra_symbols { + if !default_aliases.contains_key(extra_symbol) { + aliases_by_symbol + .entry(*extra_symbol) + .or_insert(HashSet::new()) + .insert(None); + } + } for variable in &syntax_grammar.variables { for production in &variable.productions { for step in &production.steps { - if !simple_aliases.contains_key(&step.symbol) { - aliases_by_symbol - .entry(step.symbol) - .or_insert(HashSet::new()) - .insert(step.alias.clone()); - } + aliases_by_symbol + .entry(step.symbol) + .or_insert(HashSet::new()) + .insert( + step.alias + .as_ref() + .or_else(|| default_aliases.get(&step.symbol)) + .cloned(), + ); } } } @@ -722,9 +733,18 @@ mod tests { kind: VariableType::Named, rule: Rule::string("x"), }, + // This rule is not reachable from the start symbol + // so it won't be present in the node_types + Variable { + name: "v3".to_string(), + kind: VariableType::Named, + rule: Rule::string("y"), + }, ], }); + assert_eq!(node_types.len(), 3); + assert_eq!( node_types[0], NodeInfoJSON { @@ -784,6 +804,112 @@ mod tests { ); } + #[test] + fn test_node_types_simple_extras() { + let node_types 
= get_node_types(InputGrammar { + name: String::new(), + extra_symbols: vec![Rule::named("v3")], + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + supertype_symbols: vec![], + variables: vec![ + Variable { + name: "v1".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::field("f1".to_string(), Rule::named("v2")), + Rule::field("f2".to_string(), Rule::string(";")), + ]), + }, + Variable { + name: "v2".to_string(), + kind: VariableType::Named, + rule: Rule::string("x"), + }, + // This rule is not reachable from the start symbol, but + // it is reachable from the 'extra_symbols' so it + // should be present in the node_types + Variable { + name: "v3".to_string(), + kind: VariableType::Named, + rule: Rule::string("y"), + }, + ], + }); + + assert_eq!(node_types.len(), 4); + + assert_eq!( + node_types[0], + NodeInfoJSON { + kind: "v1".to_string(), + named: true, + subtypes: None, + children: None, + fields: Some( + vec![ + ( + "f1".to_string(), + FieldInfoJSON { + multiple: false, + required: true, + types: vec![NodeTypeJSON { + kind: "v2".to_string(), + named: true, + }] + } + ), + ( + "f2".to_string(), + FieldInfoJSON { + multiple: false, + required: true, + types: vec![NodeTypeJSON { + kind: ";".to_string(), + named: false, + }] + } + ), + ] + .into_iter() + .collect() + ) + } + ); + assert_eq!( + node_types[1], + NodeInfoJSON { + kind: ";".to_string(), + named: false, + subtypes: None, + children: None, + fields: None + } + ); + assert_eq!( + node_types[2], + NodeInfoJSON { + kind: "v2".to_string(), + named: true, + subtypes: None, + children: None, + fields: None + } + ); + assert_eq!( + node_types[3], + NodeInfoJSON { + kind: "v3".to_string(), + named: true, + subtypes: None, + children: None, + fields: None + } + ); + } + #[test] fn test_node_types_with_supertypes() { let node_types = get_node_types(InputGrammar { @@ -1685,14 +1811,14 @@ mod tests { } fn 
get_node_types(grammar: InputGrammar) -> Vec { - let (syntax_grammar, lexical_grammar, _, simple_aliases) = + let (syntax_grammar, lexical_grammar, _, default_aliases) = prepare_grammar(&grammar).unwrap(); let variable_info = - get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases).unwrap(); + get_variable_info(&syntax_grammar, &lexical_grammar, &default_aliases).unwrap(); generate_node_types_json( &syntax_grammar, &lexical_grammar, - &simple_aliases, + &default_aliases, &variable_info, ) } diff --git a/cli/src/generate/prepare_grammar/extract_default_aliases.rs b/cli/src/generate/prepare_grammar/extract_default_aliases.rs new file mode 100644 index 00000000..3e08e3ad --- /dev/null +++ b/cli/src/generate/prepare_grammar/extract_default_aliases.rs @@ -0,0 +1,293 @@ +use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; +use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; + +#[derive(Clone, Default)] +struct SymbolStatus { + aliases: Vec<(Alias, usize)>, + appears_unaliased: bool, +} + +// Update the grammar by finding symbols that always are aliased, and for each such symbol, +// promoting one of its aliases to a "default alias", which is applied globally instead +// of in a context-specific way. +// +// This has two benefits: +// * It reduces the overhead of storing production-specific alias info in the parse table. +// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation +// ensures that the children of an `ERROR` node have symbols that are consistent with the +// way that they would appear in a valid syntax tree. 
+pub(super) fn extract_default_aliases( + syntax_grammar: &mut SyntaxGrammar, + lexical_grammar: &LexicalGrammar, +) -> AliasMap { + let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; + let mut non_terminal_status_list = + vec![SymbolStatus::default(); syntax_grammar.variables.len()]; + let mut external_status_list = + vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; + + // For each grammar symbol, find all of the aliases under which the symbol appears, + // and determine whether or not the symbol ever appears *unaliased*. + for variable in syntax_grammar.variables.iter() { + for production in variable.productions.iter() { + for step in production.steps.iter() { + let mut status = match step.symbol.kind { + SymbolType::External => &mut external_status_list[step.symbol.index], + SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index], + SymbolType::Terminal => &mut terminal_status_list[step.symbol.index], + SymbolType::End => panic!("Unexpected end token"), + }; + + // Default aliases don't work for inlined variables. 
+ if syntax_grammar.variables_to_inline.contains(&step.symbol) { + continue; + } + + if let Some(alias) = &step.alias { + if let Some(count_for_alias) = status + .aliases + .iter_mut() + .find_map(|(a, count)| if a == alias { Some(count) } else { None }) + { + *count_for_alias += 1; + } else { + status.aliases.push((alias.clone(), 1)); + } + } else { + status.appears_unaliased = true; + } + } + } + } + + let symbols_with_statuses = (terminal_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::terminal(i), status))) + .chain( + non_terminal_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::non_terminal(i), status)), + ) + .chain( + external_status_list + .iter_mut() + .enumerate() + .map(|(i, status)| (Symbol::external(i), status)), + ); + + // For each symbol that always appears aliased, find the alias that occurs most often, + // and designate that alias as the symbol's "default alias". Store all of these + // default aliases in a map that will be returned. + let mut result = AliasMap::new(); + for (symbol, status) in symbols_with_statuses { + if status.appears_unaliased { + status.aliases.clear(); + } else { + if let Some(default_entry) = status + .aliases + .iter() + .enumerate() + .max_by_key(|(i, (_, count))| (count, -(*i as i64))) + .map(|(_, entry)| entry.clone()) + { + status.aliases.clear(); + status.aliases.push(default_entry.clone()); + result.insert(symbol, default_entry.0); + } + } + } + + // Wherever a symbol is aliased as its default alias, remove the usage of the alias, + // because it will now be redundant. 
+ let mut alias_positions_to_clear = Vec::new(); + for variable in syntax_grammar.variables.iter_mut() { + alias_positions_to_clear.clear(); + + for (i, production) in variable.productions.iter().enumerate() { + for (j, step) in production.steps.iter().enumerate() { + let status = match step.symbol.kind { + SymbolType::External => &mut external_status_list[step.symbol.index], + SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index], + SymbolType::Terminal => &mut terminal_status_list[step.symbol.index], + SymbolType::End => panic!("Unexpected end token"), + }; + + // If this step is aliased as the symbol's default alias, then remove that alias. + if step.alias.is_some() + && step.alias.as_ref() == status.aliases.get(0).map(|t| &t.0) + { + let mut other_productions_must_use_this_alias_at_this_index = false; + for (other_i, other_production) in variable.productions.iter().enumerate() { + if other_i != i + && other_production.steps.len() > j + && other_production.steps[j].alias == step.alias + && result.get(&other_production.steps[j].symbol) != step.alias.as_ref() + { + other_productions_must_use_this_alias_at_this_index = true; + break; + } + } + + if !other_productions_must_use_this_alias_at_this_index { + alias_positions_to_clear.push((i, j)); + } + } + } + } + + for (production_index, step_index) in &alias_positions_to_clear { + variable.productions[*production_index].steps[*step_index].alias = None; + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::generate::grammars::{ + LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType, + }; + use crate::generate::nfa::Nfa; + + #[test] + fn test_extract_simple_aliases() { + let mut syntax_grammar = SyntaxGrammar { + variables: vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + 
ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), + ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true), + ], + }], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + // Token 0 is always aliased as "a1". + ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), + // Token 1 is aliased within rule `v1` above, but not here. + ProductionStep::new(Symbol::terminal(1)), + // Token 2 is aliased differently here than in `v1`. The alias from + // `v1` should be promoted to the default alias, because `v1` appears + // first in the grammar. + ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true), + // Token 3 is also aliased differently here than in `v1`. In this case, + // this alias should be promoted to the default alias, because it is + // used a greater number of times (twice). + ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true), + ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true), + ], + }], + }, + ], + extra_symbols: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + supertype_symbols: Vec::new(), + external_tokens: Vec::new(), + word_token: None, + }; + + let lexical_grammar = LexicalGrammar { + nfa: Nfa::new(), + variables: vec![ + LexicalVariable { + name: "t0".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t1".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t2".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + LexicalVariable { + name: "t3".to_string(), + kind: VariableType::Anonymous, + implicit_precedence: 0, + start_state: 0, + }, + ], + }; + + let default_aliases = extract_default_aliases(&mut 
syntax_grammar, &lexical_grammar); + assert_eq!(default_aliases.len(), 3); + + assert_eq!( + default_aliases.get(&Symbol::terminal(0)), + Some(&Alias { + value: "a1".to_string(), + is_named: true, + }) + ); + assert_eq!( + default_aliases.get(&Symbol::terminal(2)), + Some(&Alias { + value: "a3".to_string(), + is_named: true, + }) + ); + assert_eq!( + default_aliases.get(&Symbol::terminal(3)), + Some(&Alias { + value: "a6".to_string(), + is_named: true, + }) + ); + assert_eq!(default_aliases.get(&Symbol::terminal(1)), None); + + assert_eq!( + syntax_grammar.variables, + vec![ + SyntaxVariable { + name: "v1".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), + ProductionStep::new(Symbol::terminal(2)), + ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true), + ], + },], + }, + SyntaxVariable { + name: "v2".to_owned(), + kind: VariableType::Named, + productions: vec![Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::terminal(0)), + ProductionStep::new(Symbol::terminal(1)), + ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true), + ProductionStep::new(Symbol::terminal(3)), + ProductionStep::new(Symbol::terminal(3)), + ], + },], + }, + ] + ); + } +} diff --git a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs b/cli/src/generate/prepare_grammar/extract_simple_aliases.rs deleted file mode 100644 index 6da009d5..00000000 --- a/cli/src/generate/prepare_grammar/extract_simple_aliases.rs +++ /dev/null @@ -1,223 +0,0 @@ -use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar}; -use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType}; - -#[derive(Clone, Default)] -struct SymbolStatus { - alias: Option, - conflicting: bool, -} - -pub(super) fn extract_simple_aliases( - syntax_grammar: &mut SyntaxGrammar, - lexical_grammar: 
&LexicalGrammar, -) -> AliasMap { - // Determine which symbols in the grammars are *always* aliased to a single name. - let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()]; - let mut non_terminal_status_list = - vec![SymbolStatus::default(); syntax_grammar.variables.len()]; - let mut external_status_list = - vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()]; - for variable in syntax_grammar.variables.iter() { - for production in variable.productions.iter() { - for step in production.steps.iter() { - let mut status = match step.symbol { - Symbol { - kind: SymbolType::External, - index, - } => &mut external_status_list[index], - Symbol { - kind: SymbolType::NonTerminal, - index, - } => &mut non_terminal_status_list[index], - Symbol { - kind: SymbolType::Terminal, - index, - } => &mut terminal_status_list[index], - Symbol { - kind: SymbolType::End, - .. - } => panic!("Unexpected end token"), - }; - - if step.alias.is_none() { - status.alias = None; - status.conflicting = true; - } - - if !status.conflicting { - if status.alias.is_none() { - status.alias = step.alias.clone(); - } else if status.alias != step.alias { - status.alias = None; - status.conflicting = true; - } - } - } - } - } - - // Remove the aliases for those symbols. - for variable in syntax_grammar.variables.iter_mut() { - for production in variable.productions.iter_mut() { - for step in production.steps.iter_mut() { - let status = match step.symbol { - Symbol { - kind: SymbolType::External, - index, - } => &external_status_list[index], - Symbol { - kind: SymbolType::NonTerminal, - index, - } => &non_terminal_status_list[index], - Symbol { - kind: SymbolType::Terminal, - index, - } => &terminal_status_list[index], - Symbol { - kind: SymbolType::End, - .. - } => panic!("Unexpected end token"), - }; - - if status.alias.is_some() { - step.alias = None; - } - } - } - } - - // Populate a map of the symbols to their aliases. 
- let mut result = AliasMap::new(); - for (i, status) in terminal_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::terminal(i), alias); - } - } - for (i, status) in non_terminal_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::non_terminal(i), alias); - } - } - for (i, status) in external_status_list.into_iter().enumerate() { - if let Some(alias) = status.alias { - result.insert(Symbol::external(i), alias); - } - } - result -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::generate::grammars::{ - LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType, - }; - use crate::generate::nfa::Nfa; - - #[test] - fn test_extract_simple_aliases() { - let mut syntax_grammar = SyntaxGrammar { - variables: vec![ - SyntaxVariable { - name: "v1".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), - ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), - ], - }], - }, - SyntaxVariable { - name: "v2".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - // Token 0 is always aliased as "a1". - ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true), - // Token 1 is aliased above, but not here. - ProductionStep::new(Symbol::terminal(1)), - // Token 2 is aliased differently than above. 
- ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), - ], - }], - }, - ], - extra_symbols: Vec::new(), - expected_conflicts: Vec::new(), - variables_to_inline: Vec::new(), - supertype_symbols: Vec::new(), - external_tokens: Vec::new(), - word_token: None, - }; - - let lexical_grammar = LexicalGrammar { - nfa: Nfa::new(), - variables: vec![ - LexicalVariable { - name: "t1".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - LexicalVariable { - name: "t2".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - LexicalVariable { - name: "t3".to_string(), - kind: VariableType::Anonymous, - implicit_precedence: 0, - start_state: 0, - }, - ], - }; - - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); - assert_eq!(simple_aliases.len(), 1); - assert_eq!( - simple_aliases[&Symbol::terminal(0)], - Alias { - value: "a1".to_string(), - is_named: true, - } - ); - - assert_eq!( - syntax_grammar.variables, - vec![ - SyntaxVariable { - name: "v1".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - // 'Simple' alias removed - ProductionStep::new(Symbol::terminal(0)), - // Other aliases unchanged - ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true), - ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true), - ], - },], - }, - SyntaxVariable { - name: "v2".to_owned(), - kind: VariableType::Named, - productions: vec![Production { - dynamic_precedence: 0, - steps: vec![ - ProductionStep::new(Symbol::terminal(0)), - ProductionStep::new(Symbol::terminal(1)), - ProductionStep::new(Symbol::terminal(2)).with_alias("a4", true), - ], - },], - }, - ] - ); - } -} diff --git a/cli/src/generate/prepare_grammar/mod.rs b/cli/src/generate/prepare_grammar/mod.rs index 029483d3..8b094c56 100644 --- a/cli/src/generate/prepare_grammar/mod.rs +++ b/cli/src/generate/prepare_grammar/mod.rs 
@@ -1,6 +1,6 @@ mod expand_repeats; mod expand_tokens; -mod extract_simple_aliases; +mod extract_default_aliases; mod extract_tokens; mod flatten_grammar; mod intern_symbols; @@ -8,7 +8,7 @@ mod process_inlines; use self::expand_repeats::expand_repeats; pub(crate) use self::expand_tokens::expand_tokens; -use self::extract_simple_aliases::extract_simple_aliases; +use self::extract_default_aliases::extract_default_aliases; use self::extract_tokens::extract_tokens; use self::flatten_grammar::flatten_grammar; use self::intern_symbols::intern_symbols; @@ -52,7 +52,7 @@ pub(crate) fn prepare_grammar( let syntax_grammar = expand_repeats(syntax_grammar); let mut syntax_grammar = flatten_grammar(syntax_grammar)?; let lexical_grammar = expand_tokens(lexical_grammar)?; - let simple_aliases = extract_simple_aliases(&mut syntax_grammar, &lexical_grammar); + let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar); let inlines = process_inlines(&syntax_grammar); - Ok((syntax_grammar, lexical_grammar, inlines, simple_aliases)) + Ok((syntax_grammar, lexical_grammar, inlines, default_aliases)) } diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index f7f788d0..58d99cc4 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -65,7 +65,7 @@ struct Generator { keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, - simple_aliases: AliasMap, + default_aliases: AliasMap, symbol_order: HashMap, symbol_ids: HashMap, alias_ids: HashMap, @@ -143,49 +143,6 @@ impl Generator { self.assign_symbol_id(self.parse_table.symbols[i], &mut symbol_identifiers); } - let mut field_names = Vec::new(); - for production_info in &self.parse_table.production_infos { - for field_name in production_info.field_map.keys() { - field_names.push(field_name); - } - - for alias in &production_info.alias_sequence { - if let Some(alias) = &alias { - let alias_kind = alias.kind(); - let matching_symbol = 
self.parse_table.symbols.iter().cloned().find(|symbol| { - let (name, kind) = self.metadata_for_symbol(*symbol); - name == alias.value && kind == alias_kind - }); - let alias_id = if let Some(symbol) = matching_symbol { - self.symbol_ids[&symbol].clone() - } else if alias.is_named { - format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) - } else { - format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) - }; - self.alias_ids.entry(alias.clone()).or_insert(alias_id); - } - } - } - - self.unique_aliases = self - .alias_ids - .keys() - .filter(|alias| { - self.parse_table - .symbols - .iter() - .cloned() - .find(|symbol| { - let (name, kind) = self.metadata_for_symbol(*symbol); - name == alias.value && kind == alias.kind() - }) - .is_none() - }) - .cloned() - .collect(); - self.unique_aliases.sort_unstable(); - self.symbol_map = self .parse_table .symbols @@ -198,10 +155,10 @@ impl Generator { // public-facing symbol. If one of the symbols is not aliased, choose that one // to be the public-facing symbol. Otherwise, pick the symbol with the lowest // numeric value. 
- if let Some(alias) = self.simple_aliases.get(symbol) { + if let Some(alias) = self.default_aliases.get(symbol) { let kind = alias.kind(); for other_symbol in &self.parse_table.symbols { - if let Some(other_alias) = self.simple_aliases.get(other_symbol) { + if let Some(other_alias) = self.default_aliases.get(other_symbol) { if other_symbol < mapping && other_alias == alias { mapping = other_symbol; } @@ -230,13 +187,51 @@ impl Generator { }) .collect(); - field_names.sort_unstable(); - field_names.dedup(); - self.field_names = field_names.into_iter().cloned().collect(); + for production_info in &self.parse_table.production_infos { + // Build a list of all field names + for field_name in production_info.field_map.keys() { + if let Err(i) = self.field_names.binary_search(&field_name) { + self.field_names.insert(i, field_name.clone()); + } + } - // If we are opting in to the new unstable language ABI, then use the concept of - // "small parse states". Otherwise, use the same representation for all parse - // states. + for alias in &production_info.alias_sequence { + // Generate a mapping from aliases to C identifiers. + if let Some(alias) = &alias { + let existing_symbol = self.parse_table.symbols.iter().cloned().find(|symbol| { + if let Some(default_alias) = self.default_aliases.get(symbol) { + default_alias == alias + } else { + let (name, kind) = self.metadata_for_symbol(*symbol); + name == alias.value && kind == alias.kind() + } + }); + + // Some aliases match an existing symbol in the grammar. + let alias_id; + if let Some(existing_symbol) = existing_symbol { + alias_id = self.symbol_ids[&self.symbol_map[&existing_symbol]].clone(); + } + // Other aliases don't match any existing symbol, and need their own identifiers. 
+ else { + if let Err(i) = self.unique_aliases.binary_search(alias) { + self.unique_aliases.insert(i, alias.clone()); + } + + alias_id = if alias.is_named { + format!("alias_sym_{}", self.sanitize_identifier(&alias.value)) + } else { + format!("anon_alias_sym_{}", self.sanitize_identifier(&alias.value)) + }; + } + + self.alias_ids.entry(alias.clone()).or_insert(alias_id); + } + } + } + + // Determine which states should use the "small state" representation, and which should + // use the normal array representation. let threshold = cmp::min(SMALL_STATE_THRESHOLD, self.parse_table.symbols.len() / 2); self.large_state_count = self .parse_table @@ -361,7 +356,7 @@ impl Generator { indent!(self); for symbol in self.parse_table.symbols.iter() { let name = self.sanitize_string( - self.simple_aliases + self.default_aliases .get(symbol) .map(|alias| alias.value.as_str()) .unwrap_or(self.metadata_for_symbol(*symbol).0), @@ -444,7 +439,7 @@ impl Generator { for symbol in &self.parse_table.symbols { add_line!(self, "[{}] = {{", self.symbol_ids[&symbol]); indent!(self); - if let Some(Alias { is_named, .. }) = self.simple_aliases.get(symbol) { + if let Some(Alias { is_named, .. 
}) = self.default_aliases.get(symbol) { add_line!(self, ".visible = true,"); add_line!(self, ".named = {},", is_named); } else { @@ -519,19 +514,22 @@ impl Generator { } fn add_non_terminal_alias_map(&mut self) { - let mut aliases_by_symbol = HashMap::new(); + let mut alias_ids_by_symbol = HashMap::new(); for variable in &self.syntax_grammar.variables { for production in &variable.productions { for step in &production.steps { if let Some(alias) = &step.alias { if step.symbol.is_non_terminal() - && !self.simple_aliases.contains_key(&step.symbol) + && Some(alias) != self.default_aliases.get(&step.symbol) { if self.symbol_ids.contains_key(&step.symbol) { - let alias_ids = - aliases_by_symbol.entry(step.symbol).or_insert(Vec::new()); - if let Err(i) = alias_ids.binary_search(&alias) { - alias_ids.insert(i, alias); + if let Some(alias_id) = self.alias_ids.get(&alias) { + let alias_ids = alias_ids_by_symbol + .entry(step.symbol) + .or_insert(Vec::new()); + if let Err(i) = alias_ids.binary_search(&alias_id) { + alias_ids.insert(i, alias_id); + } } } } @@ -540,19 +538,19 @@ impl Generator { } } - let mut aliases_by_symbol = aliases_by_symbol.iter().collect::>(); - aliases_by_symbol.sort_unstable_by_key(|e| e.0); + let mut alias_ids_by_symbol = alias_ids_by_symbol.iter().collect::>(); + alias_ids_by_symbol.sort_unstable_by_key(|e| e.0); add_line!(self, "static uint16_t ts_non_terminal_alias_map[] = {{"); indent!(self); - for (symbol, aliases) in aliases_by_symbol { + for (symbol, alias_ids) in alias_ids_by_symbol { let symbol_id = &self.symbol_ids[symbol]; let public_symbol_id = &self.symbol_ids[&self.symbol_map[&symbol]]; - add_line!(self, "{}, {},", symbol_id, 1 + aliases.len()); + add_line!(self, "{}, {},", symbol_id, 1 + alias_ids.len()); indent!(self); add_line!(self, "{},", public_symbol_id); - for alias in aliases { - add_line!(self, "{},", &self.alias_ids[&alias]); + for alias_id in alias_ids { + add_line!(self, "{},", alias_id); } dedent!(self); } @@ -1545,7 
+1543,7 @@ impl Generator { /// for keyword capture, if any. /// * `syntax_grammar` - The syntax grammar extracted from the language's grammar /// * `lexical_grammar` - The lexical grammar extracted from the language's grammar -/// * `simple_aliases` - A map describing the global rename rules that should apply. +/// * `default_aliases` - A map describing the global rename rules that should apply. /// the keys are symbols that are *always* aliased in the same way, and the values /// are the aliases that are applied to those symbols. /// * `next_abi` - A boolean indicating whether to opt into the new, unstable parse @@ -1558,7 +1556,7 @@ pub(crate) fn render_c_code( keyword_capture_token: Option, syntax_grammar: SyntaxGrammar, lexical_grammar: LexicalGrammar, - simple_aliases: AliasMap, + default_aliases: AliasMap, next_abi: bool, ) -> String { Generator { @@ -1572,7 +1570,7 @@ pub(crate) fn render_c_code( keyword_capture_token, syntax_grammar, lexical_grammar, - simple_aliases, + default_aliases, symbol_ids: HashMap::new(), symbol_order: HashMap::new(), alias_ids: HashMap::new(), diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 1f7ddaff..082686ac 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -367,6 +367,30 @@ fn test_query_errors_on_impossible_patterns() { }); } +#[test] +fn test_query_verifies_possible_patterns_with_aliased_parent_nodes() { + allocations::record(|| { + let ruby = get_language("ruby"); + + Query::new(ruby, "(destructured_parameter (identifier))").unwrap(); + + assert_eq!( + Query::new(ruby, "(destructured_parameter (string))",), + Err(QueryError { + kind: QueryErrorKind::Structure, + row: 0, + offset: 24, + column: 24, + message: [ + "(destructured_parameter (string))", // + " ^", + ] + .join("\n") + }) + ); + }); +} + #[test] fn test_query_matches_with_simple_pattern() { allocations::record(|| { @@ -1451,6 +1475,7 @@ fn test_query_matches_with_anonymous_tokens() { r#" ";" @punctuation "&&" 
@operator + "\"" @quote "#, ) .unwrap(); @@ -1458,9 +1483,11 @@ fn test_query_matches_with_anonymous_tokens() { assert_query_matches( language, &query, - "foo(a && b);", + r#"foo(a && "b");"#, &[ (1, vec![("operator", "&&")]), + (2, vec![("quote", "\"")]), + (2, vec![("quote", "\"")]), (0, vec![("punctuation", ";")]), ], ); @@ -1808,6 +1835,33 @@ fn test_query_matches_with_no_captures() { }); } +#[test] +fn test_query_matches_with_repeated_fields() { + allocations::record(|| { + let language = get_language("c"); + let query = Query::new( + language, + "(field_declaration declarator: (field_identifier) @field)", + ) + .unwrap(); + + assert_query_matches( + language, + &query, + " + struct S { + int a, b, c; + } + ", + &[ + (0, vec![("field", "a")]), + (0, vec![("field", "b")]), + (0, vec![("field", "c")]), + ], + ); + }); +} + #[test] fn test_query_captures_basic() { allocations::record(|| { diff --git a/docs/index.md b/docs/index.md index 8551d1eb..d9410cc2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,6 +34,7 @@ Parsers for these languages are fairly complete: * [Elm](https://github.com/razzeee/tree-sitter-elm) * [Eno](https://github.com/eno-lang/tree-sitter-eno) * [ERB / EJS](https://github.com/tree-sitter/tree-sitter-embedded-template) +* [Fennel](https://github.com/travonted/tree-sitter-fennel) * [Go](https://github.com/tree-sitter/tree-sitter-go) * [HTML](https://github.com/tree-sitter/tree-sitter-html) * [Java](https://github.com/tree-sitter/tree-sitter-java) @@ -49,6 +50,7 @@ Parsers for these languages are fairly complete: * [TOML](https://github.com/ikatyang/tree-sitter-toml) * [TypeScript](https://github.com/tree-sitter/tree-sitter-typescript) * [Verilog](https://github.com/tree-sitter/tree-sitter-verilog) +* [VHDL](https://github.com/alemuller/tree-sitter-vhdl) * [Vue](https://github.com/ikatyang/tree-sitter-vue) * [YAML](https://github.com/ikatyang/tree-sitter-yaml) * [WASM](https://github.com/wasm-lsp/tree-sitter-wasm) diff --git
a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 77999190..3fc8f04a 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -210,6 +210,7 @@ The following is a complete list of built-in functions you can use in your `gram * **Right Associativity : `prec.right([number], rule)`** - This function is like `prec.left`, but it instructs Tree-sitter to prefer matching a rule that ends *later*. * **Dynamic Precedence : `prec.dynamic(number, rule)`** - This function is similar to `prec`, but the given numerical precedence is applied at *runtime* instead of at parser generation time. This is only necessary when handling a conflict dynamically using the `conflicts` field in the grammar, and when there is a genuine *ambiguity*: multiple rules correctly match a given piece of code. In that event, Tree-sitter compares the total dynamic precedence associated with each rule, and selects the one with the highest total. This is similar to [dynamic precedence directives][bison-dprec] in Bison grammars. * **Tokens : `token(rule)`** - This function marks the given rule as producing only a single token. Tree-sitter's default is to treat each String or RegExp literal in the grammar as a separate token. Each token is matched separately by the lexer and returned as its own leaf node in the tree. The `token` function allows you to express a complex rule using the functions described above (rather than as a single regular expression) but still have Tree-sitter treat it as a single token. +* **Immediate Tokens : `token.immediate(rule)`** - Usually, whitespace (and any other extras, such as comments) is optional before each token. This function marks the given rule as a token that will only match if there is no whitespace immediately before it. * **Aliases : `alias(rule, name)`** - This function causes the given rule to *appear* with an alternative name in the syntax tree.
If `name` is a *symbol*, as in `alias($.foo, $.bar)`, then the aliased rule will *appear* as a [named node][named-vs-anonymous-nodes-section] called `bar`. And if `name` is a *string literal*, as in `alias($.foo, 'bar')`, then the aliased rule will appear as an [anonymous node][named-vs-anonymous-nodes-section], as if the rule had been written as the simple string. * **Field Names : `field(name, rule)`** - This function assigns a *field name* to the child node(s) matched by the given rule. In the resulting syntax tree, you can then use that field name to access specific children. diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 2d132788..8f88966f 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.17.0" +version = "0.17.1" authors = ["Max Brunsfeld "] license = "MIT" readme = "binding_rust/README.md" diff --git a/lib/binding_web/tree-sitter-web.d.ts b/lib/binding_web/tree-sitter-web.d.ts index 7ddae952..092c9353 100644 --- a/lib/binding_web/tree-sitter-web.d.ts +++ b/lib/binding_web/tree-sitter-web.d.ts @@ -37,7 +37,7 @@ declare module 'web-tree-sitter' { export type Logger = ( message: string, - params: {[param: string]: string}, + params: { [param: string]: string }, type: "parse" | "lex" ) => void; @@ -48,9 +48,9 @@ declare module 'web-tree-sitter' { ) => string | null; export interface SyntaxNode { + id: number; tree: Tree; type: string; - isNamed: boolean; text: string; startPosition: Point; endPosition: Point; @@ -74,6 +74,7 @@ declare module 'web-tree-sitter' { hasError(): boolean; equals(other: SyntaxNode): boolean; isMissing(): boolean; + isNamed(): boolean; toString(): string; child(index: number): SyntaxNode | null; namedChild(index: number): SyntaxNode | null; @@ -131,8 +132,33 @@ declare module 'web-tree-sitter' { readonly version: number; readonly fieldCount: number; - fieldNameForId(fieldId: number): string | null - 
fieldIdForName(fieldName: string): number | null + fieldNameForId(fieldId: number): string | null; + fieldIdForName(fieldName: string): number | null; + query(source: string): Query; + } + + interface QueryCapture { + name: string; + node: SyntaxNode; + } + + interface QueryMatch { + pattern: number; + captures: QueryCapture[]; + } + + interface PredicateResult { + operator: string; + operands: { name: string; type: string }[]; + } + + class Query { + captureNames: string[]; + + delete(): void; + matches(node: SyntaxNode, startPosition?: Point, endPosition?: Point): QueryMatch[]; + captures(node: SyntaxNode, startPosition?: Point, endPosition?: Point): QueryCapture[]; + predicatesForPattern(patternIndex: number): PredicateResult[]; } } diff --git a/lib/src/alloc.h b/lib/src/alloc.h index 0e0927a9..6e22a0ab 100644 --- a/lib/src/alloc.h +++ b/lib/src/alloc.h @@ -17,24 +17,28 @@ void *ts_record_realloc(void *, size_t); void ts_record_free(void *); bool ts_toggle_allocation_recording(bool); -static inline void *ts_malloc(size_t size) { - return ts_record_malloc(size); -} - -static inline void *ts_calloc(size_t count, size_t size) { - return ts_record_calloc(count, size); -} - -static inline void *ts_realloc(void *buffer, size_t size) { - return ts_record_realloc(buffer, size); -} - -static inline void ts_free(void *buffer) { - ts_record_free(buffer); -} +#define ts_malloc ts_record_malloc +#define ts_calloc ts_record_calloc +#define ts_realloc ts_record_realloc +#define ts_free ts_record_free #else +// Allow clients to override allocation functions + +#ifndef ts_malloc +#define ts_malloc ts_malloc_default +#endif +#ifndef ts_calloc +#define ts_calloc ts_calloc_default +#endif +#ifndef ts_realloc +#define ts_realloc ts_realloc_default +#endif +#ifndef ts_free +#define ts_free ts_free_default +#endif + #include static inline bool ts_toggle_allocation_recording(bool value) { @@ -42,7 +46,8 @@ static inline bool ts_toggle_allocation_recording(bool value) { return false; } 
-static inline void *ts_malloc(size_t size) { + +static inline void *ts_malloc_default(size_t size) { void *result = malloc(size); if (size > 0 && !result) { fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); @@ -51,7 +56,7 @@ static inline void *ts_malloc(size_t size) { return result; } -static inline void *ts_calloc(size_t count, size_t size) { +static inline void *ts_calloc_default(size_t count, size_t size) { void *result = calloc(count, size); if (count > 0 && !result) { fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); @@ -60,7 +65,7 @@ static inline void *ts_calloc(size_t count, size_t size) { return result; } -static inline void *ts_realloc(void *buffer, size_t size) { +static inline void *ts_realloc_default(void *buffer, size_t size) { void *result = realloc(buffer, size); if (size > 0 && !result) { fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); @@ -69,7 +74,7 @@ static inline void *ts_realloc(void *buffer, size_t size) { return result; } -static inline void ts_free(void *buffer) { +static inline void ts_free_default(void *buffer) { free(buffer); } diff --git a/lib/src/array.h b/lib/src/array.h index 13117194..5ff5580a 100644 --- a/lib/src/array.h +++ b/lib/src/array.h @@ -52,14 +52,24 @@ extern "C" { (self)->size += (count)) #define array_push_all(self, other) \ - array_splice((self), (self)->size, 0, (other)->size, (other)->contents) + array_extend((self), (other)->size, (other)->contents) + +// Append `count` elements to the end of the array, reading their values from the +// `contents` pointer. +#define array_extend(self, count, contents) \ + array__splice( \ + (VoidArray *)(self), array__elem_size(self), (self)->size, \ + 0, count, contents \ + ) // Remove `old_count` elements from the array starting at the given `index`. At // the same index, insert `new_count` new elements, reading their values from the // `new_contents` pointer. 
-#define array_splice(self, index, old_count, new_count, new_contents) \ - array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \ - new_count, new_contents) +#define array_splice(self, index, old_count, new_count, new_contents) \ + array__splice( \ + (VoidArray *)(self), array__elem_size(self), index, \ + old_count, new_count, new_contents \ + ) // Insert one `element` into the array at the given `index`. #define array_insert(self, index, element) \ diff --git a/lib/src/query.c b/lib/src/query.c index ae476c2a..bf0598ce 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -214,6 +214,7 @@ struct TSQuery { Array(TSQueryPredicateStep) predicate_steps; Array(QueryPattern) patterns; Array(StepOffset) step_offsets; + Array(char) string_buffer; const TSLanguage *language; uint16_t wildcard_root_pattern_count; TSSymbol *symbol_map; @@ -439,67 +440,6 @@ static uint16_t symbol_table_insert_name( return self->slices.size - 1; } -static uint16_t symbol_table_insert_name_with_escapes( - SymbolTable *self, - const char *escaped_name, - uint32_t escaped_length -) { - Slice slice = { - .offset = self->characters.size, - .length = 0, - }; - array_grow_by(&self->characters, escaped_length + 1); - - // Copy the contents of the literal into the characters buffer, processing escape - // sequences like \n and \". This needs to be done before checking if the literal - // is already present, in order to do the string comparison. 
- bool is_escaped = false; - for (unsigned i = 0; i < escaped_length; i++) { - const char *src = &escaped_name[i]; - char *dest = &self->characters.contents[slice.offset + slice.length]; - if (is_escaped) { - switch (*src) { - case 'n': - *dest = '\n'; - break; - case 'r': - *dest = '\r'; - break; - case 't': - *dest = '\t'; - break; - case '0': - *dest = '\0'; - break; - default: - *dest = *src; - break; - } - is_escaped = false; - slice.length++; - } else { - if (*src == '\\') { - is_escaped = true; - } else { - *dest = *src; - slice.length++; - } - } - } - - // If the string is already present, remove the redundant content from the characters - // buffer and return the existing id. - int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length); - if (id >= 0) { - self->characters.size -= (escaped_length + 1); - return id; - } - - self->characters.contents[slice.offset + slice.length] = 0; - array_push(&self->slices, slice); - return self->slices.size - 1; -} - /************ * QueryStep ************/ @@ -1393,6 +1333,59 @@ static void ts_query__finalize_steps(TSQuery *self) { } } +static TSQueryError ts_query__parse_string_literal( + TSQuery *self, + Stream *stream +) { + const char *string_start = stream->input; + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + const char *prev_position = stream->input; + + bool is_escaped = false; + array_clear(&self->string_buffer); + for (;;) { + if (is_escaped) { + is_escaped = false; + switch (stream->next) { + case 'n': + array_push(&self->string_buffer, '\n'); + break; + case 'r': + array_push(&self->string_buffer, '\r'); + break; + case 't': + array_push(&self->string_buffer, '\t'); + break; + case '0': + array_push(&self->string_buffer, '\0'); + break; + default: + array_extend(&self->string_buffer, stream->next_size, stream->input); + break; + } + prev_position = stream->input + stream->next_size; + } else { + if (stream->next == '\\') { + 
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + prev_position = stream->input + 1; + is_escaped = true; + } else if (stream->next == '"') { + array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + stream_advance(stream); + return TSQueryErrorNone; + } else if (stream->next == '\n') { + stream_reset(stream, string_start); + return TSQueryErrorSyntax; + } + } + if (!stream_advance(stream)) { + stream_reset(stream, string_start); + return TSQueryErrorSyntax; + } + } +} + // Parse a single predicate associated with a pattern, adding it to the // query's internal `predicate_steps` array. Predicates are arbitrary // S-expressions associated with a pattern which are meant to be handled at @@ -1458,44 +1451,17 @@ static TSQueryError ts_query__parse_predicate( // Parse a string literal else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - bool is_escaped = false; - const char *string_content = stream->input; - for (;;) { - if (is_escaped) { - is_escaped = false; - } else { - if (stream->next == '\\') { - is_escaped = true; - } else if (stream->next == '"') { - break; - } else if (stream->next == '\n') { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node - uint16_t id = symbol_table_insert_name_with_escapes( + TSQueryError e = ts_query__parse_string_literal(self, stream); + if (e) return e; + uint16_t id = symbol_table_insert_name( &self->predicate_values, - string_content, - length + self->string_buffer.contents, + self->string_buffer.size ); array_push(&self->predicate_steps, ((TSQueryPredicateStep) { .type = TSQueryPredicateStepTypeString, .value_id = id, })); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); } 
// Parse a bare symbol @@ -1761,33 +1727,22 @@ static TSQueryError ts_query__parse_pattern( // Parse a double-quoted anonymous leaf node expression else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - const char *string_content = stream->input; - while (stream->next != '"') { - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; + const char *string_start = stream->input; + TSQueryError e = ts_query__parse_string_literal(self, stream); + if (e) return e; // Add a step for the node TSSymbol symbol = ts_language_symbol_for_name( self->language, - string_content, - length, + self->string_buffer.contents, + self->string_buffer.size, false ); if (!symbol) { - stream_reset(stream, string_content); + stream_reset(stream, string_start + 1); return TSQueryErrorNodeType; } array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); } // Parse a field-prefixed pattern @@ -1977,6 +1932,7 @@ TSQuery *ts_query_new( .predicate_steps = array_new(), .patterns = array_new(), .step_offsets = array_new(), + .string_buffer = array_new(), .symbol_map = symbol_map, .wildcard_root_pattern_count = 0, .language = language, @@ -2056,6 +2012,7 @@ TSQuery *ts_query_new( } ts_query__finalize_steps(self); + array_delete(&self->string_buffer); return self; } @@ -2066,6 +2023,7 @@ void ts_query_delete(TSQuery *self) { array_delete(&self->predicate_steps); array_delete(&self->patterns); array_delete(&self->step_offsets); + array_delete(&self->string_buffer); symbol_table_delete(&self->captures); symbol_table_delete(&self->predicate_values); ts_free(self->symbol_map); diff --git a/lib/src/tree_cursor.c b/lib/src/tree_cursor.c index 8af44a34..98b86605 100644 --- a/lib/src/tree_cursor.c +++ b/lib/src/tree_cursor.c @@ -330,7 +330,7 @@ void 
ts_tree_cursor_current_status( } } - #undef subtree_metadata + #undef subtree_symbol if (!ts_subtree_extra(*entry->subtree)) { const TSFieldMapEntry *field_map, *field_map_end; @@ -345,7 +345,6 @@ void ts_tree_cursor_current_status( for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { if (!i->inherited && i->child_index == entry->structural_child_index) { *field_id = i->field_id; - *can_have_later_siblings_with_this_field = false; break; } } @@ -354,9 +353,14 @@ void ts_tree_cursor_current_status( // Determine if the current node can have later siblings with the same field name. if (*field_id) { for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (i->field_id == *field_id && i->child_index > entry->structural_child_index) { - *can_have_later_siblings_with_this_field = true; - break; + if (i->field_id == *field_id) { + if ( + i->child_index > entry->structural_child_index || + (i->child_index == entry->structural_child_index && *has_later_named_siblings) + ) { + *can_have_later_siblings_with_this_field = true; + break; + } } } } diff --git a/test/fixtures/error_corpus/readme.md b/test/fixtures/error_corpus/readme.md new file mode 100644 index 00000000..d8b5da09 --- /dev/null +++ b/test/fixtures/error_corpus/readme.md @@ -0,0 +1,8 @@ +The Error Corpus +================ + +This directory contains corpus tests that exercise error recovery in a variety of languages. + +These corpus tests provide a simple way of asserting that error recoveries are "reasonable" in a variety of situations. But they are also somewhat *overspecified*. It isn't critical that error recovery behaves *exactly* as these tests specify, just that most of the syntax tree is preserved despite the error. + +Sometimes these tests can start failing when changes are pushed to the parser repositories like `tree-sitter-ruby`, `tree-sitter-javascript`, etc. Usually, we just need to tweak the expected syntax tree. 
diff --git a/test/fixtures/error_corpus/ruby_errors.txt b/test/fixtures/error_corpus/ruby_errors.txt index 9c35781c..49dc2b32 100644 --- a/test/fixtures/error_corpus/ruby_errors.txt +++ b/test/fixtures/error_corpus/ruby_errors.txt @@ -14,6 +14,6 @@ c method: (identifier) (ERROR (heredoc_beginning)) arguments: (argument_list - (heredoc_body (heredoc_end)) + (heredoc_body (heredoc_content) (heredoc_end)) (identifier) (MISSING ")"))))