diff --git a/Cargo.lock b/Cargo.lock index cdad3b61..117ac49e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,7 +740,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.16.8" +version = "0.16.9" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0d85952f..52a2ed6b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-cli" description = "CLI tool for developing, testing, and using Tree-sitter parsers" -version = "0.16.8" +version = "0.16.9" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" diff --git a/cli/npm/dsl.d.ts b/cli/npm/dsl.d.ts new file mode 100644 index 00000000..b9bf1c98 --- /dev/null +++ b/cli/npm/dsl.d.ts @@ -0,0 +1,356 @@ +type AliasRule = {type: 'ALIAS'; named: boolean; content: Rule; value: string}; +type BlankRule = {type: 'BLANK'}; +type ChoiceRule = {type: 'CHOICE'; members: Rule[]}; +type FieldRule = {type: 'FIELD'; name: string; content: Rule}; +type ImmediateTokenRule = {type: 'IMMEDIATE_TOKEN'; content: Rule}; +type PatternRule = {type: 'PATTERN'; value: string}; +type PrecDynamicRule = {type: 'PREC_DYNAMIC'; content: Rule; value: number}; +type PrecLeftRule = {type: 'PREC_LEFT'; content: Rule; value: number}; +type PrecRightRule = {type: 'PREC_RIGHT'; content: Rule; value: number}; +type PrecRule = {type: 'PREC'; content: Rule; value: number}; +type Repeat1Rule = {type: 'REPEAT1'; content: Rule}; +type RepeatRule = {type: 'REPEAT'; content: Rule}; +type SeqRule = {type: 'SEQ'; members: Rule[]}; +type StringRule = {type: 'STRING'; value: string}; +type SymbolRule = {type: 'SYMBOL'; name: Name}; +type TokenRule = {type: 'TOKEN'; content: Rule}; + +type Rule = + | AliasRule + | BlankRule + | ChoiceRule + | FieldRule + | ImmediateTokenRule + | PatternRule + | PrecDynamicRule + | PrecLeftRule + | PrecRightRule + | 
PrecRule + | Repeat1Rule + | RepeatRule + | SeqRule + | StringRule + | SymbolRule + | TokenRule; + +type RuleOrLiteral = Rule | RegExp | string; + +type GrammarSymbols = { + [name in RuleName]: SymbolRule; +} & + Record>; + +type RuleBuilder = ( + $: GrammarSymbols, +) => RuleOrLiteral; + +type RuleBuilders< + RuleName extends string, + BaseGrammarRuleName extends string +> = { + [name in RuleName]: RuleBuilder; +}; + +interface Grammar< + RuleName extends string, + BaseGrammarRuleName extends string = never, + Rules extends RuleBuilders = RuleBuilders< + RuleName, + BaseGrammarRuleName + > +> { + /** + * Name of the grammar language. + */ + name: string; + + /** Mapping of grammar rule names to rule builder functions. */ + rules: Rules; + + /** + * An array of arrays of rule names. Each inner array represents a set of + * rules that's involved in an _LR(1) conflict_ that is _intended to exist_ + * in the grammar. When these conflicts occur at runtime, Tree-sitter will + * use the GLR algorithm to explore all of the possible interpretations. If + * _multiple_ parses end up succeeding, Tree-sitter will pick the subtree + * whose corresponding rule has the highest total _dynamic precedence_. + * + * @param $ grammar rules + */ + conflicts?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[][]; + + /** + * An array of token names which can be returned by an _external scanner_. + * External scanners allow you to write custom C code which runs during the + * lexing process in order to handle lexical rules (e.g. Python's indentation + * tokens) that cannot be described by regular expressions. + * + * @param $ grammar rules + * @param previous array of externals from the base schema, if any + * + * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners + */ + externals?: ( + $: Record>, + previous: Rule[], + ) => SymbolRule[]; + + /** + * An array of tokens that may appear anywhere in the language. This + * is often used for whitespace and comments. 
The default value of + * extras is to accept whitespace. To control whitespace explicitly, + * specify extras: `$ => []` in your grammar. + * + * @param $ grammar rules + */ + extras?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * An array of rules that should be automatically removed from the + * grammar by replacing all of their usages with a copy of their definition. + * This is useful for rules that are used in multiple places but for which + * you don't want to create syntax tree nodes at runtime. + * + * @param $ grammar rules + */ + inline?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * A list of hidden rule names that should be considered supertypes in the + * generated node types file. + * + * @param $ grammar rules + * + * @see http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types + */ + supertypes?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * The name of a token that will match keywords for the purpose of the + * keyword extraction optimization. + * + * @param $ grammar rules + * + * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction + */ + word?: ($: GrammarSymbols) => RuleOrLiteral; +} + +type GrammarSchema = { + [K in keyof Grammar]: K extends 'rules' + ? Record + : Grammar[K]; +}; + +/** + * Causes the given rule to appear with an alternative name in the syntax tree. + * For instance with `alias($.foo, 'bar')`, the aliased rule will appear as an + * anonymous node, as if the rule had been written as the simple string. + * + * @param rule rule that will be aliased + * @param name target name for the alias + */ +declare function alias(rule: RuleOrLiteral, name: string): AliasRule; + +/** + * Causes the given rule to appear as an alternative named node, for instance + * with `alias($.foo, $.bar)`, the aliased rule `foo` will appear as a named + * node called `bar`. 
+ * + * @param rule rule that will be aliased + * @param symbol target symbol for the alias + */ +declare function alias( + rule: RuleOrLiteral, + symbol: SymbolRule, +): AliasRule; + +/** + * Creates a blank rule, matching nothing. + */ +declare function blank(): BlankRule; + +/** + * Assigns a field name to the child node(s) matched by the given rule. + * In the resulting syntax tree, you can then use that field name to + * access specific children. + * + * @param name name of the field + * @param rule rule the field should match + */ +declare function field(name: string, rule: RuleOrLiteral): FieldRule; + +/** + * Creates a rule that matches one of a set of possible rules. The order + * of the arguments does not matter. This is analogous to the `|` (pipe) + * operator in EBNF notation. + * + * @param options possible rule choices + */ +declare function choice(...options: RuleOrLiteral[]): ChoiceRule; + +/** + * Creates a rule that matches zero or one occurrence of a given rule. + * It is analogous to the `[x]` (square bracket) syntax in EBNF notation. + * + * @param rule rule to be made optional + */ +declare function optional(rule: RuleOrLiteral): ChoiceRule; + +/** + * Marks the given rule with a numerical precedence which will be used to + * resolve LR(1) conflicts at parser-generation time. When two rules overlap + * in a way that represents either a true ambiguity or a _local_ ambiguity + * given one token of lookahead, Tree-sitter will try to resolve the conflict by + * matching the rule with the higher precedence. The default precedence of all + * rules is zero. This works similarly to the precedence directives in Yacc grammars. 
+ * + * @param number precedence weight + * @param rule rule being weighted + * + * @see https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ +declare const prec: { + (number: number, rule: RuleOrLiteral): PrecRule; + + /** + * Marks the given rule as left-associative (and optionally applies a + * numerical precedence). When an LR(1) conflict arises in which all of the + * rules have the same numerical precedence, Tree-sitter will consult the + * rules' associativity. If there is a left-associative rule, Tree-sitter + * will prefer matching a rule that ends _earlier_. This works similarly to + * associativity directives in Yacc grammars. + * + * @param number (optional) precedence weight + * @param rule rule to mark as left-associative + * + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ + left(rule: RuleOrLiteral): PrecLeftRule; + left(number: number, rule: RuleOrLiteral): PrecLeftRule; + + /** + * Marks the given rule as right-associative (and optionally applies a + * numerical precedence). When an LR(1) conflict arises in which all of the + * rules have the same numerical precedence, Tree-sitter will consult the + * rules' associativity. If there is a right-associative rule, Tree-sitter + * will prefer matching a rule that ends _later_. This works similarly to + * associativity directives in Yacc grammars. + * + * @param number (optional) precedence weight + * @param rule rule to mark as right-associative + * + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ + right(rule: RuleOrLiteral): PrecRightRule; + right(number: number, rule: RuleOrLiteral): PrecRightRule; + + /** + * Marks the given rule with a numerical precedence which will be used to + * resolve LR(1) conflicts at _runtime_ instead of parser-generation time. 
+ * This is only necessary when handling a conflict dynamically using the + * `conflicts` field in the grammar, and when there is a genuine _ambiguity_: + * multiple rules correctly match a given piece of code. In that event, + * Tree-sitter compares the total dynamic precedence associated with each + * rule, and selects the one with the highest total. This is similar to + * dynamic precedence directives in Bison grammars. + * + * @param number precedence weight + * @param rule rule being weighted + * + * @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html + */ + dynamic(number: number, rule: RuleOrLiteral): PrecDynamicRule; +}; + +/** + * Creates a rule that matches _zero-or-more_ occurrences of a given rule. + * It is analogous to the `{x}` (curly brace) syntax in EBNF notation. This + * rule is implemented in terms of `repeat1` but is included because it + * is very commonly used. + * + * @param rule rule to repeat, zero or more times + */ +declare function repeat(rule: RuleOrLiteral): RepeatRule; + +/** + * Creates a rule that matches one-or-more occurrences of a given rule. + * + * @param rule rule to repeat, one or more times + */ +declare function repeat1(rule: RuleOrLiteral): Repeat1Rule; + +/** + * Creates a rule that matches any number of other rules, one after another. + * It is analogous to simply writing multiple symbols next to each other + * in EBNF notation. + * + * @param rules ordered rules that comprise the sequence + */ +declare function seq(...rules: RuleOrLiteral[]): SeqRule; + +/** + * Creates a symbol rule, representing another rule in the grammar by name. + * + * @param name name of the target rule + */ +declare function sym(name: Name): SymbolRule; + +/** + * Marks the given rule as producing only a single token. Tree-sitter's + * default is to treat each String or RegExp literal in the grammar as a + * separate token. 
Each token is matched separately by the lexer and + * returned as its own leaf node in the tree. The token function allows + * you to express a complex rule using the DSL functions (rather + * than as a single regular expression) but still have Tree-sitter treat + * it as a single token. + * + * @param rule rule to represent as a single token + */ +declare const token: { + (rule: RuleOrLiteral): TokenRule; + + /** + * Marks the given rule as producing an immediate token. This allows + * the parser to produce a different token based on whether or not + * there are `extras` preceding the token's main content. When there + * are _no_ leading `extras`, an immediate token is preferred over a + * normal token which would otherwise match. + * + * @param rule rule to represent as an immediate token + */ + immediate(rule: RuleOrLiteral): ImmediateTokenRule; +}; + +/** + * Creates a new language grammar with the provided schema. + * + * @param options grammar options + */ +declare function grammar( + options: Grammar, +): GrammarSchema; + +/** + * Extends an existing language grammar with the provided options, + * creating a new language. 
+ * + * @param baseGrammar base grammar schema to extend from + * @param options grammar options for the new extended language + */ +declare function grammar< + BaseGrammarRuleName extends string, + RuleName extends string +>( + baseGrammar: GrammarSchema, + options: Grammar, +): GrammarSchema; diff --git a/cli/npm/package.json b/cli/npm/package.json index 738c5622..01afe107 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.16.8", + "version": "0.16.9", "author": "Max Brunsfeld", "license": "MIT", "repository": { diff --git a/cli/src/error.rs b/cli/src/error.rs index 4b493019..c30e3647 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -87,7 +87,7 @@ impl<'a> From for Error { impl<'a> From for Error { fn from(error: tree_sitter_tags::Error) -> Self { - Error::new(format!("{:?}", error)) + Error::new(format!("{}", error)) } } diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 5d8f7f0f..aa4801c8 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -199,6 +199,9 @@ impl<'a> Minimizer<'a> { right_state: &ParseState, group_ids_by_state_id: &Vec, ) -> bool { + if left_state.is_non_terminal_extra != right_state.is_non_terminal_extra { + return true; + } for (token, left_entry) in &left_state.terminal_entries { if let Some(right_entry) = right_state.terminal_entries.get(token) { if self.entries_conflict( diff --git a/cli/src/generate/node_types.rs b/cli/src/generate/node_types.rs index 9c3bea64..7a5768a5 100644 --- a/cli/src/generate/node_types.rs +++ b/cli/src/generate/node_types.rs @@ -19,7 +19,7 @@ pub(crate) struct FieldInfo { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct VariableInfo { pub fields: HashMap, - pub child_types: Vec, + pub children: FieldInfo, pub children_without_fields: FieldInfo, pub 
has_multi_step_production: bool, } @@ -70,7 +70,7 @@ impl Default for FieldInfoJSON { impl Default for ChildQuantity { fn default() -> Self { - Self::zero() + Self::one() } } @@ -158,7 +158,7 @@ pub(crate) fn get_variable_info( // Each variable's summary can depend on the summaries of other hidden variables, // and variables can have mutually recursive structure. So we compute the summaries - // iteratively, in a loop that terminates only when more changes are possible. + // iteratively, in a loop that terminates only when no more changes are possible. let mut did_change = true; let mut all_initialized = false; let mut result = vec![VariableInfo::default(); syntax_grammar.variables.len()]; @@ -168,13 +168,14 @@ pub(crate) fn get_variable_info( for (i, variable) in syntax_grammar.variables.iter().enumerate() { let mut variable_info = result[i].clone(); - // Within a variable, consider each production separately. For each - // production, determine which children and fields can occur, and how many - // times they can occur. - for (production_index, production) in variable.productions.iter().enumerate() { - let mut field_quantities = HashMap::new(); - let mut children_without_fields_quantity = ChildQuantity::zero(); - let mut has_uninitialized_invisible_children = false; + // Examine each of the variable's productions. The variable's child types can be + // immediately combined across all productions, but the child quantities must be + // recorded separately for each production. 
+ for production in &variable.productions { + let mut production_field_quantities = HashMap::new(); + let mut production_children_quantity = ChildQuantity::zero(); + let mut production_children_without_fields_quantity = ChildQuantity::zero(); + let mut production_has_uninitialized_invisible_children = false; if production.steps.len() > 1 { variable_info.has_multi_step_production = true; @@ -190,111 +191,97 @@ pub(crate) fn get_variable_info( ChildType::Normal(child_symbol) }; - // Record all of the types of direct children. - did_change |= sorted_vec_insert(&mut variable_info.child_types, &child_type); + let child_is_hidden = !child_type_is_visible(&child_type) + && !syntax_grammar.supertype_symbols.contains(&child_symbol); - // Record all of the field names that occur. + // Maintain the set of all child types for this variable, and the quantity of + // visible children in this production. + did_change |= + extend_sorted(&mut variable_info.children.types, Some(&child_type)); + if !child_is_hidden { + production_children_quantity.append(ChildQuantity::one()); + } + + // Maintain the set of child types associated with each field, and the quantity + // of children associated with each field in this production. if let Some(field_name) = &step.field_name { - // Record how many times each field occurs in this production. - field_quantities + let field_info = variable_info + .fields + .entry(field_name.clone()) + .or_insert(FieldInfo::default()); + did_change |= extend_sorted(&mut field_info.types, Some(&child_type)); + + let production_field_quantity = production_field_quantities .entry(field_name) - .or_insert(ChildQuantity::zero()) - .append(ChildQuantity::one()); + .or_insert(ChildQuantity::zero()); - // Record the types of children for this field. 
- let field_info = - variable_info.fields.entry(field_name.clone()).or_insert({ - let mut info = FieldInfo { - types: Vec::new(), - quantity: ChildQuantity::one(), - }; - - // If this field did *not* occur in an earlier production, - // then it is not required. - if production_index > 0 { - info.quantity.required = false; - } - info - }); - did_change |= sorted_vec_insert(&mut field_info.types, &child_type); - } - // Record named children without fields. - else if child_type_is_named(&child_type) { - // Record how many named children without fields occur in this production. - children_without_fields_quantity.append(ChildQuantity::one()); - - // Record the types of all of the named children without fields. - let children_info = &mut variable_info.children_without_fields; - if children_info.types.is_empty() { - children_info.quantity = ChildQuantity::one(); + // Inherit the types and quantities of hidden children associated with fields. + if child_is_hidden && child_symbol.is_non_terminal() { + let child_variable_info = &result[child_symbol.index]; + did_change |= extend_sorted( + &mut field_info.types, + &child_variable_info.children.types, + ); + production_field_quantity.append(child_variable_info.children.quantity); + } else { + production_field_quantity.append(ChildQuantity::one()); } - did_change |= sorted_vec_insert(&mut children_info.types, &child_type); + } + // Maintain the set of named children without fields within this variable. + else if child_type_is_named(&child_type) { + production_children_without_fields_quantity.append(ChildQuantity::one()); + did_change |= extend_sorted( + &mut variable_info.children_without_fields.types, + Some(&child_type), + ); } - // Inherit information from any hidden children. - if child_symbol.is_non_terminal() - && !syntax_grammar.supertype_symbols.contains(&child_symbol) - && step.alias.is_none() - && !child_type_is_visible(&child_type) - { + // Inherit all child information from hidden children. 
+ if child_is_hidden && child_symbol.is_non_terminal() { let child_variable_info = &result[child_symbol.index]; - // If a hidden child can have multiple children, then this - // node can appear to have multiple children. + // If a hidden child can have multiple children, then its parent node can + // appear to have multiple children. if child_variable_info.has_multi_step_production { variable_info.has_multi_step_production = true; } - // Inherit fields from this hidden child + // If a hidden child has fields, then the parent node can appear to have + // those same fields. for (field_name, child_field_info) in &child_variable_info.fields { - field_quantities + production_field_quantities .entry(field_name) .or_insert(ChildQuantity::zero()) .append(child_field_info.quantity); - let field_info = variable_info - .fields - .entry(field_name.clone()) - .or_insert(FieldInfo { - types: Vec::new(), - quantity: ChildQuantity::one(), - }); - for child_type in &child_field_info.types { - sorted_vec_insert(&mut field_info.types, &child_type); - } + did_change |= extend_sorted( + &mut variable_info + .fields + .entry(field_name.clone()) + .or_insert(FieldInfo::default()) + .types, + &child_field_info.types, + ); } - // Inherit child types from this hidden child - for child_type in &child_variable_info.child_types { - did_change |= - sorted_vec_insert(&mut variable_info.child_types, child_type); - } + // If a hidden child has children, then the parent node can appear to have + // those same children. + production_children_quantity.append(child_variable_info.children.quantity); + did_change |= extend_sorted( + &mut variable_info.children.types, + &child_variable_info.children.types, + ); - // If any field points to this hidden child, inherit child types - // for the field. 
- if let Some(field_name) = &step.field_name { - let field_info = variable_info.fields.get_mut(field_name).unwrap(); - for child_type in &child_variable_info.child_types { - did_change |= sorted_vec_insert(&mut field_info.types, &child_type); - } - } - // Inherit info about children without fields from this hidden child. - else { + // If a hidden child can have named children without fields, then the parent + // node can appear to have those same children. + if step.field_name.is_none() { let grandchildren_info = &child_variable_info.children_without_fields; if !grandchildren_info.types.is_empty() { - children_without_fields_quantity - .append(grandchildren_info.quantity); - - if variable_info.children_without_fields.types.is_empty() { - variable_info.children_without_fields.quantity = - ChildQuantity::one(); - } - - for child_type in &grandchildren_info.types { - did_change |= sorted_vec_insert( - &mut variable_info.children_without_fields.types, - &child_type, - ); - } + production_children_without_fields_quantity + .append(child_variable_info.children_without_fields.quantity); + did_change |= extend_sorted( + &mut variable_info.children_without_fields.types, + &child_variable_info.children_without_fields.types, + ); } } } @@ -302,22 +289,27 @@ pub(crate) fn get_variable_info( // Note whether or not this production contains children whose summaries // have not yet been computed. if child_symbol.index >= i && !all_initialized { - has_uninitialized_invisible_children = true; + production_has_uninitialized_invisible_children = true; } } // If this production's children all have had their summaries initialized, // then expand the quantity information with all of the possibilities introduced // by this production. 
- if !has_uninitialized_invisible_children { + if !production_has_uninitialized_invisible_children { + did_change |= variable_info + .children + .quantity + .union(production_children_quantity); + did_change |= variable_info .children_without_fields .quantity - .union(children_without_fields_quantity); + .union(production_children_without_fields_quantity); for (field_name, info) in variable_info.fields.iter_mut() { did_change |= info.quantity.union( - field_quantities + production_field_quantities .get(field_name) .cloned() .unwrap_or(ChildQuantity::zero()), @@ -352,13 +344,15 @@ pub(crate) fn get_variable_info( // Update all of the node type lists to eliminate hidden nodes. for supertype_symbol in &syntax_grammar.supertype_symbols { result[supertype_symbol.index] - .child_types + .children + .types .retain(child_type_is_visible); } for variable_info in result.iter_mut() { for (_, field_info) in variable_info.fields.iter_mut() { field_info.types.retain(child_type_is_visible); } + variable_info.fields.retain(|_, v| !v.types.is_empty()); variable_info .children_without_fields .types @@ -467,7 +461,8 @@ pub(crate) fn generate_node_types_json( subtypes: None, }); let mut subtypes = info - .child_types + .children + .types .iter() .map(child_type_to_node_type) .collect::>(); @@ -686,16 +681,19 @@ fn variable_type_for_child_type( } } -fn sorted_vec_insert(vec: &mut Vec, value: &T) -> bool +fn extend_sorted<'a, T>(vec: &mut Vec, values: impl IntoIterator) -> bool where T: Clone + Eq + Ord, + T: 'a, { - if let Err(i) = vec.binary_search(&value) { - vec.insert(i, value.clone()); - true - } else { - false - } + values.into_iter().any(|value| { + if let Err(i) = vec.binary_search(&value) { + vec.insert(i, value.clone()); + true + } else { + false + } + }) } #[cfg(test)] @@ -1177,6 +1175,38 @@ mod tests { ); } + #[test] + fn test_node_types_with_fields_on_hidden_tokens() { + let node_types = get_node_types(InputGrammar { + name: String::new(), + extra_symbols: Vec::new(), + 
external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + supertype_symbols: vec![], + variables: vec![Variable { + name: "script".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::field("a".to_string(), Rule::pattern("hi")), + Rule::field("b".to_string(), Rule::pattern("bye")), + ]), + }], + }); + + assert_eq!( + node_types, + [NodeInfoJSON { + kind: "script".to_string(), + named: true, + fields: Some(BTreeMap::new()), + children: None, + subtypes: None + }] + ); + } + #[test] fn test_node_types_with_multiple_rules_same_alias_name() { let node_types = get_node_types(InputGrammar { @@ -1461,6 +1491,71 @@ mod tests { ); } + #[test] + fn test_get_variable_info_with_repetitions_inside_fields() { + let variable_info = get_variable_info( + &build_syntax_grammar( + vec![ + // Field associated with a repetition. + SyntaxVariable { + name: "rule0".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::non_terminal(1)) + .with_field_name("field1")], + }, + Production { + dynamic_precedence: 0, + steps: vec![], + }, + ], + }, + // Repetition node + SyntaxVariable { + name: "_rule0_repeat".to_string(), + kind: VariableType::Hidden, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(1))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(1)), + ], + }, + ], + }, + ], + vec![], + ), + &build_lexical_grammar(), + &AliasMap::new(), + ) + .unwrap(); + + assert_eq!( + variable_info[0].fields, + vec![( + "field1".to_string(), + FieldInfo { + quantity: ChildQuantity { + exists: true, + required: false, + multiple: true, + }, + types: vec![ChildType::Normal(Symbol::terminal(1))], + } + )] + .into_iter() + .collect::>() + ); + } + #[test] fn 
test_get_variable_info_with_inherited_fields() { let variable_info = get_variable_info( diff --git a/cli/src/generate/prepare_grammar/process_inlines.rs b/cli/src/generate/prepare_grammar/process_inlines.rs index 9ef89d75..f83658b2 100644 --- a/cli/src/generate/prepare_grammar/process_inlines.rs +++ b/cli/src/generate/prepare_grammar/process_inlines.rs @@ -127,6 +127,9 @@ impl InlinedProductionMapBuilder { last_inserted_step.associativity = removed_step.associativity; } } + if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() { + production.dynamic_precedence = p.dynamic_precedence; + } production }), ); @@ -226,7 +229,7 @@ mod tests { ], }, Production { - dynamic_precedence: 0, + dynamic_precedence: -2, steps: vec![ProductionStep::new(Symbol::terminal(14))], }, ], @@ -258,7 +261,7 @@ mod tests { ], }, Production { - dynamic_precedence: 0, + dynamic_precedence: -2, steps: vec![ ProductionStep::new(Symbol::terminal(10)), ProductionStep::new(Symbol::terminal(14)), diff --git a/cli/src/loader.rs b/cli/src/loader.rs index cf2eb143..62cc9b62 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -160,7 +160,9 @@ impl Loader { // If multiple language configurations match, then determine which // one to use by applying the configurations' content regexes. 
else { - let file_contents = fs::read_to_string(path)?; + let file_contents = fs::read(path) + .map_err(Error::wrap(|| format!("Failed to read path {:?}", path)))?; + let file_contents = String::from_utf8_lossy(&file_contents); let mut best_score = -2isize; let mut best_configuration_id = None; for configuration_id in configuration_ids { diff --git a/cli/src/main.rs b/cli/src/main.rs index 04cd34cd..2f8c6dd5 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -53,11 +53,12 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("parse") .about("Parse files") + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(1) .multiple(true) - .required(true), + .required(false), ) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("debug").long("debug").short("d")) @@ -79,37 +80,33 @@ fn run() -> error::Result<()> { SubCommand::with_name("query") .about("Search files using a syntax tree query") .arg(Arg::with_name("query-path").index(1).required(true)) + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(2) .multiple(true) - .required(true), + .required(false), + ) + .arg( + Arg::with_name("byte-range") + .help("The range of byte offsets in which the query will be executed") + .long("byte-range") + .takes_value(true), ) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("captures").long("captures").short("c")), ) .subcommand( SubCommand::with_name("tags") - .arg( - Arg::with_name("format") - .short("f") - .long("format") - .value_name("json|protobuf") - .help("Determine output format (default: json)"), - ) + .arg(Arg::with_name("quiet").long("quiet").short("q")) + .arg(Arg::with_name("time").long("time").short("t")) .arg(Arg::with_name("scope").long("scope").takes_value(true)) + 
.arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("inputs") + Arg::with_name("paths") .help("The source file to use") .index(1) - .required(true) .multiple(true), - ) - .arg( - Arg::with_name("v") - .short("v") - .multiple(true) - .help("Sets the level of verbosity"), ), ) .subcommand( @@ -127,11 +124,12 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("highlight") .about("Highlight a file") + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(1) .multiple(true) - .required(true), + .required(false), ) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("html").long("html").short("h")) @@ -230,7 +228,9 @@ fn run() -> error::Result<()> { let timeout = matches .value_of("timeout") .map_or(0, |t| u64::from_str_radix(t, 10).unwrap()); - let paths = collect_paths(matches.values_of("path").unwrap())?; + + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; + let max_path_length = paths.iter().map(|p| p.chars().count()).max().unwrap(); let mut has_error = false; loader.find_all_languages(&config.parser_directories)?; @@ -256,31 +256,36 @@ fn run() -> error::Result<()> { } } else if let Some(matches) = matches.subcommand_matches("query") { let ordered_captures = matches.values_of("captures").is_some(); - let paths = matches - .values_of("path") - .unwrap() - .into_iter() - .map(Path::new) - .collect::>(); + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; loader.find_all_languages(&config.parser_directories)?; let language = select_language( &mut loader, - paths[0], + Path::new(&paths[0]), ¤t_dir, matches.value_of("scope"), )?; let query_path = Path::new(matches.value_of("query-path").unwrap()); - query::query_files_at_paths(language, paths, query_path, ordered_captures)?; + let range = matches.value_of("byte-range").map(|br| { + 
let r: Vec<&str> = br.split(":").collect(); + (r[0].parse().unwrap(), r[1].parse().unwrap()) + }); + query::query_files_at_paths(language, paths, query_path, ordered_captures, range)?; } else if let Some(matches) = matches.subcommand_matches("tags") { loader.find_all_languages(&config.parser_directories)?; - let paths = collect_paths(matches.values_of("inputs").unwrap())?; - tags::generate_tags(&loader, matches.value_of("scope"), &paths)?; + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; + tags::generate_tags( + &loader, + matches.value_of("scope"), + &paths, + matches.is_present("quiet"), + matches.is_present("time"), + )?; } else if let Some(matches) = matches.subcommand_matches("highlight") { loader.configure_highlights(&config.theme.highlight_names); loader.find_all_languages(&config.parser_directories)?; let time = matches.is_present("time"); - let paths = collect_paths(matches.values_of("path").unwrap())?; + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; let html_mode = matches.is_present("html"); if html_mode { println!("{}", highlight::HTML_HEADER); @@ -353,39 +358,58 @@ fn run() -> error::Result<()> { Ok(()) } -fn collect_paths<'a>(paths: impl Iterator) -> error::Result> { - let mut result = Vec::new(); +fn collect_paths<'a>( + paths_file: Option<&str>, + paths: Option>, +) -> error::Result> { + if let Some(paths_file) = paths_file { + return Ok(fs::read_to_string(paths_file) + .map_err(Error::wrap(|| { + format!("Failed to read paths file {}", paths_file) + }))? 
+ .trim() + .split_ascii_whitespace() + .map(String::from) + .collect::>()); + } - let mut incorporate_path = |path: &str, positive| { - if positive { - result.push(path.to_string()); - } else { - if let Some(index) = result.iter().position(|p| p == path) { - result.remove(index); + if let Some(paths) = paths { + let mut result = Vec::new(); + + let mut incorporate_path = |path: &str, positive| { + if positive { + result.push(path.to_string()); + } else { + if let Some(index) = result.iter().position(|p| p == path) { + result.remove(index); + } } - } - }; + }; - for mut path in paths { - let mut positive = true; - if path.starts_with("!") { - positive = false; - path = path.trim_start_matches("!"); - } + for mut path in paths { + let mut positive = true; + if path.starts_with("!") { + positive = false; + path = path.trim_start_matches("!"); + } - if Path::new(path).exists() { - incorporate_path(path, positive); - } else { - let paths = - glob(path).map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?; - for path in paths { - if let Some(path) = path?.to_str() { - incorporate_path(path, positive); + if Path::new(path).exists() { + incorporate_path(path, positive); + } else { + let paths = glob(path) + .map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?; + for path in paths { + if let Some(path) = path?.to_str() { + incorporate_path(path, positive); + } } } } + + return Ok(result); } - Ok(result) + + Err(Error::new("Must provide one or more paths".to_string())) } fn select_language( diff --git a/cli/src/query.rs b/cli/src/query.rs index 47242273..e71e6254 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -6,9 +6,10 @@ use tree_sitter::{Language, Node, Parser, Query, QueryCursor}; pub fn query_files_at_paths( language: Language, - paths: Vec<&Path>, + paths: Vec, query_path: &Path, ordered_captures: bool, + range: Option<(usize, usize)>, ) -> Result<()> { let stdout = io::stdout(); let mut stdout = stdout.lock(); @@ -20,14 
+21,17 @@ pub fn query_files_at_paths( .map_err(|e| Error::new(format!("Query compilation failed: {:?}", e)))?; let mut query_cursor = QueryCursor::new(); + if let Some((beg, end)) = range { + query_cursor.set_byte_range(beg, end); + } let mut parser = Parser::new(); parser.set_language(language).map_err(|e| e.to_string())?; for path in paths { - writeln!(&mut stdout, "{}", path.to_str().unwrap())?; + writeln!(&mut stdout, "{}", path)?; - let source_code = fs::read(path).map_err(Error::wrap(|| { + let source_code = fs::read(&path).map_err(Error::wrap(|| { format!("Error reading source file {:?}", path) }))?; let text_callback = |n: Node| &source_code[n.byte_range()]; diff --git a/cli/src/tags.rs b/cli/src/tags.rs index d6704ec5..122b58d2 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -3,10 +3,17 @@ use super::util; use crate::error::{Error, Result}; use std::io::{self, Write}; use std::path::Path; +use std::time::Instant; use std::{fs, str}; use tree_sitter_tags::TagsContext; -pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> Result<()> { +pub fn generate_tags( + loader: &Loader, + scope: Option<&str>, + paths: &[String], + quiet: bool, + time: bool, +) -> Result<()> { let mut lang = None; if let Some(scope) = scope { lang = loader.language_configuration_for_scope(scope)?; @@ -34,28 +41,50 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> }; if let Some(tags_config) = language_config.tags_config(language)? { - let path_str = format!("{:?}", path); - writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; + let indent; + if paths.len() > 1 { + if !quiet { + writeln!(&mut stdout, "{}", path.to_string_lossy())?; + } + indent = "\t" + } else { + indent = ""; + }; let source = fs::read(path)?; - for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? 
{ + let t0 = Instant::now(); + for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))?.0 { let tag = tag?; - write!( - &mut stdout, - " {:<8} {:<40}\t{:>9}-{:<9}", - tag.kind, - str::from_utf8(&source[tag.name_range]).unwrap_or(""), - tag.span.start, - tag.span.end, - )?; - if let Some(docs) = tag.docs { - if docs.len() > 120 { - write!(&mut stdout, "\t{:?}...", &docs[0..120])?; - } else { - write!(&mut stdout, "\t{:?}", &docs)?; + if !quiet { + write!( + &mut stdout, + "{}{:<10}\t | {:<8}\t{} {} - {} `{}`", + indent, + str::from_utf8(&source[tag.name_range]).unwrap_or(""), + &tags_config.syntax_type_name(tag.syntax_type_id), + if tag.is_definition { "def" } else { "ref" }, + tag.span.start, + tag.span.end, + str::from_utf8(&source[tag.line_range]).unwrap_or(""), + )?; + if let Some(docs) = tag.docs { + if docs.len() > 120 { + write!(&mut stdout, "\t{:?}...", &docs[0..120])?; + } else { + write!(&mut stdout, "\t{:?}", &docs)?; + } } + writeln!(&mut stdout, "")?; } - writeln!(&mut stdout, "")?; + } + + if time { + writeln!( + &mut stdout, + "{}time: {}ms", + indent, + t0.elapsed().as_millis(), + )?; } } else { eprintln!("No tags config found for path {:?}", path); diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index ac54db00..24e8160e 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -3,6 +3,7 @@ mod helpers; mod highlight_test; mod node_test; mod parser_test; +mod pathological_test; mod query_test; mod tags_test; mod test_highlight_test; diff --git a/cli/src/tests/pathological_test.rs b/cli/src/tests/pathological_test.rs new file mode 100644 index 00000000..7ebd5439 --- /dev/null +++ b/cli/src/tests/pathological_test.rs @@ -0,0 +1,15 @@ +use super::helpers::allocations; +use super::helpers::fixtures::get_language; +use tree_sitter::Parser; + +#[test] +fn test_pathological_example_1() { + let language = "cpp"; + let source = r#"*ss(qqX>(), + ); + }); +} + #[test] fn 
test_query_matches_with_leading_zero_or_more_repeated_leaf_nodes() { allocations::record(|| { @@ -1161,6 +1210,43 @@ fn test_query_matches_with_too_many_permutations_to_track() { }); } +#[test] +fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() { + allocations::record(|| { + let language = get_language("javascript"); + let query = Query::new( + language, + " + ( + (comment) @doc + ; not immediate + (class_declaration) @class + ) + + (call_expression + function: [ + (identifier) @function + (member_expression property: (property_identifier) @method) + ]) + ", + ) + .unwrap(); + + let source = "/* hi */ a.b(); ".repeat(50); + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + let mut cursor = QueryCursor::new(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(&source)); + + assert_eq!( + collect_matches(matches, &query, source.as_str()), + vec![(1, vec![("method", "b")]); 50], + ); + }); +} + #[test] fn test_query_matches_with_anonymous_tokens() { allocations::record(|| { @@ -1215,6 +1301,45 @@ fn test_query_matches_within_byte_range() { }); } +#[test] +fn test_query_captures_within_byte_range() { + allocations::record(|| { + let language = get_language("c"); + let query = Query::new( + language, + " + (call_expression + function: (identifier) @function + arguments: (argument_list (string_literal) @string.arg)) + + (string_literal) @string + ", + ) + .unwrap(); + + let source = r#"DEFUN ("safe-length", Fsafe_length, Ssafe_length, 1, 1, 0)"#; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + + let mut cursor = QueryCursor::new(); + let captures = + cursor + .set_byte_range(3, 27) + .captures(&query, tree.root_node(), to_callback(source)); + + assert_eq!( + collect_captures(captures, &query, source), + &[ + ("function", "DEFUN"), + ("string.arg", "\"safe-length\""), 
+ ("string", "\"safe-length\""), + ] + ); + }); +} + #[test] fn test_query_matches_different_queries_same_cursor() { allocations::record(|| { @@ -1420,12 +1545,17 @@ fn test_query_captures_with_text_conditions() { ((identifier) @function.builtin (#eq? @function.builtin "require")) - (identifier) @variable + ((identifier) @variable + (#not-match? @variable "^(lambda|load)$")) "#, ) .unwrap(); let source = " + toad + load + panda + lambda const ab = require('./ab'); new Cd(EF); "; @@ -1439,6 +1569,8 @@ fn test_query_captures_with_text_conditions() { assert_eq!( collect_captures(captures, &query, source), &[ + ("variable", "toad"), + ("variable", "panda"), ("variable", "ab"), ("function.builtin", "require"), ("variable", "require"), @@ -2074,6 +2206,39 @@ fn test_query_disable_pattern() { }); } +#[test] +fn test_query_alternative_predicate_prefix() { + allocations::record(|| { + let language = get_language("c"); + let query = Query::new( + language, + r#" + ((call_expression + function: (identifier) @keyword + arguments: (argument_list + (string_literal) @function)) + (.eq? @keyword "DEFUN")) + "#, + ) + .unwrap(); + let source = r#" + DEFUN ("identity", Fidentity, Sidentity, 1, 1, 0, + doc: /* Return the argument unchanged. 
*/ + attributes: const) + (Lisp_Object arg) + { + return arg; + } + "#; + assert_query_matches( + language, + &query, + source, + &[(0, vec![("keyword", "DEFUN"), ("function", "\"identity\"")])], + ); + }); +} + #[test] fn test_query_is_definite() { struct Row { @@ -2086,10 +2251,7 @@ fn test_query_is_definite() { Row { language: get_language("python"), pattern: r#"(expression_statement (string))"#, - results_by_symbol: &[ - ("expression_statement", false), - ("string", false), - ], + results_by_symbol: &[("expression_statement", false), ("string", false)], }, Row { language: get_language("javascript"), @@ -2102,30 +2264,17 @@ fn test_query_is_definite() { Row { language: get_language("javascript"), pattern: r#"(object "{" "}")"#, - results_by_symbol: &[ - ("object", false), - ("{", true), - ("}", true), - ], + results_by_symbol: &[("object", false), ("{", true), ("}", true)], }, Row { language: get_language("javascript"), pattern: r#"(pair (property_identifier) ":")"#, - results_by_symbol: &[ - ("pair", false), - ("property_identifier", false), - (":", true), - ], + results_by_symbol: &[("pair", false), ("property_identifier", false), (":", true)], }, Row { language: get_language("javascript"), pattern: r#"(object "{" (_) "}")"#, - results_by_symbol: &[ - ("object", false), - ("{", false), - ("", false), - ("}", true), - ], + results_by_symbol: &[("object", false), ("{", false), ("", false), ("}", true)], }, Row { language: get_language("javascript"), diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index fad8ebd8..2b058c0b 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -1,73 +1,81 @@ use super::helpers::allocations; use super::helpers::fixtures::{get_language, get_language_queries_path}; +use std::ffi::CStr; use std::ffi::CString; use std::{fs, ptr, slice, str}; +use tree_sitter::Point; use tree_sitter_tags::c_lib as c; -use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext}; +use 
tree_sitter_tags::{Error, TagsConfiguration, TagsContext}; const PYTHON_TAG_QUERY: &'static str = r#" ( - (function_definition - name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @function - (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") + (function_definition + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @definition.function + (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (function_definition - name: (identifier) @name) @function + name: (identifier) @name) @definition.function ( - (class_definition - name: (identifier) @name - body: (block - . (expression_statement (string) @doc))) @class - (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") + (class_definition + name: (identifier) @name + body: (block + . (expression_statement (string) @doc))) @definition.class + (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (class_definition - name: (identifier) @name) @class + name: (identifier) @name) @definition.class (call - function: (identifier) @name) @call + function: (identifier) @name) @reference.call + +(call + function: (attribute + attribute: (identifier) @name)) @reference.call "#; const JS_TAG_QUERY: &'static str = r#" ( (comment)* @doc . (class_declaration - name: (identifier) @name) @class - (#select-adjacent! @doc @class) + name: (identifier) @name) @definition.class + (#select-adjacent! @doc @definition.class) (#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) ( (comment)* @doc . (method_definition - name: (property_identifier) @name) @method - (#select-adjacent! @doc @method) + name: (property_identifier) @name) @definition.method + (#select-adjacent! @doc @definition.method) (#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) ( (comment)* @doc . (function_declaration - name: (identifier) @name) @function - (#select-adjacent! @doc @function) + name: (identifier) @name) @definition.function + (#select-adjacent! @doc @definition.function) (#strip! 
@doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) (call_expression - function: (identifier) @name) @call + function: (identifier) @name) @reference.call "#; const RUBY_TAG_QUERY: &'static str = r#" (method - name: (identifier) @name) @method + name: (_) @name) @definition.method (method_call - method: (identifier) @name) @call + method: (identifier) @name) @reference.call -((identifier) @name @call +(setter (identifier) @ignore) + +((identifier) @name @reference.call (#is-not? local)) "#; @@ -94,25 +102,26 @@ fn test_tags_python() { let tags = tag_context .generate_tags(&tags_config, source, None) .unwrap() + .0 .collect::, _>>() .unwrap(); assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.kind)) + .map(|t| ( + substr(source, &t.name_range), + tags_config.syntax_type_name(t.syntax_type_id) + )) .collect::>(), &[ - ("Customer", TagKind::Class), - ("age", TagKind::Function), - ("compute_age", TagKind::Call), + ("Customer", "class"), + ("age", "function"), + ("compute_age", "call"), ] ); - assert_eq!(substr(source, &tags[0].line_range), " class Customer:"); - assert_eq!( - substr(source, &tags[1].line_range), - " def age(self):" - ); + assert_eq!(substr(source, &tags[0].line_range), "class Customer:"); + assert_eq!(substr(source, &tags[1].line_range), "def age(self):"); assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer"); assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); } @@ -145,17 +154,22 @@ fn test_tags_javascript() { let tags = tag_context .generate_tags(&tags_config, source, None) .unwrap() + .0 .collect::, _>>() .unwrap(); assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.kind)) + .map(|t| ( + substr(source, &t.name_range), + t.span.clone(), + tags_config.syntax_type_name(t.syntax_type_id) + )) .collect::>(), &[ - ("Customer", TagKind::Class), - ("getAge", TagKind::Method), - ("Agent", TagKind::Class) + ("Customer", Point::new(5, 10)..Point::new(5, 18), "class",), + ("getAge", Point::new(9, 
8)..Point::new(9, 14), "method",), + ("Agent", Point::new(15, 10)..Point::new(15, 15), "class",) ] ); assert_eq!( @@ -166,6 +180,27 @@ fn test_tags_javascript() { assert_eq!(tags[2].docs, None); } +#[test] +fn test_tags_columns_measured_in_utf16_code_units() { + let language = get_language("python"); + let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); + let mut tag_context = TagsContext::new(); + + let source = r#""ā¤ļøā¤ļøā¤ļø".hello_α_ω()"#.as_bytes(); + + let tag = tag_context + .generate_tags(&tags_config, source, None) + .unwrap() + .0 + .next() + .unwrap() + .unwrap(); + + assert_eq!(substr(source, &tag.name_range), "hello_α_ω"); + assert_eq!(tag.span, Point::new(0, 21)..Point::new(0, 32)); + assert_eq!(tag.utf16_column_range, 9..18); +} + #[test] fn test_tags_ruby() { let language = get_language("ruby"); @@ -177,7 +212,7 @@ fn test_tags_ruby() { " b = 1 - def foo() + def foo=() c = 1 # a is a method because it is not in scope @@ -197,6 +232,7 @@ fn test_tags_ruby() { let tags = tag_context .generate_tags(&tags_config, source.as_bytes(), None) .unwrap() + .0 .collect::, _>>() .unwrap(); @@ -204,18 +240,18 @@ fn test_tags_ruby() { tags.iter() .map(|t| ( substr(source.as_bytes(), &t.name_range), - t.kind, + tags_config.syntax_type_name(t.syntax_type_id), (t.span.start.row, t.span.start.column), )) .collect::>(), &[ - ("foo", TagKind::Method, (2, 0)), - ("bar", TagKind::Call, (7, 4)), - ("a", TagKind::Call, (7, 8)), - ("b", TagKind::Call, (7, 11)), - ("each", TagKind::Call, (9, 14)), - ("baz", TagKind::Call, (13, 8)), - ("b", TagKind::Call, (13, 15),), + ("foo=", "method", (2, 4)), + ("bar", "call", (7, 4)), + ("a", "call", (7, 8)), + ("b", "call", (7, 11)), + ("each", "call", (9, 14)), + ("baz", "call", (13, 8)), + ("b", "call", (13, 15),), ] ); } @@ -239,7 +275,7 @@ fn test_tags_cancellation() { .generate_tags(&tags_config, source.as_bytes(), Some(&cancellation_flag)) .unwrap(); - for (i, tag) in tags.enumerate() { + for 
(i, tag) in tags.0.enumerate() { if i == 150 { cancellation_flag.store(1, Ordering::SeqCst); } @@ -253,6 +289,47 @@ fn test_tags_cancellation() { }); } +#[test] +fn test_invalid_capture() { + let language = get_language("python"); + let e = TagsConfiguration::new(language, "(identifier) @method", "") + .expect_err("expected InvalidCapture error"); + assert_eq!(e, Error::InvalidCapture("method".to_string())); +} + +#[test] +fn test_tags_with_parse_error() { + let language = get_language("python"); + let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); + let mut tag_context = TagsContext::new(); + + let source = br#" + class Fine: pass + class Bad + "#; + + let (tags, failed) = tag_context + .generate_tags(&tags_config, source, None) + .unwrap(); + + let newtags = tags.collect::, _>>().unwrap(); + + assert!(failed, "syntax error should have been detected"); + + assert_eq!( + newtags.iter() + .map(|t| ( + substr(source, &t.name_range), + tags_config.syntax_type_name(t.syntax_type_id) + )) + .collect::>(), + &[ + ("Fine", "class"), + ] + ); +} + + #[test] fn test_tags_via_c_api() { allocations::record(|| { @@ -316,29 +393,29 @@ fn test_tags_via_c_api() { }) .unwrap(); + let syntax_types: Vec<&str> = unsafe { + let mut len: u32 = 0; + let ptr = + c::ts_tagger_syntax_kinds_for_scope_name(tagger, c_scope_name.as_ptr(), &mut len); + slice::from_raw_parts(ptr, len as usize) + .iter() + .map(|i| CStr::from_ptr(*i).to_str().unwrap()) + .collect() + }; + assert_eq!( tags.iter() .map(|tag| ( - tag.kind, + syntax_types[tag.syntax_type_id as usize], &source_code[tag.name_start_byte as usize..tag.name_end_byte as usize], &source_code[tag.line_start_byte as usize..tag.line_end_byte as usize], &docs[tag.docs_start_byte as usize..tag.docs_end_byte as usize], )) .collect::>(), &[ - ( - c::TSTagKind::Function, - "b", - "function b() {", - "one\ntwo\nthree" - ), - ( - c::TSTagKind::Class, - "C", - "class C extends D {", - "four\nfive" - ), - 
(c::TSTagKind::Call, "b", "b(a);", "") + ("function", "b", "function b() {", "one\ntwo\nthree"), + ("class", "C", "class C extends D {", "four\nfive"), + ("call", "b", "b(a);", "") ] ); diff --git a/cli/src/util.rs b/cli/src/util.rs index 8978ecc1..9f941f62 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -1,3 +1,4 @@ +use super::error::{Error, Result}; use std::io; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -31,12 +32,12 @@ pub struct LogSession(); pub struct LogSession(PathBuf, Option, Option); #[cfg(windows)] -pub fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result { +pub fn log_graphs(_parser: &mut Parser, _path: &str) -> Result { Ok(LogSession()) } #[cfg(unix)] -pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { +pub fn log_graphs(parser: &mut Parser, path: &str) -> Result { use std::io::Write; let mut dot_file = std::fs::File::create(path)?; @@ -46,11 +47,13 @@ pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result; fn next(&mut self) -> Option { - loop { + 'main: loop { // If we've already determined the next highlight boundary, just return it. if let Some(e) = self.next_event.take() { return Some(Ok(e)); @@ -640,29 +642,34 @@ where // If none of the layers have any more highlight boundaries, terminate. if self.layers.is_empty() { - if self.byte_offset < self.source.len() { + return if self.byte_offset < self.source.len() { let result = Some(Ok(HighlightEvent::Source { start: self.byte_offset, end: self.source.len(), })); self.byte_offset = self.source.len(); - return result; + result } else { - return None; - } + None + }; } // Get the next capture from whichever layer has the earliest highlight boundary. 
- let match_; - let mut captures; - let mut capture; - let mut pattern_index; + let range; let layer = &mut self.layers[0]; - if let Some((m, capture_index)) = layer.captures.peek() { - match_ = m; - captures = match_.captures; - pattern_index = match_.pattern_index; - capture = captures[*capture_index]; + if let Some((next_match, capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*capture_index]; + range = next_capture.node.byte_range(); + + // If any previous highlight ends before this node starts, then before + // processing this capture, emit the source code up until the end of the + // previous highlight, and an end event for that highlight. + if let Some(end_byte) = layer.highlight_end_stack.last().cloned() { + if end_byte <= range.start { + layer.highlight_end_stack.pop(); + return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd)); + } + } } // If there are no more captures, then emit any remaining highlight end events. // And if there are none of those, then just advance to the end of the document. @@ -673,30 +680,17 @@ where return self.emit_event(self.source.len(), None); }; - // If any previous highlight ends before this node starts, then before - // processing this capture, emit the source code up until the end of the - // previous highlight, and an end event for that highlight. - let range = capture.node.byte_range(); - if let Some(end_byte) = layer.highlight_end_stack.last().cloned() { - if end_byte <= range.start { - layer.highlight_end_stack.pop(); - return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd)); - } - } - - // Remove from the local scope stack any local scopes that have already ended. - while range.start > layer.scope_stack.last().unwrap().range.end { - layer.scope_stack.pop(); - } + let (mut match_, capture_index) = layer.captures.next().unwrap(); + let mut capture = match_.captures[capture_index]; // If this capture represents an injection, then process the injection. 
- if pattern_index < layer.config.locals_pattern_index { + if match_.pattern_index < layer.config.locals_pattern_index { let (language_name, content_node, include_children) = - injection_for_match(&layer.config, &layer.config.query, match_, &self.source); + injection_for_match(&layer.config, &layer.config.query, &match_, &self.source); // Explicitly remove this match so that none of its other captures will remain - // in the stream of captures. The `unwrap` is ok because - layer.captures.next().unwrap().0.remove(); + // in the stream of captures. + match_.remove(); // If a language is found with the given name, then add a new language layer // to the highlighted document. @@ -729,16 +723,19 @@ where } self.sort_layers(); - continue; + continue 'main; } - layer.captures.next(); + // Remove from the local scope stack any local scopes that have already ended. + while range.start > layer.scope_stack.last().unwrap().range.end { + layer.scope_stack.pop(); + } // If this capture is for tracking local variables, then process the // local variable info. let mut reference_highlight = None; let mut definition_highlight = None; - while pattern_index < layer.config.highlights_pattern_index { + while match_.pattern_index < layer.config.highlights_pattern_index { // If the node represents a local scope, push a new local scope onto // the scope stack. 
if Some(capture.index) == layer.config.local_scope_capture_index { @@ -748,7 +745,7 @@ where range: range.clone(), local_defs: Vec::new(), }; - for prop in layer.config.query.property_settings(pattern_index) { + for prop in layer.config.query.property_settings(match_.pattern_index) { match prop.key.as_ref() { "local.scope-inherits" => { scope.inherits = @@ -767,7 +764,7 @@ where let scope = layer.scope_stack.last_mut().unwrap(); let mut value_range = 0..0; - for capture in captures { + for capture in match_.captures { if Some(capture.index) == layer.config.local_def_value_capture_index { value_range = capture.node.byte_range(); } @@ -810,84 +807,76 @@ where } } - // Continue processing any additional local-variable-tracking patterns - // for the same node. + // Continue processing any additional matches for the same node. if let Some((next_match, next_capture_index)) = layer.captures.peek() { let next_capture = next_match.captures[*next_capture_index]; if next_capture.node == capture.node { - pattern_index = next_match.pattern_index; - captures = next_match.captures; capture = next_capture; - layer.captures.next(); + match_ = layer.captures.next().unwrap().0; continue; - } else { - break; } } - break; + self.sort_layers(); + continue 'main; } // Otherwise, this capture must represent a highlight. - let mut has_highlight = true; - // If this exact range has already been highlighted by an earlier pattern, or by // a different layer, then skip over this one. if let Some((last_start, last_end, last_depth)) = self.last_highlight_range { if range.start == last_start && range.end == last_end && layer.depth < last_depth { - has_highlight = false; + self.sort_layers(); + continue 'main; } } // If the current node was found to be a local variable, then skip over any // highlighting patterns that are disabled for local variables. 
- while has_highlight - && (definition_highlight.is_some() || reference_highlight.is_some()) - && layer.config.non_local_variable_patterns[pattern_index] - { - has_highlight = false; - if let Some((next_match, next_capture_index)) = layer.captures.peek() { - let next_capture = next_match.captures[*next_capture_index]; - if next_capture.node == capture.node { - capture = next_capture; - has_highlight = true; - pattern_index = next_match.pattern_index; - layer.captures.next(); - continue; + if definition_highlight.is_some() || reference_highlight.is_some() { + while layer.config.non_local_variable_patterns[match_.pattern_index] { + if let Some((next_match, next_capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*next_capture_index]; + if next_capture.node == capture.node { + capture = next_capture; + match_ = layer.captures.next().unwrap().0; + continue; + } } + + self.sort_layers(); + continue 'main; } - break; } - if has_highlight { - // Once a highlighting pattern is found for the current node, skip over - // any later highlighting patterns that also match this node. Captures - // for a given node are ordered by pattern index, so these subsequent - // captures are guaranteed to be for highlighting, not injections or - // local variables. - while let Some((next_match, next_capture_index)) = layer.captures.peek() { - if next_match.captures[*next_capture_index].node == capture.node { - layer.captures.next(); - } else { - break; - } + // Once a highlighting pattern is found for the current node, skip over + // any later highlighting patterns that also match this node. Captures + // for a given node are ordered by pattern index, so these subsequent + // captures are guaranteed to be for highlighting, not injections or + // local variables. 
+ while let Some((next_match, next_capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*next_capture_index]; + if next_capture.node == capture.node { + layer.captures.next(); + } else { + break; } + } - let current_highlight = layer.config.highlight_indices[capture.index as usize]; + let current_highlight = layer.config.highlight_indices[capture.index as usize]; - // If this node represents a local definition, then store the current - // highlight value on the local scope entry representing this node. - if let Some(definition_highlight) = definition_highlight { - *definition_highlight = current_highlight; - } + // If this node represents a local definition, then store the current + // highlight value on the local scope entry representing this node. + if let Some(definition_highlight) = definition_highlight { + *definition_highlight = current_highlight; + } - // Emit a scope start event and push the node's end position to the stack. - if let Some(highlight) = reference_highlight.or(current_highlight) { - self.last_highlight_range = Some((range.start, range.end, layer.depth)); - layer.highlight_end_stack.push(range.end); - return self - .emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight))); - } + // Emit a scope start event and push the node's end position to the stack. 
+ if let Some(highlight) = reference_highlight.or(current_highlight) { + self.last_highlight_range = Some((range.start, range.end, layer.depth)); + layer.highlight_end_stack.push(range.end); + return self + .emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight))); } self.sort_layers(); @@ -897,11 +886,13 @@ where impl HtmlRenderer { pub fn new() -> Self { - HtmlRenderer { - html: Vec::new(), - line_offsets: vec![0], + let mut result = HtmlRenderer { + html: Vec::with_capacity(BUFFER_HTML_RESERVE_CAPACITY), + line_offsets: Vec::with_capacity(BUFFER_LINES_RESERVE_CAPACITY), carriage_return_highlight: None, - } + }; + result.line_offsets.push(0); + result } pub fn set_carriage_return_highlight(&mut self, highlight: Option) { @@ -909,8 +900,8 @@ impl HtmlRenderer { } pub fn reset(&mut self) { - self.html.clear(); - self.line_offsets.clear(); + shrink_and_clear(&mut self.html, BUFFER_HTML_RESERVE_CAPACITY); + shrink_and_clear(&mut self.line_offsets, BUFFER_LINES_RESERVE_CAPACITY); self.line_offsets.push(0); } @@ -1074,3 +1065,11 @@ fn injection_for_match<'a>( (language_name, content_node, include_children) } + +fn shrink_and_clear(vec: &mut Vec, capacity: usize) { + if vec.len() > capacity { + vec.truncate(capacity); + vec.shrink_to_fit(); + } + vec.clear(); +} diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index b4d6f8c5..c601aecc 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -170,7 +170,7 @@ pub enum QueryError { enum TextPredicate { CaptureEqString(u32, String, bool), CaptureEqCapture(u32, u32, bool), - CaptureMatchString(u32, regex::bytes::Regex), + CaptureMatchString(u32, regex::bytes::Regex, bool), } impl Language { @@ -1314,7 +1314,7 @@ impl Query { }); } - "match?" => { + "match?" | "not-match?" => { if p.len() != 3 { return Err(QueryError::Predicate(format!( "Wrong number of arguments to #match? predicate. 
Expected 2, got {}.", @@ -1334,12 +1334,14 @@ impl Query { ))); } + let is_positive = operator_name == "match?"; let regex = &string_values[p[2].value_id as usize]; text_predicates.push(TextPredicate::CaptureMatchString( p[1].value_id, regex::bytes::Regex::new(regex).map_err(|_| { QueryError::Predicate(format!("Invalid regex '{}'", regex)) })?, + is_positive, )); } @@ -1631,9 +1633,9 @@ impl<'a> QueryMatch<'a> { let node = self.capture_for_index(*i).unwrap(); (text_callback(node).as_ref() == s.as_bytes()) == *is_positive } - TextPredicate::CaptureMatchString(i, r) => { + TextPredicate::CaptureMatchString(i, r, is_positive) => { let node = self.capture_for_index(*i).unwrap(); - r.is_match(text_callback(node).as_ref()) + r.is_match(text_callback(node).as_ref()) == *is_positive } }) } diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index cd8bec75..404beeb6 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -787,6 +787,8 @@ class Language { } break; + case 'not-match?': + isPositive = false; case 'match?': if (steps.length !== 3) throw new Error( `Wrong number of arguments to \`#match?\` predicate. 
Expected 2, got ${steps.length - 1}.` @@ -801,7 +803,7 @@ class Language { const regex = new RegExp(steps[2].value); textPredicates[i].push(function(captures) { for (const c of captures) { - if (c.name === captureName) return regex.test(c.node.text); + if (c.name === captureName) return regex.test(c.node.text) === isPositive; } return false; }); diff --git a/lib/binding_web/test/query-test.js b/lib/binding_web/test/query-test.js index 9dda9834..9d1e24e1 100644 --- a/lib/binding_web/test/query-test.js +++ b/lib/binding_web/test/query-test.js @@ -126,12 +126,17 @@ describe("Query", () => { it("handles conditions that compare the text of capture to literal strings", () => { tree = parser.parse(` + lambda + panda + load + toad const ab = require('./ab'); new Cd(EF); `); query = JavaScript.query(` - (identifier) @variable + ((identifier) @variable + (#not-match? @variable "^(lambda|load)$")) ((identifier) @function.builtin (#eq? @function.builtin "require")) @@ -145,6 +150,8 @@ describe("Query", () => { const captures = query.captures(tree.rootNode); assert.deepEqual(formatCaptures(captures), [ + { name: "variable", text: "panda" }, + { name: "variable", text: "toad" }, { name: "variable", text: "ab" }, { name: "variable", text: "require" }, { name: "function.builtin", text: "require" }, diff --git a/lib/src/alloc.h b/lib/src/alloc.h index 9bbf7513..0e0927a9 100644 --- a/lib/src/alloc.h +++ b/lib/src/alloc.h @@ -45,7 +45,7 @@ static inline bool ts_toggle_allocation_recording(bool value) { static inline void *ts_malloc(size_t size) { void *result = malloc(size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); exit(1); } return result; @@ -54,7 +54,7 @@ static inline void *ts_malloc(size_t size) { static inline void *ts_calloc(size_t count, size_t size) { void *result = calloc(count, size); if (count > 0 && !result) { - fprintf(stderr, "tree-sitter failed 
to allocate %lu bytes", count * size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); exit(1); } return result; @@ -63,7 +63,7 @@ static inline void *ts_calloc(size_t count, size_t size) { static inline void *ts_realloc(void *buffer, size_t size) { void *result = realloc(buffer, size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to reallocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); exit(1); } return result; diff --git a/lib/src/parser.c b/lib/src/parser.c index e9c16f0c..37d1a1c2 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -355,10 +355,14 @@ static Subtree ts_parser__lex( StackVersion version, TSStateId parse_state ) { + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + if (lex_mode.lex_state == (uint16_t)-1) { + LOG("no_lookahead_after_non_terminal_extra"); + return NULL_SUBTREE; + } + Length start_position = ts_stack_position(self->stack, version); Subtree external_token = ts_stack_last_external_token(self->stack, version); - TSLexMode lex_mode = self->language->lex_modes[parse_state]; - if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE; const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state @@ -761,20 +765,26 @@ static StackVersion ts_parser__reduce( int dynamic_precedence, uint16_t production_id, bool is_fragile, - bool is_extra + bool end_of_non_terminal_extra ) { uint32_t initial_version_count = ts_stack_version_count(self->stack); - uint32_t removed_version_count = 0; - StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + // Pop the given number of nodes from the given version of the parse stack. + // If stack versions have previously merged, then there may be more than one + // path back through the stack. For each path, create a new parent node to + // contain the popped children, and push it onto the stack in place of the + // children. 
+ StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + uint32_t removed_version_count = 0; for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; - // Error recovery can sometimes cause lots of stack versions to merge, - // such that a single pop operation can produce a lots of slices. - // Avoid creating too many stack versions in that situation. - if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + // This is where new versions are added to the parse stack. The versions + // will all be sorted and truncated at the end of the outer parsing loop. + // Allow the maximum version count to be temporarily exceeded, but only + // by a limited threshold. + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; @@ -826,7 +836,9 @@ static StackVersion ts_parser__reduce( TSStateId state = ts_stack_state(self->stack, slice_version); TSStateId next_state = ts_language_next_state(self->language, state, symbol); - if (is_extra) parent.ptr->extra = true; + if (end_of_non_terminal_extra && next_state == state) { + parent.ptr->extra = true; + } if (is_fragile || pop.size > 1 || initial_version_count > 1) { parent.ptr->fragile_left = true; parent.ptr->fragile_right = true; @@ -1339,24 +1351,26 @@ static bool ts_parser__advance( ); } -lex: - // Otherwise, re-run the lexer. - if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); - if (lookahead.ptr) { - ts_parser__set_cached_token(self, position, last_external_token, lookahead); - ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); - } - - // When parsing a non-terminal extra, a null lookahead indicates the - // end of the rule. The reduction is stored in the EOF table entry. 
- // After the reduction, the lexer needs to be run again. - else { - ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); - } - } - + bool needs_lex = !lookahead.ptr; for (;;) { + // Otherwise, re-run the lexer. + if (needs_lex) { + needs_lex = false; + lookahead = ts_parser__lex(self, version, state); + + if (lookahead.ptr) { + ts_parser__set_cached_token(self, position, last_external_token, lookahead); + ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); + } + + // When parsing a non-terminal extra, a null lookahead indicates the + // end of the rule. The reduction is stored in the EOF table entry. + // After the reduction, the lexer needs to be run again. + else { + ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); + } + } + // If a cancellation flag or a timeout was provided, then check every // time a fixed number of parse actions has been processed. if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { @@ -1408,12 +1422,12 @@ lex: case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; - bool is_extra = lookahead.ptr == NULL; + bool end_of_non_terminal_extra = lookahead.ptr == NULL; LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count); StackVersion reduction_version = ts_parser__reduce( self, version, action.params.reduce.symbol, action.params.reduce.child_count, action.params.reduce.dynamic_precedence, action.params.reduce.production_id, - is_fragile, is_extra + is_fragile, end_of_non_terminal_extra ); if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; @@ -1453,8 +1467,10 @@ lex: // (and completing the non-terminal extra rule) run the lexer again based // on the current parse state. 
if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); + needs_lex = true; + continue; } + ts_language_table_entry( self->language, state, @@ -1464,6 +1480,11 @@ lex: continue; } + if (!lookahead.ptr) { + ts_stack_pause(self->stack, version, ts_builtin_sym_end); + return true; + } + // If there were no parse actions for the current lookahead token, then // it is not valid in this state. If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that @@ -1503,8 +1524,7 @@ lex: if (ts_parser__breakdown_top_of_stack(self, version)) { state = ts_stack_state(self->stack, version); ts_subtree_release(&self->tree_pool, lookahead); - lookahead = NULL_SUBTREE; - goto lex; + needs_lex = true; continue; } diff --git a/lib/src/query.c b/lib/src/query.c index dd6ad8c0..15aa2fd1 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -11,7 +11,6 @@ // #define LOG(...) fprintf(stderr, __VA_ARGS__) #define LOG(...) -#define MAX_STATE_COUNT 256 #define MAX_CAPTURE_LIST_COUNT 32 #define MAX_STEP_CAPTURE_COUNT 3 #define MAX_STATE_PREDECESSOR_COUNT 100 @@ -51,7 +50,6 @@ typedef struct { uint16_t alternative_index; uint16_t depth; bool contains_captures: 1; - bool is_pattern_start: 1; bool is_immediate: 1; bool is_last_child: 1; bool is_pass_through: 1; @@ -128,9 +126,10 @@ typedef struct { uint16_t step_index; uint16_t pattern_index; uint16_t capture_list_id; - uint16_t consumed_capture_count: 14; + uint16_t consumed_capture_count: 12; bool seeking_immediate_match: 1; bool has_in_progress_alternatives: 1; + bool dead: 1; } QueryState; typedef Array(TSQueryCapture) CaptureList; @@ -224,6 +223,7 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; bool ascending; + bool halted; }; static const TSQueryError PARENT_DONE = -1; @@ -500,7 +500,6 @@ static QueryStep query_step__new( .alternative_index = NONE, .contains_captures = false, .is_last_child = false, - .is_pattern_start = false, .is_pass_through = false, 
.is_dead_end = false, .is_definite = false, @@ -692,6 +691,23 @@ static inline void ts_query__pattern_map_insert( ) { uint32_t index; ts_query__pattern_map_search(self, symbol, &index); + + // Ensure that the entries are sorted not only by symbol, but also + // by pattern_index. This way, states for earlier patterns will be + // initiated first, which allows the ordering of the states array + // to be maintained more efficiently. + while (index < self->pattern_map.size) { + PatternEntry *entry = &self->pattern_map.contents[index]; + if ( + self->steps.contents[entry->step_index].symbol == symbol && + entry->pattern_index < pattern_index + ) { + index++; + } else { + break; + } + } + array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, .pattern_index = pattern_index, @@ -1438,8 +1454,8 @@ static TSQueryError ts_query__parse_pattern( } } - // A pound character indicates the start of a predicate. - else if (stream->next == '#') { + // A dot/pound character indicates the start of a predicate. + else if (stream->next == '.' || stream->next == '#') { stream_advance(stream); return ts_query__parse_predicate(self, stream); } @@ -1796,7 +1812,6 @@ TSQuery *ts_query_new( // Maintain a map that can look up patterns for a given root symbol. for (;;) { QueryStep *step = &self->steps.contents[start_step_index]; - step->is_pattern_start = true; ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); if (step->symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; @@ -1806,6 +1821,7 @@ TSQuery *ts_query_new( // then add multiple entries to the pattern map. 
if (step->alternative_index != NONE) { start_step_index = step->alternative_index; + step->alternative_index = NONE; } else { break; } @@ -1944,6 +1960,7 @@ TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .ascending = false, + .halted = false, .states = array_new(), .finished_states = array_new(), .capture_list_pool = capture_list_pool_new(), @@ -1952,8 +1969,8 @@ TSQueryCursor *ts_query_cursor_new(void) { .start_point = {0, 0}, .end_point = POINT_MAX, }; - array_reserve(&self->states, MAX_STATE_COUNT); - array_reserve(&self->finished_states, MAX_CAPTURE_LIST_COUNT); + array_reserve(&self->states, 8); + array_reserve(&self->finished_states, 8); return self; } @@ -1977,6 +1994,7 @@ void ts_query_cursor_exec( self->next_state_id = 0; self->depth = 0; self->ascending = false; + self->halted = false; self->query = query; } @@ -2020,6 +2038,7 @@ static bool ts_query_cursor__first_in_progress_capture( *pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; + if (state->dead) continue; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id @@ -2114,65 +2133,138 @@ void ts_query_cursor__compare_captures( } } -static bool ts_query_cursor__add_state( +static void ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { - if (self->states.size >= MAX_STATE_COUNT) { - LOG(" too many states"); - return false; + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + uint32_t start_depth = self->depth - step->depth; + + // Keep the states array in ascending order of start_depth and pattern_index, + // so that it can be processed more efficiently elsewhere. Usually, there is + // no work to do here because of two facts: + // * States with lower start_depth are naturally added first due to the + // order in which nodes are visited. 
+ // * Earlier patterns are naturally added first because of the ordering of the + // pattern_map data structure that's used to initiate matches. + // + // This loop is only needed in cases where two conditions hold: + // * A pattern consists of more than one sibling node, so that its states + // remain in progress after exiting the node that started the match. + // * The first node in the pattern matches against multiple nodes at the + // same depth. + // + // An example of this is the pattern '((comment)* (function))'. If multiple + // `comment` nodes appear in a row, then we may initiate a new state for this + // pattern while another state for the same pattern is already in progress. + // If there are multiple patterns like this in a query, then this loop will + // need to execute in order to keep the states ordered by pattern_index. + uint32_t index = self->states.size; + while (index > 0) { + QueryState *prev_state = &self->states.contents[index - 1]; + if (prev_state->start_depth < start_depth) break; + if (prev_state->start_depth == start_depth) { + if (prev_state->pattern_index < pattern->pattern_index) break; + if (prev_state->pattern_index == pattern->pattern_index) { + // Avoid unnecessarily inserting an unnecessary duplicate state, + // which would be immediately pruned by the longest-match criteria. + if (prev_state->step_index == pattern->step_index) return; + } + } + index--; } + LOG( " start state. 
pattern:%u, step:%u\n", pattern->pattern_index, pattern->step_index ); - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - array_push(&self->states, ((QueryState) { + array_insert(&self->states, index, ((QueryState) { .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, - .start_depth = self->depth - step->depth, + .start_depth = start_depth, .consumed_capture_count = 0, - .seeking_immediate_match = false, + .seeking_immediate_match = true, + .has_in_progress_alternatives = false, + .dead = false, })); - return true; +} + +// Acquire a capture list for this state. If there are no capture lists left in the +// pool, this will steal the capture list from another existing state, and mark that +// other state as 'dead'. +static CaptureList *ts_query_cursor__prepare_to_capture( + TSQueryCursor *self, + QueryState *state, + unsigned state_index_to_preserve +) { + if (state->capture_list_id == NONE) { + state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. + if (state->capture_list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if ( + ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index + ) && + state_index != state_index_to_preserve + ) { + LOG( + " abandon state. 
index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + QueryState *other_state = &self->states.contents[state_index]; + state->capture_list_id = other_state->capture_list_id; + other_state->capture_list_id = NONE; + other_state->dead = true; + CaptureList *list = capture_list_pool_get_mut( + &self->capture_list_pool, + state->capture_list_id + ); + array_clear(list); + return list; + } else { + LOG(" ran out of capture lists"); + return NULL; + } + } + } + return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); } // Duplicate the given state and insert the newly-created state immediately after -// the given state in the `states` array. -static QueryState *ts_query__cursor_copy_state( +// the given state in the `states` array. Ensures that the given state reference is +// still valid, even if the states array is reallocated. +static QueryState *ts_query_cursor__copy_state( TSQueryCursor *self, - const QueryState *state + QueryState **state_ref ) { - if (self->states.size >= MAX_STATE_COUNT) { - LOG(" too many states"); - return NULL; - } + const QueryState *state = *state_ref; + uint32_t state_index = state - self->states.contents; + QueryState copy = *state; + copy.capture_list_id = NONE; // If the state has captures, copy its capture list. 
- QueryState copy = *state; - copy.capture_list_id = state->capture_list_id; if (state->capture_list_id != NONE) { - copy.capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - if (copy.capture_list_id == NONE) { - LOG(" too many capture lists"); - return NULL; - } + CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, &copy, state_index); + if (!new_captures) return NULL; const CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); - CaptureList *new_captures = capture_list_pool_get_mut( - &self->capture_list_pool, - copy.capture_list_id - ); array_push_all(new_captures, old_captures); } - uint32_t index = (state - self->states.contents) + 1; - array_insert(&self->states, index, copy); - return &self->states.contents[index]; + array_insert(&self->states, state_index + 1, copy); + *state_ref = &self->states.contents[state_index]; + return &self->states.contents[state_index + 1]; } // Walk the tree, processing patterns until at least one pattern finishes, @@ -2180,18 +2272,30 @@ static QueryState *ts_query__cursor_copy_state( // `finished_states` array. Multiple patterns can finish on the same node. If // there are no more matches, return `false`. static inline bool ts_query_cursor__advance(TSQueryCursor *self) { - do { + bool did_match = false; + for (;;) { + if (self->halted) { + while (self->states.size > 0) { + QueryState state = array_pop(&self->states); + capture_list_pool_release( + &self->capture_list_pool, + state.capture_list_id + ); + } + } + + if (did_match || self->halted) return did_match; + if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); // Leave this node by stepping to its next sibling or to its parent. 
- bool did_move = true; if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; } else if (ts_tree_cursor_goto_parent(&self->cursor)) { self->depth--; } else { - did_move = false; + self->halted = true; } // After leaving a node, remove any states that cannot make further progress. @@ -2203,10 +2307,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If a state completed its pattern inside of this node, but was deferred from finishing // in order to search for longer matches, mark it as finished. if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || !did_move) { + if (state->start_depth > self->depth || self->halted) { LOG(" finish pattern %u\n", state->pattern_index); state->id = self->next_state_id++; array_push(&self->finished_states, *state); + did_match = true; deleted_count++; continue; } @@ -2233,10 +2338,6 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } self->states.size -= deleted_count; - - if (!did_move) { - return self->finished_states.size > 0; - } } else { // If this node is before the selected range, then avoid descending into it. TSNode node = ts_tree_cursor_current_node(&self->cursor); @@ -2254,7 +2355,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if ( self->end_byte <= ts_node_start_byte(node) || point_lte(self->end_point, ts_node_start_point(node)) - ) return false; + ) { + self->halted = true; + continue; + } // Get the properties of the current node. TSSymbol symbol = ts_node_symbol(node); @@ -2286,7 +2390,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query_cursor__add_state(self, pattern)) break; + ts_query_cursor__add_state(self, pattern); } // Add new states for any patterns whose root node matches this node. 
@@ -2298,7 +2402,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query_cursor__add_state(self, pattern)) break; + ts_query_cursor__add_state(self, pattern); // Advance to the next pattern whose root node matches this node. i++; @@ -2366,12 +2470,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // parent, then this query state cannot simply be updated in place. It must be // split into two states: one that matches this node, and one which skips over // this node, to preserve the possibility of matching later siblings. - if ( - later_sibling_can_match && - !step->is_pattern_start && - step->contains_captures - ) { - if (ts_query__cursor_copy_state(self, state)) { + if (later_sibling_can_match && step->contains_captures) { + if (ts_query_cursor__copy_state(self, &state)) { LOG( " split state for capture. pattern:%u, step:%u\n", state->pattern_index, @@ -2382,45 +2482,14 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // If the current node is captured in this pattern, add it to the capture list. - // For the first capture in a pattern, lazily acquire a capture list. if (step->capture_ids[0] != NONE) { - if (state->capture_list_id == NONE) { - state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. - if (state->capture_list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if (ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - )) { - LOG( - " abandon state. 
index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - state->capture_list_id = self->states.contents[state_index].capture_list_id; - array_erase(&self->states, state_index); - if (state_index < i) { - i--; - state--; - } - } else { - LOG(" too many finished states.\n"); - array_erase(&self->states, i); - i--; - continue; - } - } + CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); + if (!capture_list) { + array_erase(&self->states, i); + i--; + continue; } - CaptureList *capture_list = capture_list_pool_get_mut( - &self->capture_list_pool, - state->capture_list_id - ); for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; @@ -2443,10 +2512,9 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { state->step_index ); - // If this state's next step has an 'alternative' step (the step is either optional, - // or is the end of a repetition), then copy the state in order to pursue both - // alternatives. The alternative step itself may have an alternative, so this is - // an interative process. + // If this state's next step has an alternative step, then copy the state in order + // to pursue both alternatives. The alternative step itself may have an alternative, + // so this is an interative process. unsigned end_index = i + 1; for (unsigned j = i; j < end_index; j++) { QueryState *state = &self->states.contents[j]; @@ -2458,25 +2526,27 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { continue; } - QueryState *copy = ts_query__cursor_copy_state(self, state); if (next_step->is_pass_through) { state->step_index++; j--; } + + QueryState *copy = ts_query_cursor__copy_state(self, &state); if (copy) { - copy_count++; + LOG( + " split state for branch. 
pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", + copy->pattern_index, + copy->step_index, + next_step->alternative_index, + next_step->alternative_is_immediate, + capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + ); end_index++; + copy_count++; copy->step_index = next_step->alternative_index; if (next_step->alternative_is_immediate) { copy->seeking_immediate_match = true; } - LOG( - " split state for branch. pattern:%u, step:%u, step:%u, immediate:%d\n", - copy->pattern_index, - state->step_index, - copy->step_index, - copy->seeking_immediate_match - ); } } } @@ -2484,59 +2554,77 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; - bool did_remove = false; + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } // Enfore the longest-match criteria. When a query pattern contains optional or - // repeated nodes, this is necesssary to avoid multiple redundant states, where + // repeated nodes, this is necessary to avoid multiple redundant states, where // one state has a strict subset of another state's captures. + bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { QueryState *other_state = &self->states.contents[j]; + + // Query states are kept in ascending order of start_depth and pattern_index. + // Since the longest-match criteria is only used for deduping matches of the same + // pattern and root node, we only need to perform pairwise comparisons within a + // small slice of the states array. 
if ( - state->pattern_index == other_state->pattern_index && - state->start_depth == other_state->start_depth - ) { - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; - } - other_state->has_in_progress_alternatives = true; + other_state->start_depth != state->start_depth || + other_state->pattern_index != state->pattern_index + ) break; + + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - did_remove = true; - break; - } - state->has_in_progress_alternatives = true; + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. 
pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + i--; + did_remove = true; + break; } + state->has_in_progress_alternatives = true; } } // If there the state is at the end of its pattern, remove it from the list // of in-progress states and add it to the list of finished states. if (!did_remove) { + LOG( + " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", + state->pattern_index, + state->start_depth, + state->step_index, + capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size + ); QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (next_step->depth == PATTERN_DONE_MARKER) { if (state->has_in_progress_alternatives) { @@ -2546,6 +2634,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { state->id = self->next_state_id++; array_push(&self->finished_states, *state); array_erase(&self->states, state - self->states.contents); + did_match = true; i--; } } @@ -2559,9 +2648,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { self->ascending = true; } } - } while (self->finished_states.size == 0); - - return true; + } } bool ts_query_cursor_next_match( @@ -2701,7 +2788,10 @@ bool ts_query_cursor_next_capture( // If there are no finished matches that are ready to be returned, then // continue finding more matches. 
- if (!ts_query_cursor__advance(self)) return false; + if ( + !ts_query_cursor__advance(self) && + self->finished_states.size == 0 + ) return false; } } diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 946dc6f1..4784abbb 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -16,18 +16,10 @@ typedef enum { TSTagsInvalidUtf8, TSTagsInvalidRegex, TSTagsInvalidQuery, + TSTagsInvalidCapture, } TSTagsError; -typedef enum { - TSTagKindFunction, - TSTagKindMethod, - TSTagKindClass, - TSTagKindModule, - TSTagKindCall, -} TSTagKind; - typedef struct { - TSTagKind kind; uint32_t start_byte; uint32_t end_byte; uint32_t name_start_byte; @@ -36,8 +28,12 @@ typedef struct { uint32_t line_end_byte; TSPoint start_point; TSPoint end_point; + uint32_t utf16_start_column; + uint32_t utf16_end_column; uint32_t docs_start_byte; uint32_t docs_end_byte; + uint32_t syntax_type_id; + bool is_definition; } TSTag; typedef struct TSTagger TSTagger; @@ -89,6 +85,12 @@ uint32_t ts_tags_buffer_tags_len(const TSTagsBuffer *); const char *ts_tags_buffer_docs(const TSTagsBuffer *); uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); +// Get the syntax kinds for a scope. +const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len); + +// Determine whether a parse error was encountered while tagging. 
+bool ts_tags_buffer_found_parse_error(const TSTagsBuffer*); + #ifdef __cplusplus } #endif diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 0c367977..85de1ff6 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -1,4 +1,4 @@ -use super::{Error, TagKind, TagsConfiguration, TagsContext}; +use super::{Error, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; @@ -6,6 +6,9 @@ use std::sync::atomic::AtomicUsize; use std::{fmt, slice, str}; use tree_sitter::Language; +const BUFFER_TAGS_RESERVE_CAPACITY: usize = 100; +const BUFFER_DOCS_RESERVE_CAPACITY: usize = 1024; + #[repr(C)] #[derive(Debug, PartialEq, Eq)] pub enum TSTagsError { @@ -16,19 +19,10 @@ pub enum TSTagsError { InvalidUtf8, InvalidRegex, InvalidQuery, + InvalidCapture, Unknown, } -#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum TSTagKind { - Function, - Method, - Class, - Module, - Call, -} - #[repr(C)] pub struct TSPoint { row: u32, @@ -37,7 +31,6 @@ pub struct TSPoint { #[repr(C)] pub struct TSTag { - pub kind: TSTagKind, pub start_byte: u32, pub end_byte: u32, pub name_start_byte: u32, @@ -46,8 +39,12 @@ pub struct TSTag { pub line_end_byte: u32, pub start_point: TSPoint, pub end_point: TSPoint, + pub utf16_start_colum: u32, + pub utf16_end_colum: u32, pub docs_start_byte: u32, pub docs_end_byte: u32, + pub syntax_type_id: u32, + pub is_definition: bool, } pub struct TSTagger { @@ -58,6 +55,7 @@ pub struct TSTagsBuffer { context: TagsContext, tags: Vec, docs: Vec, + errors_present: bool, } #[no_mangle] @@ -102,7 +100,9 @@ pub extern "C" fn ts_tagger_add_language( } Err(Error::Query(_)) => TSTagsError::InvalidQuery, Err(Error::Regex(_)) => TSTagsError::InvalidRegex, - Err(_) => TSTagsError::Unknown, + Err(Error::Cancelled) => TSTagsError::Timeout, + Err(Error::InvalidLanguage) => TSTagsError::InvalidLanguage, + Err(Error::InvalidCapture(_)) => TSTagsError::InvalidCapture, } } @@ -120,8 +120,9 @@ pub extern "C" fn 
ts_tagger_tag( let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; if let Some(config) = tagger.languages.get(scope_name) { - buffer.tags.clear(); - buffer.docs.clear(); + shrink_and_clear(&mut buffer.tags, BUFFER_TAGS_RESERVE_CAPACITY); + shrink_and_clear(&mut buffer.docs, BUFFER_DOCS_RESERVE_CAPACITY); + let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) }; let cancellation_flag = unsafe { cancellation_flag.as_ref() }; @@ -129,7 +130,10 @@ pub extern "C" fn ts_tagger_tag( .context .generate_tags(config, source_code, cancellation_flag) { - Ok(tags) => tags, + Ok((tags, found_error)) => { + buffer.errors_present = found_error; + tags + } Err(e) => { return match e { Error::InvalidLanguage => TSTagsError::InvalidLanguage, @@ -153,13 +157,6 @@ pub extern "C" fn ts_tagger_tag( buffer.docs.extend_from_slice(docs.as_bytes()); } buffer.tags.push(TSTag { - kind: match tag.kind { - TagKind::Function => TSTagKind::Function, - TagKind::Method => TSTagKind::Method, - TagKind::Class => TSTagKind::Class, - TagKind::Module => TSTagKind::Module, - TagKind::Call => TSTagKind::Call, - }, start_byte: tag.range.start as u32, end_byte: tag.range.end as u32, name_start_byte: tag.name_range.start as u32, @@ -174,8 +171,12 @@ pub extern "C" fn ts_tagger_tag( row: tag.span.end.row as u32, column: tag.span.end.column as u32, }, + utf16_start_colum: tag.utf16_column_range.start as u32, + utf16_end_colum: tag.utf16_column_range.end as u32, docs_start_byte: prev_docs_len as u32, docs_end_byte: buffer.docs.len() as u32, + syntax_type_id: tag.syntax_type_id, + is_definition: tag.is_definition, }); } @@ -189,8 +190,9 @@ pub extern "C" fn ts_tagger_tag( pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { Box::into_raw(Box::new(TSTagsBuffer { context: TagsContext::new(), - tags: Vec::with_capacity(64), - docs: Vec::with_capacity(64), + tags: Vec::with_capacity(BUFFER_TAGS_RESERVE_CAPACITY), + docs: 
Vec::with_capacity(BUFFER_DOCS_RESERVE_CAPACITY), + errors_present: false, })) } @@ -223,6 +225,30 @@ pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 { buffer.docs.len() as u32 } +#[no_mangle] +pub extern "C" fn ts_tags_buffer_found_parse_error(this: *const TSTagsBuffer) -> bool { + let buffer = unwrap_ptr(this); + buffer.errors_present +} + +#[no_mangle] +pub extern "C" fn ts_tagger_syntax_kinds_for_scope_name( + this: *mut TSTagger, + scope_name: *const i8, + len: *mut u32, +) -> *const *const i8 { + let tagger = unwrap_mut_ptr(this); + let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; + let len = unwrap_mut_ptr(len); + + *len = 0; + if let Some(config) = tagger.languages.get(scope_name) { + *len = config.c_syntax_type_names.len() as u32; + return config.c_syntax_type_names.as_ptr() as *const *const i8; + } + std::ptr::null() +} + fn unwrap_ptr<'a, T>(result: *const T) -> &'a T { unsafe { result.as_ref() }.unwrap_or_else(|| { eprintln!("{}:{} - pointer must not be null", file!(), line!()); @@ -243,3 +269,11 @@ fn unwrap(result: Result) -> T { abort(); }) } + +fn shrink_and_clear(vec: &mut Vec, capacity: usize) { + if vec.len() > capacity { + vec.truncate(capacity); + vec.shrink_to_fit(); + } + vec.clear(); +} diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 8d1853bb..dd55d4be 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,10 +1,12 @@ pub mod c_lib; -use memchr::{memchr, memrchr}; +use memchr::memchr; use regex::Regex; +use std::collections::HashMap; +use std::ffi::{CStr, CString}; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{fmt, mem, str}; +use std::{char, fmt, mem, str}; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; @@ -18,19 +20,24 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100; pub struct TagsConfiguration { pub language: Language, pub query: Query, - call_capture_index: Option, - class_capture_index: 
Option, + syntax_type_names: Vec>, + c_syntax_type_names: Vec<*const u8>, + capture_map: HashMap, doc_capture_index: Option, - function_capture_index: Option, - method_capture_index: Option, - module_capture_index: Option, name_capture_index: Option, + ignore_capture_index: Option, local_scope_capture_index: Option, local_definition_capture_index: Option, tags_pattern_index: usize, pattern_info: Vec, } +#[derive(Debug)] +pub struct NamedCapture { + pub syntax_type_id: u32, + pub is_definition: bool, +} + pub struct TagsContext { parser: Parser, cursor: QueryCursor, @@ -38,21 +45,14 @@ pub struct TagsContext { #[derive(Debug, Clone)] pub struct Tag { - pub kind: TagKind, pub range: Range, pub name_range: Range, pub line_range: Range, pub span: Range, + pub utf16_column_range: Range, pub docs: Option, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum TagKind { - Function, - Method, - Class, - Module, - Call, + pub is_definition: bool, + pub syntax_type_id: u32, } #[derive(Debug, PartialEq)] @@ -61,6 +61,7 @@ pub enum Error { Regex(regex::Error), Cancelled, InvalidLanguage, + InvalidCapture(String), } #[derive(Debug, Default)] @@ -91,6 +92,7 @@ where matches: I, _tree: Tree, source: &'a [u8], + prev_line_info: Option, config: &'a TagsConfiguration, cancellation_flag: Option<&'a AtomicUsize>, iter_count: usize, @@ -98,6 +100,18 @@ where scopes: Vec>, } +struct LineInfo { + utf8_position: Point, + utf8_byte: usize, + utf16_column: usize, + line_range: Range, +} + +struct LossyUtf8<'a> { + bytes: &'a [u8], + in_replacement: bool, +} + impl TagsConfiguration { pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result { let query = Query::new(language, &format!("{}{}", locals_query, tags_query))?; @@ -111,31 +125,57 @@ impl TagsConfiguration { } } - let mut call_capture_index = None; - let mut class_capture_index = None; + let mut capture_map = HashMap::new(); + let mut syntax_type_names = Vec::new(); let mut doc_capture_index = None; - let 
mut function_capture_index = None; - let mut method_capture_index = None; - let mut module_capture_index = None; let mut name_capture_index = None; + let mut ignore_capture_index = None; let mut local_scope_capture_index = None; let mut local_definition_capture_index = None; for (i, name) in query.capture_names().iter().enumerate() { - let index = match name.as_str() { - "call" => &mut call_capture_index, - "class" => &mut class_capture_index, - "doc" => &mut doc_capture_index, - "function" => &mut function_capture_index, - "method" => &mut method_capture_index, - "module" => &mut module_capture_index, - "name" => &mut name_capture_index, - "local.scope" => &mut local_scope_capture_index, - "local.definition" => &mut local_definition_capture_index, - _ => continue, - }; - *index = Some(i as u32); + match name.as_str() { + "" => continue, + "name" => name_capture_index = Some(i as u32), + "ignore" => ignore_capture_index = Some(i as u32), + "doc" => doc_capture_index = Some(i as u32), + "local.scope" => local_scope_capture_index = Some(i as u32), + "local.definition" => local_definition_capture_index = Some(i as u32), + "local.reference" => continue, + _ => { + let mut is_definition = false; + + let kind = if name.starts_with("definition.") { + is_definition = true; + name.trim_start_matches("definition.") + } else if name.starts_with("reference.") { + name.trim_start_matches("reference.") + } else { + return Err(Error::InvalidCapture(name.to_string())); + }; + + if let Ok(cstr) = CString::new(kind) { + let c_kind = cstr.to_bytes_with_nul().to_vec().into_boxed_slice(); + let syntax_type_id = syntax_type_names + .iter() + .position(|n| n == &c_kind) + .unwrap_or_else(|| { + syntax_type_names.push(c_kind); + syntax_type_names.len() - 1 + }) as u32; + capture_map.insert( + i as u32, + NamedCapture { + syntax_type_id, + is_definition, + }, + ); + } + } + } } + let c_syntax_type_names = syntax_type_names.iter().map(|s| s.as_ptr()).collect(); + let pattern_info = 
(0..query.pattern_count()) .map(|pattern_index| { let mut info = PatternInfo::default(); @@ -180,19 +220,26 @@ impl TagsConfiguration { Ok(TagsConfiguration { language, query, - function_capture_index, - class_capture_index, - method_capture_index, - module_capture_index, + syntax_type_names, + c_syntax_type_names, + capture_map, doc_capture_index, - call_capture_index, name_capture_index, + ignore_capture_index, tags_pattern_index, local_scope_capture_index, local_definition_capture_index, pattern_info, }) } + + pub fn syntax_type_name(&self, id: u32) -> &str { + unsafe { + let cstr = CStr::from_ptr(self.syntax_type_names[id as usize].as_ptr() as *const i8) + .to_bytes(); + str::from_utf8(cstr).expect("syntax type name was not valid utf-8") + } + } } impl TagsContext { @@ -208,7 +255,7 @@ impl TagsContext { config: &'a TagsConfiguration, source: &'a [u8], cancellation_flag: Option<&'a AtomicUsize>, - ) -> Result> + 'a, Error> { + ) -> Result<(impl Iterator> + 'a, bool), Error> { self.parser .set_language(config.language) .map_err(|_| Error::InvalidLanguage)?; @@ -224,12 +271,13 @@ impl TagsContext { .matches(&config.query, tree_ref.root_node(), move |node| { &source[node.byte_range()] }); - Ok(TagsIter { + Ok((TagsIter { _tree: tree, matches, source, config, cancellation_flag, + prev_line_info: None, tag_queue: Vec::new(), iter_count: 0, scopes: vec![LocalScope { @@ -237,7 +285,7 @@ impl TagsContext { inherits: false, local_defs: Vec::new(), }], - }) + }, tree_ref.root_node().has_error())) } } @@ -267,7 +315,12 @@ where if self.tag_queue.len() > 1 && self.tag_queue[0].0.name_range.end < last_entry.0.name_range.start { - return Some(Ok(self.tag_queue.remove(0).0)); + let tag = self.tag_queue.remove(0).0; + if tag.is_ignored() { + continue; + } else { + return Some(Ok(tag)); + } } } @@ -300,141 +353,185 @@ where continue; } - let mut name_range = None; + let mut name_node = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; - let mut kind = 
TagKind::Call; + let mut syntax_type_id = 0; + let mut is_definition = false; let mut docs_adjacent_node = None; + let mut is_ignored = false; for capture in mat.captures { let index = Some(capture.index); + if index == self.config.ignore_capture_index { + is_ignored = true; + name_node = Some(capture.node); + } + if index == self.config.pattern_info[mat.pattern_index].docs_adjacent_capture { docs_adjacent_node = Some(capture.node); } if index == self.config.name_capture_index { - name_range = Some(capture.node.byte_range()); + name_node = Some(capture.node); } else if index == self.config.doc_capture_index { doc_nodes.push(capture.node); - } else if index == self.config.call_capture_index { + } + + if let Some(named_capture) = self.config.capture_map.get(&capture.index) { tag_node = Some(capture.node); - kind = TagKind::Call; - } else if index == self.config.class_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Class; - } else if index == self.config.function_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Function; - } else if index == self.config.method_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Method; - } else if index == self.config.module_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Module; + syntax_type_id = named_capture.syntax_type_id; + is_definition = named_capture.is_definition; } } - if let (Some(tag_node), Some(name_range)) = (tag_node, name_range) { - if pattern_info.name_must_be_non_local { - let mut is_local = false; - for scope in self.scopes.iter().rev() { - if scope.range.start <= name_range.start - && scope.range.end >= name_range.end - { - if scope - .local_defs - .iter() - .any(|d| d.name == &self.source[name_range.clone()]) - { - is_local = true; - break; - } - if !scope.inherits { - break; - } - } - } - if is_local { + if let Some(name_node) = name_node { + let name_range = name_node.byte_range(); + + let tag; + if let Some(tag_node) = tag_node { + if 
name_node.has_error() { continue; } - } - // If needed, filter the doc nodes based on their ranges, selecting - // only the slice that are adjacent to some specified node. - let mut docs_start_index = 0; - if let (Some(docs_adjacent_node), false) = - (docs_adjacent_node, doc_nodes.is_empty()) - { - docs_start_index = doc_nodes.len(); - let mut start_row = docs_adjacent_node.start_position().row; - while docs_start_index > 0 { - let doc_node = &doc_nodes[docs_start_index - 1]; - let prev_doc_end_row = doc_node.end_position().row; - if prev_doc_end_row + 1 >= start_row { - docs_start_index -= 1; - start_row = doc_node.start_position().row; - } else { - break; + if pattern_info.name_must_be_non_local { + let mut is_local = false; + for scope in self.scopes.iter().rev() { + if scope.range.start <= name_range.start + && scope.range.end >= name_range.end + { + if scope + .local_defs + .iter() + .any(|d| d.name == &self.source[name_range.clone()]) + { + is_local = true; + break; + } + if !scope.inherits { + break; + } + } + } + if is_local { + continue; } } - } - // Generate a doc string from all of the doc nodes, applying any strip regexes. - let mut docs = None; - for doc_node in &doc_nodes[docs_start_index..] { - if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) { - let content = if let Some(regex) = &pattern_info.doc_strip_regex { - regex.replace_all(content, "").to_string() - } else { - content.to_string() - }; - match &mut docs { - None => docs = Some(content), - Some(d) => { - d.push('\n'); - d.push_str(&content); + // If needed, filter the doc nodes based on their ranges, selecting + // only the slice that are adjacent to some specified node. 
+ let mut docs_start_index = 0; + if let (Some(docs_adjacent_node), false) = + (docs_adjacent_node, doc_nodes.is_empty()) + { + docs_start_index = doc_nodes.len(); + let mut start_row = docs_adjacent_node.start_position().row; + while docs_start_index > 0 { + let doc_node = &doc_nodes[docs_start_index - 1]; + let prev_doc_end_row = doc_node.end_position().row; + if prev_doc_end_row + 1 >= start_row { + docs_start_index -= 1; + start_row = doc_node.start_position().row; + } else { + break; } } } + + // Generate a doc string from all of the doc nodes, applying any strip regexes. + let mut docs = None; + for doc_node in &doc_nodes[docs_start_index..] { + if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) + { + let content = if let Some(regex) = &pattern_info.doc_strip_regex { + regex.replace_all(content, "").to_string() + } else { + content.to_string() + }; + match &mut docs { + None => docs = Some(content), + Some(d) => { + d.push('\n'); + d.push_str(&content); + } + } + } + } + + let rng = tag_node.byte_range(); + let range = rng.start.min(name_range.start)..rng.end.max(name_range.end); + let span = name_node.start_position()..name_node.end_position(); + + // Compute tag properties that depend on the text of the containing line. If the + // previous tag occurred on the same line, then reuse results from the previous tag. 
+ let line_range; + let mut prev_utf16_column = 0; + let mut prev_utf8_byte = name_range.start - span.start.column; + let line_info = self.prev_line_info.as_ref().and_then(|info| { + if info.utf8_position.row == span.start.row { + Some(info) + } else { + None + } + }); + if let Some(line_info) = line_info { + line_range = line_info.line_range.clone(); + if line_info.utf8_position.column <= span.start.column { + prev_utf8_byte = line_info.utf8_byte; + prev_utf16_column = line_info.utf16_column; + } + } else { + line_range = self::line_range( + self.source, + name_range.start, + span.start, + MAX_LINE_LEN, + ); + } + + let utf16_start_column = prev_utf16_column + + utf16_len(&self.source[prev_utf8_byte..name_range.start]); + let utf16_end_column = + utf16_start_column + utf16_len(&self.source[name_range.clone()]); + let utf16_column_range = utf16_start_column..utf16_end_column; + + self.prev_line_info = Some(LineInfo { + utf8_position: span.end, + utf8_byte: name_range.end, + utf16_column: utf16_end_column, + line_range: line_range.clone(), + }); + tag = Tag { + line_range, + span, + utf16_column_range, + range, + name_range, + docs, + is_definition, + syntax_type_id, + }; + } else if is_ignored { + tag = Tag::ignored(name_range); + } else { + continue; } // Only create one tag per node. The tag queue is sorted by node position // to allow for fast lookup. 
- let range = tag_node.byte_range(); - match self - .tag_queue - .binary_search_by_key(&(name_range.end, name_range.start), |(tag, _)| { - (tag.name_range.end, tag.name_range.start) - }) { + match self.tag_queue.binary_search_by_key( + &(tag.name_range.end, tag.name_range.start), + |(tag, _)| (tag.name_range.end, tag.name_range.start), + ) { Ok(i) => { - let (tag, pattern_index) = &mut self.tag_queue[i]; + let (existing_tag, pattern_index) = &mut self.tag_queue[i]; if *pattern_index > mat.pattern_index { *pattern_index = mat.pattern_index; - *tag = Tag { - line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.end_position(), - kind, - range, - name_range, - docs, - }; + *existing_tag = tag; } } - Err(i) => self.tag_queue.insert( - i, - ( - Tag { - line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.end_position(), - kind, - range, - name_range, - docs, - }, - mat.pattern_index, - ), - ), + Err(i) => self.tag_queue.insert(i, (tag, mat.pattern_index)), } } } @@ -448,16 +545,31 @@ where } } -impl fmt::Display for TagKind { +impl Tag { + fn ignored(name_range: Range) -> Self { + Tag { + name_range, + line_range: 0..0, + span: Point::new(0, 0)..Point::new(0, 0), + utf16_column_range: 0..0, + range: usize::MAX..usize::MAX, + docs: None, + is_definition: false, + syntax_type_id: 0, + } + } + + fn is_ignored(&self) -> bool { + self.range.start == usize::MAX + } +} + +impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - TagKind::Call => "Call", - TagKind::Module => "Module", - TagKind::Class => "Class", - TagKind::Method => "Method", - TagKind::Function => "Function", + Error::InvalidCapture(name) => write!(f, "Invalid capture @{}. 
Expected one of: @definition.*, @reference.*, @doc, @name, @local.(scope|definition|reference).", name), + _ => write!(f, "{:?}", self) } - .fmt(f) } } @@ -473,11 +585,90 @@ impl From for Error { } } -fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { - let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1); - let max_line_len = max_line_len.min(text.len() - start); - let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len); - start..end +// TODO: Remove this struct at at some point. If `core::str::lossy::Utf8Lossy` +// is ever stabilized, we should use that. Otherwise, this struct could be moved +// into some module that's shared between `tree-sitter-tags` and `tree-sitter-highlight`. +impl<'a> LossyUtf8<'a> { + fn new(bytes: &'a [u8]) -> Self { + LossyUtf8 { + bytes, + in_replacement: false, + } + } +} + +impl<'a> Iterator for LossyUtf8<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.bytes.is_empty() { + return None; + } + if self.in_replacement { + self.in_replacement = false; + return Some("\u{fffd}"); + } + match str::from_utf8(self.bytes) { + Ok(valid) => { + self.bytes = &[]; + Some(valid) + } + Err(error) => { + if let Some(error_len) = error.error_len() { + let error_start = error.valid_up_to(); + if error_start > 0 { + let result = + unsafe { str::from_utf8_unchecked(&self.bytes[..error_start]) }; + self.bytes = &self.bytes[(error_start + error_len)..]; + self.in_replacement = true; + Some(result) + } else { + self.bytes = &self.bytes[error_len..]; + Some("\u{fffd}") + } + } else { + None + } + } + } + } +} + +fn line_range( + text: &[u8], + start_byte: usize, + start_point: Point, + max_line_len: usize, +) -> Range { + // Trim leading whitespace + let mut line_start_byte = start_byte - start_point.column; + while line_start_byte < text.len() && text[line_start_byte].is_ascii_whitespace() { + line_start_byte += 1; + } + + let max_line_len = 
max_line_len.min(text.len() - line_start_byte); + let text_after_line_start = &text[line_start_byte..(line_start_byte + max_line_len)]; + let line_len = if let Some(len) = memchr(b'\n', text_after_line_start) { + len + } else if let Err(e) = str::from_utf8(text_after_line_start) { + e.valid_up_to() + } else { + max_line_len + }; + + // Trim trailing whitespace + let mut line_end_byte = line_start_byte + line_len; + while line_end_byte > line_start_byte && text[line_end_byte - 1].is_ascii_whitespace() { + line_end_byte -= 1; + } + + line_start_byte..line_end_byte +} + +fn utf16_len(bytes: &[u8]) -> usize { + LossyUtf8::new(bytes) + .flat_map(|chunk| chunk.chars().map(char::len_utf16)) + .sum() } #[cfg(test)] @@ -486,14 +677,27 @@ mod tests { #[test] fn test_get_line() { - let text = b"abc\ndefg\nhijkl"; - assert_eq!(line_range(text, 0, 10), 0..3); - assert_eq!(line_range(text, 1, 10), 0..3); - assert_eq!(line_range(text, 2, 10), 0..3); - assert_eq!(line_range(text, 3, 10), 0..3); - assert_eq!(line_range(text, 1, 2), 0..2); - assert_eq!(line_range(text, 4, 10), 4..8); - assert_eq!(line_range(text, 5, 10), 4..8); - assert_eq!(line_range(text, 11, 10), 9..14); + let text = "abc\ndefgā¤hij\nklmno".as_bytes(); + assert_eq!(line_range(text, 5, Point::new(1, 1), 30), 4..14); + assert_eq!(line_range(text, 5, Point::new(1, 1), 6), 4..8); + assert_eq!(line_range(text, 17, Point::new(2, 2), 30), 15..20); + assert_eq!(line_range(text, 17, Point::new(2, 2), 4), 15..19); + } + + #[test] + fn test_get_line_trims() { + let text = b" foo\nbar\n"; + assert_eq!(line_range(text, 0, Point::new(0, 0), 10), 3..6); + + let text = b"\t func foo \nbar\n"; + assert_eq!(line_range(text, 0, Point::new(0, 0), 10), 2..10); + + let r = line_range(text, 0, Point::new(0, 0), 14); + assert_eq!(r, 2..10); + assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "func foo"); + + let r = line_range(text, 12, Point::new(1, 0), 14); + assert_eq!(r, 12..15); + 
assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "bar"); } } diff --git a/test/fixtures/error_corpus/ruby_errors.txt b/test/fixtures/error_corpus/ruby_errors.txt new file mode 100644 index 00000000..9c35781c --- /dev/null +++ b/test/fixtures/error_corpus/ruby_errors.txt @@ -0,0 +1,19 @@ +========================== +Heredocs with errors +========================== + +joins(<<~SQL( + b +SQL +c + +--- + +(program + (method_call + method: (identifier) + (ERROR (heredoc_beginning)) + arguments: (argument_list + (heredoc_body (heredoc_end)) + (identifier) + (MISSING ")")))) diff --git a/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt new file mode 100644 index 00000000..a22d8b8d --- /dev/null +++ b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt @@ -0,0 +1,23 @@ +===== +Extras +===== + +; +%; +%foo:; +; +bar: baz:; +; + +--- + +(program + (statement) + (macro_statement (statement)) + (macro_statement (statement + (label_declaration (identifier)))) + (statement) + (statement + (label_declaration (identifier)) + (label_declaration (identifier))) + (statement)) diff --git a/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json new file mode 100644 index 00000000..a7f51b8e --- /dev/null +++ b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json @@ -0,0 +1,68 @@ +{ + "name": "extra_non_terminals_with_shared_rules", + + "extras": [ + { "type": "PATTERN", "value": "\\s+" }, + { "type": "SYMBOL", "name": "macro_statement" } + ], + + "rules": { + "program": { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "statement" + } + }, + "statement": { + "type": "SEQ", + "members": [ + { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "label_declaration" + } + }, + { + "type": "STRING", + 
"value": ";" + } + ] + }, + "macro_statement": { + "type": "SEQ", + "members": [ + { + "type": "STRING", + "value": "%" + }, + { + "type": "SYMBOL", + "name": "statement" + } + ] + }, + "label_declaration": { + "type": "SEQ", + "members": [ + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "STRING", + "value": ":" + } + ] + }, + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + }, + "conflicts": [], + "externals": [], + "inline": [], + "supertypes": [] +} diff --git a/test/fuzz/README.md b/test/fuzz/README.md index 649d2d89..a02d2689 100644 --- a/test/fuzz/README.md +++ b/test/fuzz/README.md @@ -22,10 +22,10 @@ The fuzzers can then be built with: export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \ LIB_FUZZER_PATH=$HOME/src/compiler-rt/lib/fuzzer/libFuzzer.a \ - ./script/build_fuzzers + ./script/build-fuzzers ``` -This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build_fuzzers python ruby`. +This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`. The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments: ```