Merge branch 'master' into query-pattern-is-definite

This commit is contained in:
Max Brunsfeld 2020-08-14 09:31:55 -07:00
commit 1ea29053e1
33 changed files with 2004 additions and 763 deletions

2
Cargo.lock generated
View file

@ -740,7 +740,7 @@ dependencies = [
[[package]]
name = "tree-sitter-cli"
version = "0.16.8"
version = "0.16.9"
dependencies = [
"ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",

View file

@ -1,7 +1,7 @@
[package]
name = "tree-sitter-cli"
description = "CLI tool for developing, testing, and using Tree-sitter parsers"
version = "0.16.8"
version = "0.16.9"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2018"
license = "MIT"

356
cli/npm/dsl.d.ts vendored Normal file
View file

@ -0,0 +1,356 @@
// Serialized rule representations produced by the grammar DSL. Each variant
// mirrors one DSL combinator below (alias, blank, choice, field, …) and
// carries the data that combinator records.
type AliasRule = {type: 'ALIAS'; named: boolean; content: Rule; value: string};
type BlankRule = {type: 'BLANK'};
type ChoiceRule = {type: 'CHOICE'; members: Rule[]};
type FieldRule = {type: 'FIELD'; name: string; content: Rule};
type ImmediateTokenRule = {type: 'IMMEDIATE_TOKEN'; content: Rule};
type PatternRule = {type: 'PATTERN'; value: string};
type PrecDynamicRule = {type: 'PREC_DYNAMIC'; content: Rule; value: number};
type PrecLeftRule = {type: 'PREC_LEFT'; content: Rule; value: number};
type PrecRightRule = {type: 'PREC_RIGHT'; content: Rule; value: number};
type PrecRule = {type: 'PREC'; content: Rule; value: number};
type Repeat1Rule = {type: 'REPEAT1'; content: Rule};
type RepeatRule = {type: 'REPEAT'; content: Rule};
type SeqRule = {type: 'SEQ'; members: Rule[]};
type StringRule = {type: 'STRING'; value: string};
type SymbolRule<Name extends string> = {type: 'SYMBOL'; name: Name};
type TokenRule = {type: 'TOKEN'; content: Rule};

// Discriminated union of every rule variant, keyed on the `type` tag.
type Rule =
  | AliasRule
  | BlankRule
  | ChoiceRule
  | FieldRule
  | ImmediateTokenRule
  | PatternRule
  | PrecDynamicRule
  | PrecLeftRule
  | PrecRightRule
  | PrecRule
  | Repeat1Rule
  | RepeatRule
  | SeqRule
  | StringRule
  | SymbolRule<string>
  | TokenRule;

// Anywhere a rule is expected, a RegExp or string literal is also accepted
// and treated as a pattern/string token rule.
type RuleOrLiteral = Rule | RegExp | string;

// The `$` object passed to rule builders: known rule names map to precisely
// typed symbols, while the string index fallback permits forward references
// to rules not captured in the `RuleName` parameter.
type GrammarSymbols<RuleName extends string> = {
  [name in RuleName]: SymbolRule<name>;
} &
  Record<string, SymbolRule<string>>;

// A function that builds one rule, given symbols for every rule in scope.
type RuleBuilder<RuleName extends string> = (
  $: GrammarSymbols<RuleName>,
) => RuleOrLiteral;

// Mapping from each rule name to its builder; builders may reference rules
// from this grammar as well as from the base grammar being extended.
type RuleBuilders<
  RuleName extends string,
  BaseGrammarRuleName extends string
> = {
  [name in RuleName]: RuleBuilder<RuleName | BaseGrammarRuleName>;
};
/**
 * Describes a language grammar as accepted by the `grammar()` function.
 *
 * `RuleName` is the union of rule names defined in this grammar;
 * `BaseGrammarRuleName` is the union of rule names inherited from a base
 * grammar (when extending one), so `$.someRule` references in the builder
 * callbacks below are statically checked.
 */
interface Grammar<
  RuleName extends string,
  BaseGrammarRuleName extends string = never,
  Rules extends RuleBuilders<RuleName, BaseGrammarRuleName> = RuleBuilders<
    RuleName,
    BaseGrammarRuleName
  >
> {
  /**
   * Name of the grammar language.
   */
  name: string;

  /** Mapping of grammar rule names to rule builder functions. */
  rules: Rules;

  /**
   * An array of arrays of rule names. Each inner array represents a set of
   * rules that's involved in an _LR(1) conflict_ that is _intended to exist_
   * in the grammar. When these conflicts occur at runtime, Tree-sitter will
   * use the GLR algorithm to explore all of the possible interpretations. If
   * _multiple_ parses end up succeeding, Tree-sitter will pick the subtree
   * whose corresponding rule has the highest total _dynamic precedence_.
   *
   * @param $ grammar rules
   */
  conflicts?: (
    $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
  ) => RuleOrLiteral[][];

  /**
   * An array of token names which can be returned by an _external scanner_.
   * External scanners allow you to write custom C code which runs during the
   * lexing process in order to handle lexical rules (e.g. Python's indentation
   * tokens) that cannot be described by regular expressions.
   *
   * @param $ grammar rules
   * @param previous array of externals from the base schema, if any
   *
   * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
   */
  externals?: (
    $: Record<string, SymbolRule<string>>,
    previous: Rule[],
  ) => SymbolRule<string>[];

  /**
   * An array of tokens that may appear anywhere in the language. This
   * is often used for whitespace and comments. The default value of
   * extras is to accept whitespace. To control whitespace explicitly,
   * specify extras: `$ => []` in your grammar.
   *
   * @param $ grammar rules
   */
  extras?: (
    $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
  ) => RuleOrLiteral[];

  /**
   * An array of rules that should be automatically removed from the
   * grammar by replacing all of their usages with a copy of their definition.
   * This is useful for rules that are used in multiple places but for which
   * you don't want to create syntax tree nodes at runtime.
   *
   * @param $ grammar rules
   */
  inline?: (
    $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
  ) => RuleOrLiteral[];

  /**
   * A list of hidden rule names that should be considered supertypes in the
   * generated node types file.
   *
   * @param $ grammar rules
   *
   * @see http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
   */
  supertypes?: (
    $: GrammarSymbols<RuleName | BaseGrammarRuleName>,
  ) => RuleOrLiteral[];

  /**
   * The name of a token that will match keywords for the purpose of the
   * keyword extraction optimization.
   *
   * @param $ grammar rules
   *
   * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction
   */
  word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral;
}
/**
 * The resolved form of a grammar as returned by `grammar()`: identical to
 * `Grammar`, except that the `rules` builders have been evaluated into
 * concrete serialized `Rule` objects.
 */
type GrammarSchema<RuleName extends string> = {
  [K in keyof Grammar<RuleName>]: K extends 'rules'
    ? Record<RuleName, Rule>
    : Grammar<RuleName>[K];
};
/**
 * Causes the given rule to appear with an alternative name in the syntax tree.
 * For instance with `alias($.foo, 'bar')`, the aliased rule will appear as an
 * anonymous node, as if the rule had been written as the simple string.
 *
 * @param rule rule that will be aliased
 * @param name target name for the alias
 */
declare function alias(rule: RuleOrLiteral, name: string): AliasRule;

/**
 * Causes the given rule to appear as an alternative named node, for instance
 * with `alias($.foo, $.bar)`, the aliased rule `foo` will appear as a named
 * node called `bar`.
 *
 * @param rule rule that will be aliased
 * @param symbol target symbol for the alias
 */
declare function alias(
  rule: RuleOrLiteral,
  symbol: SymbolRule<string>,
): AliasRule;

/**
 * Creates a blank rule, matching nothing.
 */
declare function blank(): BlankRule;

/**
 * Assigns a field name to the child node(s) matched by the given rule.
 * In the resulting syntax tree, you can then use that field name to
 * access specific children.
 *
 * @param name name of the field
 * @param rule rule the field should match
 */
declare function field(name: string, rule: RuleOrLiteral): FieldRule;

/**
 * Creates a rule that matches one of a set of possible rules. The order
 * of the arguments does not matter. This is analogous to the `|` (pipe)
 * operator in EBNF notation.
 *
 * @param options possible rule choices
 */
declare function choice(...options: RuleOrLiteral[]): ChoiceRule;
/**
 * Creates a rule that matches zero or one occurrence of a given rule.
 * It is analogous to the `[x]` (square bracket) syntax in EBNF notation.
 *
 * @param rule rule to be made optional
 */
declare function optional(rule: RuleOrLiteral): ChoiceRule;
/**
 * Marks the given rule with a numerical precedence which will be used to
 * resolve LR(1) conflicts at parser-generation time. When two rules overlap
 * in a way that represents either a true ambiguity or a _local_ ambiguity
 * given one token of lookahead, Tree-sitter will try to resolve the conflict by
 * matching the rule with the higher precedence. The default precedence of all
 * rules is zero. This works similarly to the precedence directives in Yacc grammars.
 *
 * @param number precedence weight
 * @param rule rule being weighted
 *
 * @see https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables
 * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
 */
declare const prec: {
  (number: number, rule: RuleOrLiteral): PrecRule;

  /**
   * Marks the given rule as left-associative (and optionally applies a
   * numerical precedence). When an LR(1) conflict arises in which all of the
   * rules have the same numerical precedence, Tree-sitter will consult the
   * rules' associativity. If there is a left-associative rule, Tree-sitter
   * will prefer matching a rule that ends _earlier_. This works similarly to
   * associativity directives in Yacc grammars.
   *
   * @param number (optional) precedence weight
   * @param rule rule to mark as left-associative
   *
   * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
   */
  left(rule: RuleOrLiteral): PrecLeftRule;
  left(number: number, rule: RuleOrLiteral): PrecLeftRule;

  /**
   * Marks the given rule as right-associative (and optionally applies a
   * numerical precedence). When an LR(1) conflict arises in which all of the
   * rules have the same numerical precedence, Tree-sitter will consult the
   * rules' associativity. If there is a right-associative rule, Tree-sitter
   * will prefer matching a rule that ends _later_. This works similarly to
   * associativity directives in Yacc grammars.
   *
   * @param number (optional) precedence weight
   * @param rule rule to mark as right-associative
   *
   * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
   */
  right(rule: RuleOrLiteral): PrecRightRule;
  right(number: number, rule: RuleOrLiteral): PrecRightRule;

  /**
   * Marks the given rule with a numerical precedence which will be used to
   * resolve LR(1) conflicts at _runtime_ instead of parser-generation time.
   * This is only necessary when handling a conflict dynamically using the
   * `conflicts` field in the grammar, and when there is a genuine _ambiguity_:
   * multiple rules correctly match a given piece of code. In that event,
   * Tree-sitter compares the total dynamic precedence associated with each
   * rule, and selects the one with the highest total. This is similar to
   * dynamic precedence directives in Bison grammars.
   *
   * @param number precedence weight
   * @param rule rule being weighted
   *
   * @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html
   */
  dynamic(number: number, rule: RuleOrLiteral): PrecDynamicRule;
};
/**
 * Creates a rule that matches _zero-or-more_ occurrences of a given rule.
 * It is analogous to the `{x}` (curly brace) syntax in EBNF notation. This
 * rule is implemented in terms of `repeat1` but is included because it
 * is very commonly used.
 *
 * @param rule rule to repeat, zero or more times
 */
declare function repeat(rule: RuleOrLiteral): RepeatRule;

/**
 * Creates a rule that matches one-or-more occurrences of a given rule.
 *
 * @param rule rule to repeat, one or more times
 */
declare function repeat1(rule: RuleOrLiteral): Repeat1Rule;

/**
 * Creates a rule that matches any number of other rules, one after another.
 * It is analogous to simply writing multiple symbols next to each other
 * in EBNF notation.
 *
 * @param rules ordered rules that comprise the sequence
 */
declare function seq(...rules: RuleOrLiteral[]): SeqRule;

/**
 * Creates a symbol rule, representing another rule in the grammar by name.
 *
 * @param name name of the target rule
 */
declare function sym<Name extends string>(name: Name): SymbolRule<Name>;
/**
 * Marks the given rule as producing only a single token. Tree-sitter's
 * default is to treat each String or RegExp literal in the grammar as a
 * separate token. Each token is matched separately by the lexer and
 * returned as its own leaf node in the tree. The token function allows
 * you to express a complex rule using the DSL functions (rather
 * than as a single regular expression) but still have Tree-sitter treat
 * it as a single token.
 *
 * @param rule rule to represent as a single token
 */
declare const token: {
  (rule: RuleOrLiteral): TokenRule;

  /**
   * Marks the given rule as producing an immediate token. This allows
   * the parser to produce a different token based on whether or not
   * there are `extras` preceding the token's main content. When there
   * are _no_ leading `extras`, an immediate token is preferred over a
   * normal token which would otherwise match.
   *
   * @param rule rule to represent as an immediate token
   */
  immediate(rule: RuleOrLiteral): ImmediateTokenRule;
};
/**
 * Creates a new language grammar with the provided schema.
 *
 * @param options grammar options
 */
declare function grammar<RuleName extends string>(
  options: Grammar<RuleName>,
): GrammarSchema<RuleName>;

/**
 * Extends an existing language grammar with the provided options,
 * creating a new language.
 *
 * @param baseGrammar base grammar schema to extend from
 * @param options grammar options for the new extended language
 */
declare function grammar<
  BaseGrammarRuleName extends string,
  RuleName extends string
>(
  baseGrammar: GrammarSchema<BaseGrammarRuleName>,
  options: Grammar<RuleName, BaseGrammarRuleName>,
): GrammarSchema<RuleName | BaseGrammarRuleName>;

View file

@ -1,6 +1,6 @@
{
"name": "tree-sitter-cli",
"version": "0.16.8",
"version": "0.16.9",
"author": "Max Brunsfeld",
"license": "MIT",
"repository": {

View file

@ -87,7 +87,7 @@ impl<'a> From<tree_sitter_highlight::Error> for Error {
impl<'a> From<tree_sitter_tags::Error> for Error {
fn from(error: tree_sitter_tags::Error) -> Self {
Error::new(format!("{:?}", error))
Error::new(format!("{}", error))
}
}

View file

@ -199,6 +199,9 @@ impl<'a> Minimizer<'a> {
right_state: &ParseState,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
if left_state.is_non_terminal_extra != right_state.is_non_terminal_extra {
return true;
}
for (token, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(token) {
if self.entries_conflict(

View file

@ -19,7 +19,7 @@ pub(crate) struct FieldInfo {
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct VariableInfo {
pub fields: HashMap<String, FieldInfo>,
pub child_types: Vec<ChildType>,
pub children: FieldInfo,
pub children_without_fields: FieldInfo,
pub has_multi_step_production: bool,
}
@ -70,7 +70,7 @@ impl Default for FieldInfoJSON {
impl Default for ChildQuantity {
fn default() -> Self {
Self::zero()
Self::one()
}
}
@ -158,7 +158,7 @@ pub(crate) fn get_variable_info(
// Each variable's summary can depend on the summaries of other hidden variables,
// and variables can have mutually recursive structure. So we compute the summaries
// iteratively, in a loop that terminates only when more changes are possible.
// iteratively, in a loop that terminates only when no more changes are possible.
let mut did_change = true;
let mut all_initialized = false;
let mut result = vec![VariableInfo::default(); syntax_grammar.variables.len()];
@ -168,13 +168,14 @@ pub(crate) fn get_variable_info(
for (i, variable) in syntax_grammar.variables.iter().enumerate() {
let mut variable_info = result[i].clone();
// Within a variable, consider each production separately. For each
// production, determine which children and fields can occur, and how many
// times they can occur.
for (production_index, production) in variable.productions.iter().enumerate() {
let mut field_quantities = HashMap::new();
let mut children_without_fields_quantity = ChildQuantity::zero();
let mut has_uninitialized_invisible_children = false;
// Examine each of the variable's productions. The variable's child types can be
// immediately combined across all productions, but the child quantities must be
// recorded separately for each production.
for production in &variable.productions {
let mut production_field_quantities = HashMap::new();
let mut production_children_quantity = ChildQuantity::zero();
let mut production_children_without_fields_quantity = ChildQuantity::zero();
let mut production_has_uninitialized_invisible_children = false;
if production.steps.len() > 1 {
variable_info.has_multi_step_production = true;
@ -190,111 +191,97 @@ pub(crate) fn get_variable_info(
ChildType::Normal(child_symbol)
};
// Record all of the types of direct children.
did_change |= sorted_vec_insert(&mut variable_info.child_types, &child_type);
let child_is_hidden = !child_type_is_visible(&child_type)
&& !syntax_grammar.supertype_symbols.contains(&child_symbol);
// Record all of the field names that occur.
// Maintain the set of all child types for this variable, and the quantity of
// visible children in this production.
did_change |=
extend_sorted(&mut variable_info.children.types, Some(&child_type));
if !child_is_hidden {
production_children_quantity.append(ChildQuantity::one());
}
// Maintain the set of child types associated with each field, and the quantity
// of children associated with each field in this production.
if let Some(field_name) = &step.field_name {
// Record how many times each field occurs in this production.
field_quantities
let field_info = variable_info
.fields
.entry(field_name.clone())
.or_insert(FieldInfo::default());
did_change |= extend_sorted(&mut field_info.types, Some(&child_type));
let production_field_quantity = production_field_quantities
.entry(field_name)
.or_insert(ChildQuantity::zero())
.append(ChildQuantity::one());
.or_insert(ChildQuantity::zero());
// Record the types of children for this field.
let field_info =
variable_info.fields.entry(field_name.clone()).or_insert({
let mut info = FieldInfo {
types: Vec::new(),
quantity: ChildQuantity::one(),
};
// If this field did *not* occur in an earlier production,
// then it is not required.
if production_index > 0 {
info.quantity.required = false;
}
info
});
did_change |= sorted_vec_insert(&mut field_info.types, &child_type);
}
// Record named children without fields.
else if child_type_is_named(&child_type) {
// Record how many named children without fields occur in this production.
children_without_fields_quantity.append(ChildQuantity::one());
// Record the types of all of the named children without fields.
let children_info = &mut variable_info.children_without_fields;
if children_info.types.is_empty() {
children_info.quantity = ChildQuantity::one();
// Inherit the types and quantities of hidden children associated with fields.
if child_is_hidden && child_symbol.is_non_terminal() {
let child_variable_info = &result[child_symbol.index];
did_change |= extend_sorted(
&mut field_info.types,
&child_variable_info.children.types,
);
production_field_quantity.append(child_variable_info.children.quantity);
} else {
production_field_quantity.append(ChildQuantity::one());
}
did_change |= sorted_vec_insert(&mut children_info.types, &child_type);
}
// Maintain the set of named children without fields within this variable.
else if child_type_is_named(&child_type) {
production_children_without_fields_quantity.append(ChildQuantity::one());
did_change |= extend_sorted(
&mut variable_info.children_without_fields.types,
Some(&child_type),
);
}
// Inherit information from any hidden children.
if child_symbol.is_non_terminal()
&& !syntax_grammar.supertype_symbols.contains(&child_symbol)
&& step.alias.is_none()
&& !child_type_is_visible(&child_type)
{
// Inherit all child information from hidden children.
if child_is_hidden && child_symbol.is_non_terminal() {
let child_variable_info = &result[child_symbol.index];
// If a hidden child can have multiple children, then this
// node can appear to have multiple children.
// If a hidden child can have multiple children, then its parent node can
// appear to have multiple children.
if child_variable_info.has_multi_step_production {
variable_info.has_multi_step_production = true;
}
// Inherit fields from this hidden child
// If a hidden child has fields, then the parent node can appear to have
// those same fields.
for (field_name, child_field_info) in &child_variable_info.fields {
field_quantities
production_field_quantities
.entry(field_name)
.or_insert(ChildQuantity::zero())
.append(child_field_info.quantity);
let field_info = variable_info
.fields
.entry(field_name.clone())
.or_insert(FieldInfo {
types: Vec::new(),
quantity: ChildQuantity::one(),
});
for child_type in &child_field_info.types {
sorted_vec_insert(&mut field_info.types, &child_type);
}
did_change |= extend_sorted(
&mut variable_info
.fields
.entry(field_name.clone())
.or_insert(FieldInfo::default())
.types,
&child_field_info.types,
);
}
// Inherit child types from this hidden child
for child_type in &child_variable_info.child_types {
did_change |=
sorted_vec_insert(&mut variable_info.child_types, child_type);
}
// If a hidden child has children, then the parent node can appear to have
// those same children.
production_children_quantity.append(child_variable_info.children.quantity);
did_change |= extend_sorted(
&mut variable_info.children.types,
&child_variable_info.children.types,
);
// If any field points to this hidden child, inherit child types
// for the field.
if let Some(field_name) = &step.field_name {
let field_info = variable_info.fields.get_mut(field_name).unwrap();
for child_type in &child_variable_info.child_types {
did_change |= sorted_vec_insert(&mut field_info.types, &child_type);
}
}
// Inherit info about children without fields from this hidden child.
else {
// If a hidden child can have named children without fields, then the parent
// node can appear to have those same children.
if step.field_name.is_none() {
let grandchildren_info = &child_variable_info.children_without_fields;
if !grandchildren_info.types.is_empty() {
children_without_fields_quantity
.append(grandchildren_info.quantity);
if variable_info.children_without_fields.types.is_empty() {
variable_info.children_without_fields.quantity =
ChildQuantity::one();
}
for child_type in &grandchildren_info.types {
did_change |= sorted_vec_insert(
&mut variable_info.children_without_fields.types,
&child_type,
);
}
production_children_without_fields_quantity
.append(child_variable_info.children_without_fields.quantity);
did_change |= extend_sorted(
&mut variable_info.children_without_fields.types,
&child_variable_info.children_without_fields.types,
);
}
}
}
@ -302,22 +289,27 @@ pub(crate) fn get_variable_info(
// Note whether or not this production contains children whose summaries
// have not yet been computed.
if child_symbol.index >= i && !all_initialized {
has_uninitialized_invisible_children = true;
production_has_uninitialized_invisible_children = true;
}
}
// If this production's children all have had their summaries initialized,
// then expand the quantity information with all of the possibilities introduced
// by this production.
if !has_uninitialized_invisible_children {
if !production_has_uninitialized_invisible_children {
did_change |= variable_info
.children
.quantity
.union(production_children_quantity);
did_change |= variable_info
.children_without_fields
.quantity
.union(children_without_fields_quantity);
.union(production_children_without_fields_quantity);
for (field_name, info) in variable_info.fields.iter_mut() {
did_change |= info.quantity.union(
field_quantities
production_field_quantities
.get(field_name)
.cloned()
.unwrap_or(ChildQuantity::zero()),
@ -352,13 +344,15 @@ pub(crate) fn get_variable_info(
// Update all of the node type lists to eliminate hidden nodes.
for supertype_symbol in &syntax_grammar.supertype_symbols {
result[supertype_symbol.index]
.child_types
.children
.types
.retain(child_type_is_visible);
}
for variable_info in result.iter_mut() {
for (_, field_info) in variable_info.fields.iter_mut() {
field_info.types.retain(child_type_is_visible);
}
variable_info.fields.retain(|_, v| !v.types.is_empty());
variable_info
.children_without_fields
.types
@ -467,7 +461,8 @@ pub(crate) fn generate_node_types_json(
subtypes: None,
});
let mut subtypes = info
.child_types
.children
.types
.iter()
.map(child_type_to_node_type)
.collect::<Vec<_>>();
@ -686,16 +681,19 @@ fn variable_type_for_child_type(
}
}
fn sorted_vec_insert<T>(vec: &mut Vec<T>, value: &T) -> bool
fn extend_sorted<'a, T>(vec: &mut Vec<T>, values: impl IntoIterator<Item = &'a T>) -> bool
where
T: Clone + Eq + Ord,
T: 'a,
{
if let Err(i) = vec.binary_search(&value) {
vec.insert(i, value.clone());
true
} else {
false
}
values.into_iter().any(|value| {
if let Err(i) = vec.binary_search(&value) {
vec.insert(i, value.clone());
true
} else {
false
}
})
}
#[cfg(test)]
@ -1177,6 +1175,38 @@ mod tests {
);
}
#[test]
fn test_node_types_with_fields_on_hidden_tokens() {
let node_types = get_node_types(InputGrammar {
name: String::new(),
extra_symbols: Vec::new(),
external_tokens: Vec::new(),
expected_conflicts: Vec::new(),
variables_to_inline: Vec::new(),
word_token: None,
supertype_symbols: vec![],
variables: vec![Variable {
name: "script".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::field("a".to_string(), Rule::pattern("hi")),
Rule::field("b".to_string(), Rule::pattern("bye")),
]),
}],
});
assert_eq!(
node_types,
[NodeInfoJSON {
kind: "script".to_string(),
named: true,
fields: Some(BTreeMap::new()),
children: None,
subtypes: None
}]
);
}
#[test]
fn test_node_types_with_multiple_rules_same_alias_name() {
let node_types = get_node_types(InputGrammar {
@ -1461,6 +1491,71 @@ mod tests {
);
}
#[test]
fn test_get_variable_info_with_repetitions_inside_fields() {
let variable_info = get_variable_info(
&build_syntax_grammar(
vec![
// Field associated with a repetition.
SyntaxVariable {
name: "rule0".to_string(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::non_terminal(1))
.with_field_name("field1")],
},
Production {
dynamic_precedence: 0,
steps: vec![],
},
],
},
// Repetition node
SyntaxVariable {
name: "_rule0_repeat".to_string(),
kind: VariableType::Hidden,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(1))],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(1)),
],
},
],
},
],
vec![],
),
&build_lexical_grammar(),
&AliasMap::new(),
)
.unwrap();
assert_eq!(
variable_info[0].fields,
vec![(
"field1".to_string(),
FieldInfo {
quantity: ChildQuantity {
exists: true,
required: false,
multiple: true,
},
types: vec![ChildType::Normal(Symbol::terminal(1))],
}
)]
.into_iter()
.collect::<HashMap<_, _>>()
);
}
#[test]
fn test_get_variable_info_with_inherited_fields() {
let variable_info = get_variable_info(

View file

@ -127,6 +127,9 @@ impl InlinedProductionMapBuilder {
last_inserted_step.associativity = removed_step.associativity;
}
}
if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() {
production.dynamic_precedence = p.dynamic_precedence;
}
production
}),
);
@ -226,7 +229,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![ProductionStep::new(Symbol::terminal(14))],
},
],
@ -258,7 +261,7 @@ mod tests {
],
},
Production {
dynamic_precedence: 0,
dynamic_precedence: -2,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(14)),

View file

@ -160,7 +160,9 @@ impl Loader {
// If multiple language configurations match, then determine which
// one to use by applying the configurations' content regexes.
else {
let file_contents = fs::read_to_string(path)?;
let file_contents = fs::read(path)
.map_err(Error::wrap(|| format!("Failed to read path {:?}", path)))?;
let file_contents = String::from_utf8_lossy(&file_contents);
let mut best_score = -2isize;
let mut best_configuration_id = None;
for configuration_id in configuration_ids {

View file

@ -53,11 +53,12 @@ fn run() -> error::Result<()> {
.subcommand(
SubCommand::with_name("parse")
.about("Parse files")
.arg(Arg::with_name("paths-file").long("paths").takes_value(true))
.arg(
Arg::with_name("path")
Arg::with_name("paths")
.index(1)
.multiple(true)
.required(true),
.required(false),
)
.arg(Arg::with_name("scope").long("scope").takes_value(true))
.arg(Arg::with_name("debug").long("debug").short("d"))
@ -79,37 +80,33 @@ fn run() -> error::Result<()> {
SubCommand::with_name("query")
.about("Search files using a syntax tree query")
.arg(Arg::with_name("query-path").index(1).required(true))
.arg(Arg::with_name("paths-file").long("paths").takes_value(true))
.arg(
Arg::with_name("path")
Arg::with_name("paths")
.index(2)
.multiple(true)
.required(true),
.required(false),
)
.arg(
Arg::with_name("byte-range")
.help("The range of byte offsets in which the query will be executed")
.long("byte-range")
.takes_value(true),
)
.arg(Arg::with_name("scope").long("scope").takes_value(true))
.arg(Arg::with_name("captures").long("captures").short("c")),
)
.subcommand(
SubCommand::with_name("tags")
.arg(
Arg::with_name("format")
.short("f")
.long("format")
.value_name("json|protobuf")
.help("Determine output format (default: json)"),
)
.arg(Arg::with_name("quiet").long("quiet").short("q"))
.arg(Arg::with_name("time").long("time").short("t"))
.arg(Arg::with_name("scope").long("scope").takes_value(true))
.arg(Arg::with_name("paths-file").long("paths").takes_value(true))
.arg(
Arg::with_name("inputs")
Arg::with_name("paths")
.help("The source file to use")
.index(1)
.required(true)
.multiple(true),
)
.arg(
Arg::with_name("v")
.short("v")
.multiple(true)
.help("Sets the level of verbosity"),
),
)
.subcommand(
@ -127,11 +124,12 @@ fn run() -> error::Result<()> {
.subcommand(
SubCommand::with_name("highlight")
.about("Highlight a file")
.arg(Arg::with_name("paths-file").long("paths").takes_value(true))
.arg(
Arg::with_name("path")
Arg::with_name("paths")
.index(1)
.multiple(true)
.required(true),
.required(false),
)
.arg(Arg::with_name("scope").long("scope").takes_value(true))
.arg(Arg::with_name("html").long("html").short("h"))
@ -230,7 +228,9 @@ fn run() -> error::Result<()> {
let timeout = matches
.value_of("timeout")
.map_or(0, |t| u64::from_str_radix(t, 10).unwrap());
let paths = collect_paths(matches.values_of("path").unwrap())?;
let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?;
let max_path_length = paths.iter().map(|p| p.chars().count()).max().unwrap();
let mut has_error = false;
loader.find_all_languages(&config.parser_directories)?;
@ -256,31 +256,36 @@ fn run() -> error::Result<()> {
}
} else if let Some(matches) = matches.subcommand_matches("query") {
let ordered_captures = matches.values_of("captures").is_some();
let paths = matches
.values_of("path")
.unwrap()
.into_iter()
.map(Path::new)
.collect::<Vec<&Path>>();
let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?;
loader.find_all_languages(&config.parser_directories)?;
let language = select_language(
&mut loader,
paths[0],
Path::new(&paths[0]),
&current_dir,
matches.value_of("scope"),
)?;
let query_path = Path::new(matches.value_of("query-path").unwrap());
query::query_files_at_paths(language, paths, query_path, ordered_captures)?;
let range = matches.value_of("byte-range").map(|br| {
let r: Vec<&str> = br.split(":").collect();
(r[0].parse().unwrap(), r[1].parse().unwrap())
});
query::query_files_at_paths(language, paths, query_path, ordered_captures, range)?;
} else if let Some(matches) = matches.subcommand_matches("tags") {
loader.find_all_languages(&config.parser_directories)?;
let paths = collect_paths(matches.values_of("inputs").unwrap())?;
tags::generate_tags(&loader, matches.value_of("scope"), &paths)?;
let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?;
tags::generate_tags(
&loader,
matches.value_of("scope"),
&paths,
matches.is_present("quiet"),
matches.is_present("time"),
)?;
} else if let Some(matches) = matches.subcommand_matches("highlight") {
loader.configure_highlights(&config.theme.highlight_names);
loader.find_all_languages(&config.parser_directories)?;
let time = matches.is_present("time");
let paths = collect_paths(matches.values_of("path").unwrap())?;
let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?;
let html_mode = matches.is_present("html");
if html_mode {
println!("{}", highlight::HTML_HEADER);
@ -353,39 +358,58 @@ fn run() -> error::Result<()> {
Ok(())
}
fn collect_paths<'a>(paths: impl Iterator<Item = &'a str>) -> error::Result<Vec<String>> {
let mut result = Vec::new();
fn collect_paths<'a>(
paths_file: Option<&str>,
paths: Option<impl Iterator<Item = &'a str>>,
) -> error::Result<Vec<String>> {
if let Some(paths_file) = paths_file {
return Ok(fs::read_to_string(paths_file)
.map_err(Error::wrap(|| {
format!("Failed to read paths file {}", paths_file)
}))?
.trim()
.split_ascii_whitespace()
.map(String::from)
.collect::<Vec<_>>());
}
let mut incorporate_path = |path: &str, positive| {
if positive {
result.push(path.to_string());
} else {
if let Some(index) = result.iter().position(|p| p == path) {
result.remove(index);
if let Some(paths) = paths {
let mut result = Vec::new();
let mut incorporate_path = |path: &str, positive| {
if positive {
result.push(path.to_string());
} else {
if let Some(index) = result.iter().position(|p| p == path) {
result.remove(index);
}
}
}
};
};
for mut path in paths {
let mut positive = true;
if path.starts_with("!") {
positive = false;
path = path.trim_start_matches("!");
}
for mut path in paths {
let mut positive = true;
if path.starts_with("!") {
positive = false;
path = path.trim_start_matches("!");
}
if Path::new(path).exists() {
incorporate_path(path, positive);
} else {
let paths =
glob(path).map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?;
for path in paths {
if let Some(path) = path?.to_str() {
incorporate_path(path, positive);
if Path::new(path).exists() {
incorporate_path(path, positive);
} else {
let paths = glob(path)
.map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?;
for path in paths {
if let Some(path) = path?.to_str() {
incorporate_path(path, positive);
}
}
}
}
return Ok(result);
}
Ok(result)
Err(Error::new("Must provide one or more paths".to_string()))
}
fn select_language(

View file

@ -6,9 +6,10 @@ use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
pub fn query_files_at_paths(
language: Language,
paths: Vec<&Path>,
paths: Vec<String>,
query_path: &Path,
ordered_captures: bool,
range: Option<(usize, usize)>,
) -> Result<()> {
let stdout = io::stdout();
let mut stdout = stdout.lock();
@ -20,14 +21,17 @@ pub fn query_files_at_paths(
.map_err(|e| Error::new(format!("Query compilation failed: {:?}", e)))?;
let mut query_cursor = QueryCursor::new();
if let Some((beg, end)) = range {
query_cursor.set_byte_range(beg, end);
}
let mut parser = Parser::new();
parser.set_language(language).map_err(|e| e.to_string())?;
for path in paths {
writeln!(&mut stdout, "{}", path.to_str().unwrap())?;
writeln!(&mut stdout, "{}", path)?;
let source_code = fs::read(path).map_err(Error::wrap(|| {
let source_code = fs::read(&path).map_err(Error::wrap(|| {
format!("Error reading source file {:?}", path)
}))?;
let text_callback = |n: Node| &source_code[n.byte_range()];

View file

@ -3,10 +3,17 @@ use super::util;
use crate::error::{Error, Result};
use std::io::{self, Write};
use std::path::Path;
use std::time::Instant;
use std::{fs, str};
use tree_sitter_tags::TagsContext;
pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> Result<()> {
pub fn generate_tags(
loader: &Loader,
scope: Option<&str>,
paths: &[String],
quiet: bool,
time: bool,
) -> Result<()> {
let mut lang = None;
if let Some(scope) = scope {
lang = loader.language_configuration_for_scope(scope)?;
@ -34,28 +41,50 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) ->
};
if let Some(tags_config) = language_config.tags_config(language)? {
let path_str = format!("{:?}", path);
writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?;
let indent;
if paths.len() > 1 {
if !quiet {
writeln!(&mut stdout, "{}", path.to_string_lossy())?;
}
indent = "\t"
} else {
indent = "";
};
let source = fs::read(path)?;
for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? {
let t0 = Instant::now();
for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))?.0 {
let tag = tag?;
write!(
&mut stdout,
" {:<8} {:<40}\t{:>9}-{:<9}",
tag.kind,
str::from_utf8(&source[tag.name_range]).unwrap_or(""),
tag.span.start,
tag.span.end,
)?;
if let Some(docs) = tag.docs {
if docs.len() > 120 {
write!(&mut stdout, "\t{:?}...", &docs[0..120])?;
} else {
write!(&mut stdout, "\t{:?}", &docs)?;
if !quiet {
write!(
&mut stdout,
"{}{:<10}\t | {:<8}\t{} {} - {} `{}`",
indent,
str::from_utf8(&source[tag.name_range]).unwrap_or(""),
&tags_config.syntax_type_name(tag.syntax_type_id),
if tag.is_definition { "def" } else { "ref" },
tag.span.start,
tag.span.end,
str::from_utf8(&source[tag.line_range]).unwrap_or(""),
)?;
if let Some(docs) = tag.docs {
if docs.len() > 120 {
write!(&mut stdout, "\t{:?}...", &docs[0..120])?;
} else {
write!(&mut stdout, "\t{:?}", &docs)?;
}
}
writeln!(&mut stdout, "")?;
}
writeln!(&mut stdout, "")?;
}
if time {
writeln!(
&mut stdout,
"{}time: {}ms",
indent,
t0.elapsed().as_millis(),
)?;
}
} else {
eprintln!("No tags config found for path {:?}", path);

View file

@ -3,6 +3,7 @@ mod helpers;
mod highlight_test;
mod node_test;
mod parser_test;
mod pathological_test;
mod query_test;
mod tags_test;
mod test_highlight_test;

View file

@ -0,0 +1,15 @@
use super::helpers::allocations;
use super::helpers::fixtures::get_language;
use tree_sitter::Parser;
#[test]
fn test_pathological_example_1() {
let language = "cpp";
let source = r#"*ss<s"ss<sqXqss<s._<s<sq<(qqX<sqss<s.ss<sqsssq<(qss<qssqXqss<s._<s<sq<(qqX<sqss<s.ss<sqsssq<(qss<sqss<sqss<s._<s<sq>(qqX<sqss<s.ss<sqsssq<(qss<sq&=ss<s<sqss<s._<s<sq<(qqX<sqss<s.ss<sqs"#;
allocations::record(|| {
let mut parser = Parser::new();
parser.set_language(get_language(language)).unwrap();
parser.parse(source, None).unwrap();
});
}

View file

@ -408,7 +408,7 @@ fn test_query_matches_with_many_overlapping_results() {
)
.unwrap();
let count = 80;
let count = 1024;
// Deeply nested chained function calls:
// a
@ -573,8 +573,8 @@ fn test_query_matches_with_immediate_siblings() {
&[
(0, vec![("parent", "a"), ("child", "b")]),
(0, vec![("parent", "b"), ("child", "c")]),
(1, vec![("last-child", "d")]),
(0, vec![("parent", "c"), ("child", "d")]),
(1, vec![("last-child", "d")]),
(2, vec![("first-element", "w")]),
(2, vec![("first-element", "1")]),
],
@ -758,6 +758,55 @@ fn test_query_matches_with_nested_repetitions() {
});
}
#[test]
fn test_query_matches_with_multiple_repetition_patterns_that_intersect_other_pattern() {
allocations::record(|| {
let language = get_language("javascript");
// When this query sees a comment, it must keep track of several potential
// matches: up to two for each pattern that begins with a comment.
let query = Query::new(
language,
r#"
(call_expression
function: (member_expression
property: (property_identifier) @name)) @ref.method
((comment)* @doc (function_declaration))
((comment)* @doc (generator_function_declaration))
((comment)* @doc (class_declaration))
((comment)* @doc (lexical_declaration))
((comment)* @doc (variable_declaration))
((comment)* @doc (method_definition))
(comment) @comment
"#,
)
.unwrap();
// Here, a series of comments occurs in the middle of a match of the first
// pattern. To avoid exceeding the storage limits and discarding that outer
// match, the comment-related matches need to be managed efficiently.
let source = format!(
"theObject\n{}\n.theMethod()",
" // the comment\n".repeat(64)
);
assert_query_matches(
language,
&query,
&source,
&vec![(7, vec![("comment", "// the comment")]); 64]
.into_iter()
.chain(vec![(
0,
vec![("ref.method", source.as_str()), ("name", "theMethod")],
)])
.collect::<Vec<_>>(),
);
});
}
#[test]
fn test_query_matches_with_leading_zero_or_more_repeated_leaf_nodes() {
allocations::record(|| {
@ -1161,6 +1210,43 @@ fn test_query_matches_with_too_many_permutations_to_track() {
});
}
#[test]
fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() {
allocations::record(|| {
let language = get_language("javascript");
let query = Query::new(
language,
"
(
(comment) @doc
; not immediate
(class_declaration) @class
)
(call_expression
function: [
(identifier) @function
(member_expression property: (property_identifier) @method)
])
",
)
.unwrap();
let source = "/* hi */ a.b(); ".repeat(50);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
let tree = parser.parse(&source, None).unwrap();
let mut cursor = QueryCursor::new();
let matches = cursor.matches(&query, tree.root_node(), to_callback(&source));
assert_eq!(
collect_matches(matches, &query, source.as_str()),
vec![(1, vec![("method", "b")]); 50],
);
});
}
#[test]
fn test_query_matches_with_anonymous_tokens() {
allocations::record(|| {
@ -1215,6 +1301,45 @@ fn test_query_matches_within_byte_range() {
});
}
#[test]
fn test_query_captures_within_byte_range() {
allocations::record(|| {
let language = get_language("c");
let query = Query::new(
language,
"
(call_expression
function: (identifier) @function
arguments: (argument_list (string_literal) @string.arg))
(string_literal) @string
",
)
.unwrap();
let source = r#"DEFUN ("safe-length", Fsafe_length, Ssafe_length, 1, 1, 0)"#;
let mut parser = Parser::new();
parser.set_language(language).unwrap();
let tree = parser.parse(&source, None).unwrap();
let mut cursor = QueryCursor::new();
let captures =
cursor
.set_byte_range(3, 27)
.captures(&query, tree.root_node(), to_callback(source));
assert_eq!(
collect_captures(captures, &query, source),
&[
("function", "DEFUN"),
("string.arg", "\"safe-length\""),
("string", "\"safe-length\""),
]
);
});
}
#[test]
fn test_query_matches_different_queries_same_cursor() {
allocations::record(|| {
@ -1420,12 +1545,17 @@ fn test_query_captures_with_text_conditions() {
((identifier) @function.builtin
(#eq? @function.builtin "require"))
(identifier) @variable
((identifier) @variable
(#not-match? @variable "^(lambda|load)$"))
"#,
)
.unwrap();
let source = "
toad
load
panda
lambda
const ab = require('./ab');
new Cd(EF);
";
@ -1439,6 +1569,8 @@ fn test_query_captures_with_text_conditions() {
assert_eq!(
collect_captures(captures, &query, source),
&[
("variable", "toad"),
("variable", "panda"),
("variable", "ab"),
("function.builtin", "require"),
("variable", "require"),
@ -2074,6 +2206,39 @@ fn test_query_disable_pattern() {
});
}
#[test]
fn test_query_alternative_predicate_prefix() {
allocations::record(|| {
let language = get_language("c");
let query = Query::new(
language,
r#"
((call_expression
function: (identifier) @keyword
arguments: (argument_list
(string_literal) @function))
(.eq? @keyword "DEFUN"))
"#,
)
.unwrap();
let source = r#"
DEFUN ("identity", Fidentity, Sidentity, 1, 1, 0,
doc: /* Return the argument unchanged. */
attributes: const)
(Lisp_Object arg)
{
return arg;
}
"#;
assert_query_matches(
language,
&query,
source,
&[(0, vec![("keyword", "DEFUN"), ("function", "\"identity\"")])],
);
});
}
#[test]
fn test_query_is_definite() {
struct Row {
@ -2086,10 +2251,7 @@ fn test_query_is_definite() {
Row {
language: get_language("python"),
pattern: r#"(expression_statement (string))"#,
results_by_symbol: &[
("expression_statement", false),
("string", false),
],
results_by_symbol: &[("expression_statement", false), ("string", false)],
},
Row {
language: get_language("javascript"),
@ -2102,30 +2264,17 @@ fn test_query_is_definite() {
Row {
language: get_language("javascript"),
pattern: r#"(object "{" "}")"#,
results_by_symbol: &[
("object", false),
("{", true),
("}", true),
],
results_by_symbol: &[("object", false), ("{", true), ("}", true)],
},
Row {
language: get_language("javascript"),
pattern: r#"(pair (property_identifier) ":")"#,
results_by_symbol: &[
("pair", false),
("property_identifier", false),
(":", true),
],
results_by_symbol: &[("pair", false), ("property_identifier", false), (":", true)],
},
Row {
language: get_language("javascript"),
pattern: r#"(object "{" (_) "}")"#,
results_by_symbol: &[
("object", false),
("{", false),
("", false),
("}", true),
],
results_by_symbol: &[("object", false), ("{", false), ("", false), ("}", true)],
},
Row {
language: get_language("javascript"),

View file

@ -1,73 +1,81 @@
use super::helpers::allocations;
use super::helpers::fixtures::{get_language, get_language_queries_path};
use std::ffi::CStr;
use std::ffi::CString;
use std::{fs, ptr, slice, str};
use tree_sitter::Point;
use tree_sitter_tags::c_lib as c;
use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext};
use tree_sitter_tags::{Error, TagsConfiguration, TagsContext};
const PYTHON_TAG_QUERY: &'static str = r#"
(
(function_definition
name: (identifier) @name
body: (block . (expression_statement (string) @doc))) @function
(#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")
(function_definition
name: (identifier) @name
body: (block . (expression_statement (string) @doc))) @definition.function
(#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")
)
(function_definition
name: (identifier) @name) @function
name: (identifier) @name) @definition.function
(
(class_definition
name: (identifier) @name
body: (block
. (expression_statement (string) @doc))) @class
(#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")
(class_definition
name: (identifier) @name
body: (block
. (expression_statement (string) @doc))) @definition.class
(#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")
)
(class_definition
name: (identifier) @name) @class
name: (identifier) @name) @definition.class
(call
function: (identifier) @name) @call
function: (identifier) @name) @reference.call
(call
function: (attribute
attribute: (identifier) @name)) @reference.call
"#;
const JS_TAG_QUERY: &'static str = r#"
(
(comment)* @doc .
(class_declaration
name: (identifier) @name) @class
(#select-adjacent! @doc @class)
name: (identifier) @name) @definition.class
(#select-adjacent! @doc @definition.class)
(#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")
)
(
(comment)* @doc .
(method_definition
name: (property_identifier) @name) @method
(#select-adjacent! @doc @method)
name: (property_identifier) @name) @definition.method
(#select-adjacent! @doc @definition.method)
(#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")
)
(
(comment)* @doc .
(function_declaration
name: (identifier) @name) @function
(#select-adjacent! @doc @function)
name: (identifier) @name) @definition.function
(#select-adjacent! @doc @definition.function)
(#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")
)
(call_expression
function: (identifier) @name) @call
function: (identifier) @name) @reference.call
"#;
const RUBY_TAG_QUERY: &'static str = r#"
(method
name: (identifier) @name) @method
name: (_) @name) @definition.method
(method_call
method: (identifier) @name) @call
method: (identifier) @name) @reference.call
((identifier) @name @call
(setter (identifier) @ignore)
((identifier) @name @reference.call
(#is-not? local))
"#;
@ -94,25 +102,26 @@ fn test_tags_python() {
let tags = tag_context
.generate_tags(&tags_config, source, None)
.unwrap()
.0
.collect::<Result<Vec<_>, _>>()
.unwrap();
assert_eq!(
tags.iter()
.map(|t| (substr(source, &t.name_range), t.kind))
.map(|t| (
substr(source, &t.name_range),
tags_config.syntax_type_name(t.syntax_type_id)
))
.collect::<Vec<_>>(),
&[
("Customer", TagKind::Class),
("age", TagKind::Function),
("compute_age", TagKind::Call),
("Customer", "class"),
("age", "function"),
("compute_age", "call"),
]
);
assert_eq!(substr(source, &tags[0].line_range), " class Customer:");
assert_eq!(
substr(source, &tags[1].line_range),
" def age(self):"
);
assert_eq!(substr(source, &tags[0].line_range), "class Customer:");
assert_eq!(substr(source, &tags[1].line_range), "def age(self):");
assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer");
assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age");
}
@ -145,17 +154,22 @@ fn test_tags_javascript() {
let tags = tag_context
.generate_tags(&tags_config, source, None)
.unwrap()
.0
.collect::<Result<Vec<_>, _>>()
.unwrap();
assert_eq!(
tags.iter()
.map(|t| (substr(source, &t.name_range), t.kind))
.map(|t| (
substr(source, &t.name_range),
t.span.clone(),
tags_config.syntax_type_name(t.syntax_type_id)
))
.collect::<Vec<_>>(),
&[
("Customer", TagKind::Class),
("getAge", TagKind::Method),
("Agent", TagKind::Class)
("Customer", Point::new(5, 10)..Point::new(5, 18), "class",),
("getAge", Point::new(9, 8)..Point::new(9, 14), "method",),
("Agent", Point::new(15, 10)..Point::new(15, 15), "class",)
]
);
assert_eq!(
@ -166,6 +180,27 @@ fn test_tags_javascript() {
assert_eq!(tags[2].docs, None);
}
#[test]
fn test_tags_columns_measured_in_utf16_code_units() {
let language = get_language("python");
let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap();
let mut tag_context = TagsContext::new();
let source = r#""❤️❤️❤️".hello_α_ω()"#.as_bytes();
let tag = tag_context
.generate_tags(&tags_config, source, None)
.unwrap()
.0
.next()
.unwrap()
.unwrap();
assert_eq!(substr(source, &tag.name_range), "hello_α");
assert_eq!(tag.span, Point::new(0, 21)..Point::new(0, 32));
assert_eq!(tag.utf16_column_range, 9..18);
}
#[test]
fn test_tags_ruby() {
let language = get_language("ruby");
@ -177,7 +212,7 @@ fn test_tags_ruby() {
"
b = 1
def foo()
def foo=()
c = 1
# a is a method because it is not in scope
@ -197,6 +232,7 @@ fn test_tags_ruby() {
let tags = tag_context
.generate_tags(&tags_config, source.as_bytes(), None)
.unwrap()
.0
.collect::<Result<Vec<_>, _>>()
.unwrap();
@ -204,18 +240,18 @@ fn test_tags_ruby() {
tags.iter()
.map(|t| (
substr(source.as_bytes(), &t.name_range),
t.kind,
tags_config.syntax_type_name(t.syntax_type_id),
(t.span.start.row, t.span.start.column),
))
.collect::<Vec<_>>(),
&[
("foo", TagKind::Method, (2, 0)),
("bar", TagKind::Call, (7, 4)),
("a", TagKind::Call, (7, 8)),
("b", TagKind::Call, (7, 11)),
("each", TagKind::Call, (9, 14)),
("baz", TagKind::Call, (13, 8)),
("b", TagKind::Call, (13, 15),),
("foo=", "method", (2, 4)),
("bar", "call", (7, 4)),
("a", "call", (7, 8)),
("b", "call", (7, 11)),
("each", "call", (9, 14)),
("baz", "call", (13, 8)),
("b", "call", (13, 15),),
]
);
}
@ -239,7 +275,7 @@ fn test_tags_cancellation() {
.generate_tags(&tags_config, source.as_bytes(), Some(&cancellation_flag))
.unwrap();
for (i, tag) in tags.enumerate() {
for (i, tag) in tags.0.enumerate() {
if i == 150 {
cancellation_flag.store(1, Ordering::SeqCst);
}
@ -253,6 +289,47 @@ fn test_tags_cancellation() {
});
}
#[test]
fn test_invalid_capture() {
let language = get_language("python");
let e = TagsConfiguration::new(language, "(identifier) @method", "")
.expect_err("expected InvalidCapture error");
assert_eq!(e, Error::InvalidCapture("method".to_string()));
}
#[test]
fn test_tags_with_parse_error() {
let language = get_language("python");
let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap();
let mut tag_context = TagsContext::new();
let source = br#"
class Fine: pass
class Bad
"#;
let (tags, failed) = tag_context
.generate_tags(&tags_config, source, None)
.unwrap();
let newtags = tags.collect::<Result<Vec<_>, _>>().unwrap();
assert!(failed, "syntax error should have been detected");
assert_eq!(
newtags.iter()
.map(|t| (
substr(source, &t.name_range),
tags_config.syntax_type_name(t.syntax_type_id)
))
.collect::<Vec<_>>(),
&[
("Fine", "class"),
]
);
}
#[test]
fn test_tags_via_c_api() {
allocations::record(|| {
@ -316,29 +393,29 @@ fn test_tags_via_c_api() {
})
.unwrap();
let syntax_types: Vec<&str> = unsafe {
let mut len: u32 = 0;
let ptr =
c::ts_tagger_syntax_kinds_for_scope_name(tagger, c_scope_name.as_ptr(), &mut len);
slice::from_raw_parts(ptr, len as usize)
.iter()
.map(|i| CStr::from_ptr(*i).to_str().unwrap())
.collect()
};
assert_eq!(
tags.iter()
.map(|tag| (
tag.kind,
syntax_types[tag.syntax_type_id as usize],
&source_code[tag.name_start_byte as usize..tag.name_end_byte as usize],
&source_code[tag.line_start_byte as usize..tag.line_end_byte as usize],
&docs[tag.docs_start_byte as usize..tag.docs_end_byte as usize],
))
.collect::<Vec<_>>(),
&[
(
c::TSTagKind::Function,
"b",
"function b() {",
"one\ntwo\nthree"
),
(
c::TSTagKind::Class,
"C",
"class C extends D {",
"four\nfive"
),
(c::TSTagKind::Call, "b", "b(a);", "")
("function", "b", "function b() {", "one\ntwo\nthree"),
("class", "C", "class C extends D {", "four\nfive"),
("call", "b", "b(a);", "")
]
);

View file

@ -1,3 +1,4 @@
use super::error::{Error, Result};
use std::io;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
@ -31,12 +32,12 @@ pub struct LogSession();
pub struct LogSession(PathBuf, Option<Child>, Option<ChildStdin>);
#[cfg(windows)]
pub fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result<LogSession> {
pub fn log_graphs(_parser: &mut Parser, _path: &str) -> Result<LogSession> {
Ok(LogSession())
}
#[cfg(unix)]
pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result<LogSession> {
pub fn log_graphs(parser: &mut Parser, path: &str) -> Result<LogSession> {
use std::io::Write;
let mut dot_file = std::fs::File::create(path)?;
@ -46,11 +47,13 @@ pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result<LogSession
.stdin(Stdio::piped())
.stdout(dot_file)
.spawn()
.expect("Failed to run Dot");
.map_err(Error::wrap(|| {
"Failed to run the `dot` command. Check that graphviz is installed."
}))?;
let dot_stdin = dot_process
.stdin
.take()
.expect("Failed to open stdin for Dot");
.ok_or_else(|| Error::new("Failed to open stdin for `dot` process.".to_string()))?;
parser.print_dot_graphs(&dot_stdin);
Ok(LogSession(
PathBuf::from(path),

View file

@ -13,7 +13,7 @@ Developing Tree-sitter grammars can have a difficult learning curve, but once yo
In order to develop a Tree-sitter parser, there are two dependencies that you need to install:
* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have.
* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. You'll need Node.js version 6.0 or greater.
* **A C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform.
### Installation
@ -505,6 +505,8 @@ Grammars often contain multiple tokens that can match the same characters. For e
4. **Match Specificity** - If there are two valid tokens with the same precedence and which both match the same number of characters, Tree-sitter will prefer a token that is specified in the grammar as a `String` over a token specified as a `RegExp`.
5. **Rule Order** - If none of the above criteria can be used to select one token over another, Tree-sitter will prefer the token that appears earlier in the grammar.
### Keywords
Many languages have a set of *keyword* tokens (e.g. `if`, `for`, `return`), as well as a more general token (e.g. `identifier`) that matches any word, including many of the keyword strings. For example, JavaScript has a keyword `instanceof`, which is used as a binary operator, like this:

View file

@ -385,6 +385,14 @@ The following query would specify that the contents of the heredoc should be par
(heredoc_end) @injection.language) @injection.content
```
You can also force the language using the `#set!` predicate.
For example, this will force the language to be always `ruby`.
```
((heredoc_body) @injection.content
(#set! injection.language "ruby"))
```
## Unit Testing
Tree-sitter has a built-in way to verify the results of syntax highlighting. The interface is based on [Sublime Text's system](https://www.sublimetext.com/docs/3/syntax.html#testing) for testing highlighting.

View file

@ -10,6 +10,8 @@ use tree_sitter::{
};
const CANCELLATION_CHECK_INTERVAL: usize = 100;
const BUFFER_HTML_RESERVE_CAPACITY: usize = 10 * 1024;
const BUFFER_LINES_RESERVE_CAPACITY: usize = 1000;
/// Indicates which highlight should be applied to a region of source code.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@ -620,7 +622,7 @@ where
type Item = Result<HighlightEvent, Error>;
fn next(&mut self) -> Option<Self::Item> {
loop {
'main: loop {
// If we've already determined the next highlight boundary, just return it.
if let Some(e) = self.next_event.take() {
return Some(Ok(e));
@ -640,29 +642,34 @@ where
// If none of the layers have any more highlight boundaries, terminate.
if self.layers.is_empty() {
if self.byte_offset < self.source.len() {
return if self.byte_offset < self.source.len() {
let result = Some(Ok(HighlightEvent::Source {
start: self.byte_offset,
end: self.source.len(),
}));
self.byte_offset = self.source.len();
return result;
result
} else {
return None;
}
None
};
}
// Get the next capture from whichever layer has the earliest highlight boundary.
let match_;
let mut captures;
let mut capture;
let mut pattern_index;
let range;
let layer = &mut self.layers[0];
if let Some((m, capture_index)) = layer.captures.peek() {
match_ = m;
captures = match_.captures;
pattern_index = match_.pattern_index;
capture = captures[*capture_index];
if let Some((next_match, capture_index)) = layer.captures.peek() {
let next_capture = next_match.captures[*capture_index];
range = next_capture.node.byte_range();
// If any previous highlight ends before this node starts, then before
// processing this capture, emit the source code up until the end of the
// previous highlight, and an end event for that highlight.
if let Some(end_byte) = layer.highlight_end_stack.last().cloned() {
if end_byte <= range.start {
layer.highlight_end_stack.pop();
return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd));
}
}
}
// If there are no more captures, then emit any remaining highlight end events.
// And if there are none of those, then just advance to the end of the document.
@ -673,30 +680,17 @@ where
return self.emit_event(self.source.len(), None);
};
// If any previous highlight ends before this node starts, then before
// processing this capture, emit the source code up until the end of the
// previous highlight, and an end event for that highlight.
let range = capture.node.byte_range();
if let Some(end_byte) = layer.highlight_end_stack.last().cloned() {
if end_byte <= range.start {
layer.highlight_end_stack.pop();
return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd));
}
}
// Remove from the local scope stack any local scopes that have already ended.
while range.start > layer.scope_stack.last().unwrap().range.end {
layer.scope_stack.pop();
}
let (mut match_, capture_index) = layer.captures.next().unwrap();
let mut capture = match_.captures[capture_index];
// If this capture represents an injection, then process the injection.
if pattern_index < layer.config.locals_pattern_index {
if match_.pattern_index < layer.config.locals_pattern_index {
let (language_name, content_node, include_children) =
injection_for_match(&layer.config, &layer.config.query, match_, &self.source);
injection_for_match(&layer.config, &layer.config.query, &match_, &self.source);
// Explicitly remove this match so that none of its other captures will remain
// in the stream of captures. The `unwrap` is ok because
layer.captures.next().unwrap().0.remove();
// in the stream of captures.
match_.remove();
// If a language is found with the given name, then add a new language layer
// to the highlighted document.
@ -729,16 +723,19 @@ where
}
self.sort_layers();
continue;
continue 'main;
}
layer.captures.next();
// Remove from the local scope stack any local scopes that have already ended.
while range.start > layer.scope_stack.last().unwrap().range.end {
layer.scope_stack.pop();
}
// If this capture is for tracking local variables, then process the
// local variable info.
let mut reference_highlight = None;
let mut definition_highlight = None;
while pattern_index < layer.config.highlights_pattern_index {
while match_.pattern_index < layer.config.highlights_pattern_index {
// If the node represents a local scope, push a new local scope onto
// the scope stack.
if Some(capture.index) == layer.config.local_scope_capture_index {
@ -748,7 +745,7 @@ where
range: range.clone(),
local_defs: Vec::new(),
};
for prop in layer.config.query.property_settings(pattern_index) {
for prop in layer.config.query.property_settings(match_.pattern_index) {
match prop.key.as_ref() {
"local.scope-inherits" => {
scope.inherits =
@ -767,7 +764,7 @@ where
let scope = layer.scope_stack.last_mut().unwrap();
let mut value_range = 0..0;
for capture in captures {
for capture in match_.captures {
if Some(capture.index) == layer.config.local_def_value_capture_index {
value_range = capture.node.byte_range();
}
@ -810,84 +807,76 @@ where
}
}
// Continue processing any additional local-variable-tracking patterns
// for the same node.
// Continue processing any additional matches for the same node.
if let Some((next_match, next_capture_index)) = layer.captures.peek() {
let next_capture = next_match.captures[*next_capture_index];
if next_capture.node == capture.node {
pattern_index = next_match.pattern_index;
captures = next_match.captures;
capture = next_capture;
layer.captures.next();
match_ = layer.captures.next().unwrap().0;
continue;
} else {
break;
}
}
break;
self.sort_layers();
continue 'main;
}
// Otherwise, this capture must represent a highlight.
let mut has_highlight = true;
// If this exact range has already been highlighted by an earlier pattern, or by
// a different layer, then skip over this one.
if let Some((last_start, last_end, last_depth)) = self.last_highlight_range {
if range.start == last_start && range.end == last_end && layer.depth < last_depth {
has_highlight = false;
self.sort_layers();
continue 'main;
}
}
// If the current node was found to be a local variable, then skip over any
// highlighting patterns that are disabled for local variables.
while has_highlight
&& (definition_highlight.is_some() || reference_highlight.is_some())
&& layer.config.non_local_variable_patterns[pattern_index]
{
has_highlight = false;
if let Some((next_match, next_capture_index)) = layer.captures.peek() {
let next_capture = next_match.captures[*next_capture_index];
if next_capture.node == capture.node {
capture = next_capture;
has_highlight = true;
pattern_index = next_match.pattern_index;
layer.captures.next();
continue;
if definition_highlight.is_some() || reference_highlight.is_some() {
while layer.config.non_local_variable_patterns[match_.pattern_index] {
if let Some((next_match, next_capture_index)) = layer.captures.peek() {
let next_capture = next_match.captures[*next_capture_index];
if next_capture.node == capture.node {
capture = next_capture;
match_ = layer.captures.next().unwrap().0;
continue;
}
}
self.sort_layers();
continue 'main;
}
break;
}
if has_highlight {
// Once a highlighting pattern is found for the current node, skip over
// any later highlighting patterns that also match this node. Captures
// for a given node are ordered by pattern index, so these subsequent
// captures are guaranteed to be for highlighting, not injections or
// local variables.
while let Some((next_match, next_capture_index)) = layer.captures.peek() {
if next_match.captures[*next_capture_index].node == capture.node {
layer.captures.next();
} else {
break;
}
// Once a highlighting pattern is found for the current node, skip over
// any later highlighting patterns that also match this node. Captures
// for a given node are ordered by pattern index, so these subsequent
// captures are guaranteed to be for highlighting, not injections or
// local variables.
while let Some((next_match, next_capture_index)) = layer.captures.peek() {
let next_capture = next_match.captures[*next_capture_index];
if next_capture.node == capture.node {
layer.captures.next();
} else {
break;
}
}
let current_highlight = layer.config.highlight_indices[capture.index as usize];
let current_highlight = layer.config.highlight_indices[capture.index as usize];
// If this node represents a local definition, then store the current
// highlight value on the local scope entry representing this node.
if let Some(definition_highlight) = definition_highlight {
*definition_highlight = current_highlight;
}
// If this node represents a local definition, then store the current
// highlight value on the local scope entry representing this node.
if let Some(definition_highlight) = definition_highlight {
*definition_highlight = current_highlight;
}
// Emit a scope start event and push the node's end position to the stack.
if let Some(highlight) = reference_highlight.or(current_highlight) {
self.last_highlight_range = Some((range.start, range.end, layer.depth));
layer.highlight_end_stack.push(range.end);
return self
.emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight)));
}
// Emit a scope start event and push the node's end position to the stack.
if let Some(highlight) = reference_highlight.or(current_highlight) {
self.last_highlight_range = Some((range.start, range.end, layer.depth));
layer.highlight_end_stack.push(range.end);
return self
.emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight)));
}
self.sort_layers();
@ -897,11 +886,13 @@ where
impl HtmlRenderer {
pub fn new() -> Self {
HtmlRenderer {
html: Vec::new(),
line_offsets: vec![0],
let mut result = HtmlRenderer {
html: Vec::with_capacity(BUFFER_HTML_RESERVE_CAPACITY),
line_offsets: Vec::with_capacity(BUFFER_LINES_RESERVE_CAPACITY),
carriage_return_highlight: None,
}
};
result.line_offsets.push(0);
result
}
pub fn set_carriage_return_highlight(&mut self, highlight: Option<Highlight>) {
@ -909,8 +900,8 @@ impl HtmlRenderer {
}
pub fn reset(&mut self) {
self.html.clear();
self.line_offsets.clear();
shrink_and_clear(&mut self.html, BUFFER_HTML_RESERVE_CAPACITY);
shrink_and_clear(&mut self.line_offsets, BUFFER_LINES_RESERVE_CAPACITY);
self.line_offsets.push(0);
}
@ -1074,3 +1065,11 @@ fn injection_for_match<'a>(
(language_name, content_node, include_children)
}
fn shrink_and_clear<T>(vec: &mut Vec<T>, capacity: usize) {
if vec.len() > capacity {
vec.truncate(capacity);
vec.shrink_to_fit();
}
vec.clear();
}

View file

@ -170,7 +170,7 @@ pub enum QueryError {
enum TextPredicate {
CaptureEqString(u32, String, bool),
CaptureEqCapture(u32, u32, bool),
CaptureMatchString(u32, regex::bytes::Regex),
CaptureMatchString(u32, regex::bytes::Regex, bool),
}
impl Language {
@ -1314,7 +1314,7 @@ impl Query {
});
}
"match?" => {
"match?" | "not-match?" => {
if p.len() != 3 {
return Err(QueryError::Predicate(format!(
"Wrong number of arguments to #match? predicate. Expected 2, got {}.",
@ -1334,12 +1334,14 @@ impl Query {
)));
}
let is_positive = operator_name == "match?";
let regex = &string_values[p[2].value_id as usize];
text_predicates.push(TextPredicate::CaptureMatchString(
p[1].value_id,
regex::bytes::Regex::new(regex).map_err(|_| {
QueryError::Predicate(format!("Invalid regex '{}'", regex))
})?,
is_positive,
));
}
@ -1631,9 +1633,9 @@ impl<'a> QueryMatch<'a> {
let node = self.capture_for_index(*i).unwrap();
(text_callback(node).as_ref() == s.as_bytes()) == *is_positive
}
TextPredicate::CaptureMatchString(i, r) => {
TextPredicate::CaptureMatchString(i, r, is_positive) => {
let node = self.capture_for_index(*i).unwrap();
r.is_match(text_callback(node).as_ref())
r.is_match(text_callback(node).as_ref()) == *is_positive
}
})
}

View file

@ -787,6 +787,8 @@ class Language {
}
break;
case 'not-match?':
isPositive = false;
case 'match?':
if (steps.length !== 3) throw new Error(
`Wrong number of arguments to \`#match?\` predicate. Expected 2, got ${steps.length - 1}.`
@ -801,7 +803,7 @@ class Language {
const regex = new RegExp(steps[2].value);
textPredicates[i].push(function(captures) {
for (const c of captures) {
if (c.name === captureName) return regex.test(c.node.text);
if (c.name === captureName) return regex.test(c.node.text) === isPositive;
}
return false;
});

View file

@ -126,12 +126,17 @@ describe("Query", () => {
it("handles conditions that compare the text of capture to literal strings", () => {
tree = parser.parse(`
lambda
panda
load
toad
const ab = require('./ab');
new Cd(EF);
`);
query = JavaScript.query(`
(identifier) @variable
((identifier) @variable
(#not-match? @variable "^(lambda|load)$"))
((identifier) @function.builtin
(#eq? @function.builtin "require"))
@ -145,6 +150,8 @@ describe("Query", () => {
const captures = query.captures(tree.rootNode);
assert.deepEqual(formatCaptures(captures), [
{ name: "variable", text: "panda" },
{ name: "variable", text: "toad" },
{ name: "variable", text: "ab" },
{ name: "variable", text: "require" },
{ name: "function.builtin", text: "require" },

View file

@ -45,7 +45,7 @@ static inline bool ts_toggle_allocation_recording(bool value) {
static inline void *ts_malloc(size_t size) {
void *result = malloc(size);
if (size > 0 && !result) {
fprintf(stderr, "tree-sitter failed to allocate %lu bytes", size);
fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size);
exit(1);
}
return result;
@ -54,7 +54,7 @@ static inline void *ts_malloc(size_t size) {
static inline void *ts_calloc(size_t count, size_t size) {
void *result = calloc(count, size);
if (count > 0 && !result) {
fprintf(stderr, "tree-sitter failed to allocate %lu bytes", count * size);
fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size);
exit(1);
}
return result;
@ -63,7 +63,7 @@ static inline void *ts_calloc(size_t count, size_t size) {
static inline void *ts_realloc(void *buffer, size_t size) {
void *result = realloc(buffer, size);
if (size > 0 && !result) {
fprintf(stderr, "tree-sitter failed to reallocate %lu bytes", size);
fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size);
exit(1);
}
return result;

View file

@ -355,10 +355,14 @@ static Subtree ts_parser__lex(
StackVersion version,
TSStateId parse_state
) {
TSLexMode lex_mode = self->language->lex_modes[parse_state];
if (lex_mode.lex_state == (uint16_t)-1) {
LOG("no_lookahead_after_non_terminal_extra");
return NULL_SUBTREE;
}
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
@ -761,20 +765,26 @@ static StackVersion ts_parser__reduce(
int dynamic_precedence,
uint16_t production_id,
bool is_fragile,
bool is_extra
bool end_of_non_terminal_extra
) {
uint32_t initial_version_count = ts_stack_version_count(self->stack);
uint32_t removed_version_count = 0;
StackSliceArray pop = ts_stack_pop_count(self->stack, version, count);
// Pop the given number of nodes from the given version of the parse stack.
// If stack versions have previously merged, then there may be more than one
// path back through the stack. For each path, create a new parent node to
// contain the popped children, and push it onto the stack in place of the
// children.
StackSliceArray pop = ts_stack_pop_count(self->stack, version, count);
uint32_t removed_version_count = 0;
for (uint32_t i = 0; i < pop.size; i++) {
StackSlice slice = pop.contents[i];
StackVersion slice_version = slice.version - removed_version_count;
// Error recovery can sometimes cause lots of stack versions to merge,
// such that a single pop operation can produce a lots of slices.
// Avoid creating too many stack versions in that situation.
if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) {
// This is where new versions are added to the parse stack. The versions
// will all be sorted and truncated at the end of the outer parsing loop.
// Allow the maximum version count to be temporarily exceeded, but only
// by a limited threshold.
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) {
ts_stack_remove_version(self->stack, slice_version);
ts_subtree_array_delete(&self->tree_pool, &slice.subtrees);
removed_version_count++;
@ -826,7 +836,9 @@ static StackVersion ts_parser__reduce(
TSStateId state = ts_stack_state(self->stack, slice_version);
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
if (is_extra) parent.ptr->extra = true;
if (end_of_non_terminal_extra && next_state == state) {
parent.ptr->extra = true;
}
if (is_fragile || pop.size > 1 || initial_version_count > 1) {
parent.ptr->fragile_left = true;
parent.ptr->fragile_right = true;
@ -1339,24 +1351,26 @@ static bool ts_parser__advance(
);
}
lex:
// Otherwise, re-run the lexer.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
if (lookahead.ptr) {
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
}
// When parsing a non-terminal extra, a null lookahead indicates the
// end of the rule. The reduction is stored in the EOF table entry.
// After the reduction, the lexer needs to be run again.
else {
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
}
}
bool needs_lex = !lookahead.ptr;
for (;;) {
// Otherwise, re-run the lexer.
if (needs_lex) {
needs_lex = false;
lookahead = ts_parser__lex(self, version, state);
if (lookahead.ptr) {
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
}
// When parsing a non-terminal extra, a null lookahead indicates the
// end of the rule. The reduction is stored in the EOF table entry.
// After the reduction, the lexer needs to be run again.
else {
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
}
}
// If a cancellation flag or a timeout was provided, then check every
// time a fixed number of parse actions has been processed.
if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) {
@ -1408,12 +1422,12 @@ lex:
case TSParseActionTypeReduce: {
bool is_fragile = table_entry.action_count > 1;
bool is_extra = lookahead.ptr == NULL;
bool end_of_non_terminal_extra = lookahead.ptr == NULL;
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count);
StackVersion reduction_version = ts_parser__reduce(
self, version, action.params.reduce.symbol, action.params.reduce.child_count,
action.params.reduce.dynamic_precedence, action.params.reduce.production_id,
is_fragile, is_extra
is_fragile, end_of_non_terminal_extra
);
if (reduction_version != STACK_VERSION_NONE) {
last_reduction_version = reduction_version;
@ -1453,8 +1467,10 @@ lex:
// (and completing the non-terminal extra rule) run the lexer again based
// on the current parse state.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
needs_lex = true;
continue;
}
ts_language_table_entry(
self->language,
state,
@ -1464,6 +1480,11 @@ lex:
continue;
}
if (!lookahead.ptr) {
ts_stack_pause(self->stack, version, ts_builtin_sym_end);
return true;
}
// If there were no parse actions for the current lookahead token, then
// it is not valid in this state. If the current lookahead token is a
// keyword, then switch to treating it as the normal word token if that
@ -1503,8 +1524,7 @@ lex:
if (ts_parser__breakdown_top_of_stack(self, version)) {
state = ts_stack_state(self->stack, version);
ts_subtree_release(&self->tree_pool, lookahead);
lookahead = NULL_SUBTREE;
goto lex;
needs_lex = true;
continue;
}

View file

@ -11,7 +11,6 @@
// #define LOG(...) fprintf(stderr, __VA_ARGS__)
#define LOG(...)
#define MAX_STATE_COUNT 256
#define MAX_CAPTURE_LIST_COUNT 32
#define MAX_STEP_CAPTURE_COUNT 3
#define MAX_STATE_PREDECESSOR_COUNT 100
@ -51,7 +50,6 @@ typedef struct {
uint16_t alternative_index;
uint16_t depth;
bool contains_captures: 1;
bool is_pattern_start: 1;
bool is_immediate: 1;
bool is_last_child: 1;
bool is_pass_through: 1;
@ -128,9 +126,10 @@ typedef struct {
uint16_t step_index;
uint16_t pattern_index;
uint16_t capture_list_id;
uint16_t consumed_capture_count: 14;
uint16_t consumed_capture_count: 12;
bool seeking_immediate_match: 1;
bool has_in_progress_alternatives: 1;
bool dead: 1;
} QueryState;
typedef Array(TSQueryCapture) CaptureList;
@ -224,6 +223,7 @@ struct TSQueryCursor {
TSPoint start_point;
TSPoint end_point;
bool ascending;
bool halted;
};
static const TSQueryError PARENT_DONE = -1;
@ -500,7 +500,6 @@ static QueryStep query_step__new(
.alternative_index = NONE,
.contains_captures = false,
.is_last_child = false,
.is_pattern_start = false,
.is_pass_through = false,
.is_dead_end = false,
.is_definite = false,
@ -692,6 +691,23 @@ static inline void ts_query__pattern_map_insert(
) {
uint32_t index;
ts_query__pattern_map_search(self, symbol, &index);
// Ensure that the entries are sorted not only by symbol, but also
// by pattern_index. This way, states for earlier patterns will be
// initiated first, which allows the ordering of the states array
// to be maintained more efficiently.
while (index < self->pattern_map.size) {
PatternEntry *entry = &self->pattern_map.contents[index];
if (
self->steps.contents[entry->step_index].symbol == symbol &&
entry->pattern_index < pattern_index
) {
index++;
} else {
break;
}
}
array_insert(&self->pattern_map, index, ((PatternEntry) {
.step_index = start_step_index,
.pattern_index = pattern_index,
@ -1438,8 +1454,8 @@ static TSQueryError ts_query__parse_pattern(
}
}
// A pound character indicates the start of a predicate.
else if (stream->next == '#') {
// A dot/pound character indicates the start of a predicate.
else if (stream->next == '.' || stream->next == '#') {
stream_advance(stream);
return ts_query__parse_predicate(self, stream);
}
@ -1796,7 +1812,6 @@ TSQuery *ts_query_new(
// Maintain a map that can look up patterns for a given root symbol.
for (;;) {
QueryStep *step = &self->steps.contents[start_step_index];
step->is_pattern_start = true;
ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index);
if (step->symbol == WILDCARD_SYMBOL) {
self->wildcard_root_pattern_count++;
@ -1806,6 +1821,7 @@ TSQuery *ts_query_new(
// then add multiple entries to the pattern map.
if (step->alternative_index != NONE) {
start_step_index = step->alternative_index;
step->alternative_index = NONE;
} else {
break;
}
@ -1944,6 +1960,7 @@ TSQueryCursor *ts_query_cursor_new(void) {
TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
*self = (TSQueryCursor) {
.ascending = false,
.halted = false,
.states = array_new(),
.finished_states = array_new(),
.capture_list_pool = capture_list_pool_new(),
@ -1952,8 +1969,8 @@ TSQueryCursor *ts_query_cursor_new(void) {
.start_point = {0, 0},
.end_point = POINT_MAX,
};
array_reserve(&self->states, MAX_STATE_COUNT);
array_reserve(&self->finished_states, MAX_CAPTURE_LIST_COUNT);
array_reserve(&self->states, 8);
array_reserve(&self->finished_states, 8);
return self;
}
@ -1977,6 +1994,7 @@ void ts_query_cursor_exec(
self->next_state_id = 0;
self->depth = 0;
self->ascending = false;
self->halted = false;
self->query = query;
}
@ -2020,6 +2038,7 @@ static bool ts_query_cursor__first_in_progress_capture(
*pattern_index = UINT32_MAX;
for (unsigned i = 0; i < self->states.size; i++) {
const QueryState *state = &self->states.contents[i];
if (state->dead) continue;
const CaptureList *captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
@ -2114,65 +2133,138 @@ void ts_query_cursor__compare_captures(
}
}
static bool ts_query_cursor__add_state(
static void ts_query_cursor__add_state(
TSQueryCursor *self,
const PatternEntry *pattern
) {
if (self->states.size >= MAX_STATE_COUNT) {
LOG(" too many states");
return false;
QueryStep *step = &self->query->steps.contents[pattern->step_index];
uint32_t start_depth = self->depth - step->depth;
// Keep the states array in ascending order of start_depth and pattern_index,
// so that it can be processed more efficiently elsewhere. Usually, there is
// no work to do here because of two facts:
// * States with lower start_depth are naturally added first due to the
// order in which nodes are visited.
// * Earlier patterns are naturally added first because of the ordering of the
// pattern_map data structure that's used to initiate matches.
//
// This loop is only needed in cases where two conditions hold:
// * A pattern consists of more than one sibling node, so that its states
// remain in progress after exiting the node that started the match.
// * The first node in the pattern matches against multiple nodes at the
// same depth.
//
// An example of this is the pattern '((comment)* (function))'. If multiple
// `comment` nodes appear in a row, then we may initiate a new state for this
// pattern while another state for the same pattern is already in progress.
// If there are multiple patterns like this in a query, then this loop will
// need to execute in order to keep the states ordered by pattern_index.
uint32_t index = self->states.size;
while (index > 0) {
QueryState *prev_state = &self->states.contents[index - 1];
if (prev_state->start_depth < start_depth) break;
if (prev_state->start_depth == start_depth) {
if (prev_state->pattern_index < pattern->pattern_index) break;
if (prev_state->pattern_index == pattern->pattern_index) {
// Avoid unnecessarily inserting an unnecessary duplicate state,
// which would be immediately pruned by the longest-match criteria.
if (prev_state->step_index == pattern->step_index) return;
}
}
index--;
}
LOG(
" start state. pattern:%u, step:%u\n",
pattern->pattern_index,
pattern->step_index
);
QueryStep *step = &self->query->steps.contents[pattern->step_index];
array_push(&self->states, ((QueryState) {
array_insert(&self->states, index, ((QueryState) {
.capture_list_id = NONE,
.step_index = pattern->step_index,
.pattern_index = pattern->pattern_index,
.start_depth = self->depth - step->depth,
.start_depth = start_depth,
.consumed_capture_count = 0,
.seeking_immediate_match = false,
.seeking_immediate_match = true,
.has_in_progress_alternatives = false,
.dead = false,
}));
return true;
}
// Acquire a capture list for this state. If there are no capture lists left in the
// pool, this will steal the capture list from another existing state, and mark that
// other state as 'dead'.
static CaptureList *ts_query_cursor__prepare_to_capture(
TSQueryCursor *self,
QueryState *state,
unsigned state_index_to_preserve
) {
if (state->capture_list_id == NONE) {
state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
// If there are no capture lists left in the pool, then terminate whichever
// state has captured the earliest node in the document, and steal its
// capture list.
if (state->capture_list_id == NONE) {
uint32_t state_index, byte_offset, pattern_index;
if (
ts_query_cursor__first_in_progress_capture(
self,
&state_index,
&byte_offset,
&pattern_index
) &&
state_index != state_index_to_preserve
) {
LOG(
" abandon state. index:%u, pattern:%u, offset:%u.\n",
state_index, pattern_index, byte_offset
);
QueryState *other_state = &self->states.contents[state_index];
state->capture_list_id = other_state->capture_list_id;
other_state->capture_list_id = NONE;
other_state->dead = true;
CaptureList *list = capture_list_pool_get_mut(
&self->capture_list_pool,
state->capture_list_id
);
array_clear(list);
return list;
} else {
LOG(" ran out of capture lists");
return NULL;
}
}
}
return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id);
}
// Duplicate the given state and insert the newly-created state immediately after
// the given state in the `states` array.
static QueryState *ts_query__cursor_copy_state(
// the given state in the `states` array. Ensures that the given state reference is
// still valid, even if the states array is reallocated.
static QueryState *ts_query_cursor__copy_state(
TSQueryCursor *self,
const QueryState *state
QueryState **state_ref
) {
if (self->states.size >= MAX_STATE_COUNT) {
LOG(" too many states");
return NULL;
}
const QueryState *state = *state_ref;
uint32_t state_index = state - self->states.contents;
QueryState copy = *state;
copy.capture_list_id = NONE;
// If the state has captures, copy its capture list.
QueryState copy = *state;
copy.capture_list_id = state->capture_list_id;
if (state->capture_list_id != NONE) {
copy.capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
if (copy.capture_list_id == NONE) {
LOG(" too many capture lists");
return NULL;
}
CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, &copy, state_index);
if (!new_captures) return NULL;
const CaptureList *old_captures = capture_list_pool_get(
&self->capture_list_pool,
state->capture_list_id
);
CaptureList *new_captures = capture_list_pool_get_mut(
&self->capture_list_pool,
copy.capture_list_id
);
array_push_all(new_captures, old_captures);
}
uint32_t index = (state - self->states.contents) + 1;
array_insert(&self->states, index, copy);
return &self->states.contents[index];
array_insert(&self->states, state_index + 1, copy);
*state_ref = &self->states.contents[state_index];
return &self->states.contents[state_index + 1];
}
// Walk the tree, processing patterns until at least one pattern finishes,
@ -2180,18 +2272,30 @@ static QueryState *ts_query__cursor_copy_state(
// `finished_states` array. Multiple patterns can finish on the same node. If
// there are no more matches, return `false`.
static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
do {
bool did_match = false;
for (;;) {
if (self->halted) {
while (self->states.size > 0) {
QueryState state = array_pop(&self->states);
capture_list_pool_release(
&self->capture_list_pool,
state.capture_list_id
);
}
}
if (did_match || self->halted) return did_match;
if (self->ascending) {
LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor)));
// Leave this node by stepping to its next sibling or to its parent.
bool did_move = true;
if (ts_tree_cursor_goto_next_sibling(&self->cursor)) {
self->ascending = false;
} else if (ts_tree_cursor_goto_parent(&self->cursor)) {
self->depth--;
} else {
did_move = false;
self->halted = true;
}
// After leaving a node, remove any states that cannot make further progress.
@ -2203,10 +2307,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If a state completed its pattern inside of this node, but was deferred from finishing
// in order to search for longer matches, mark it as finished.
if (step->depth == PATTERN_DONE_MARKER) {
if (state->start_depth > self->depth || !did_move) {
if (state->start_depth > self->depth || self->halted) {
LOG(" finish pattern %u\n", state->pattern_index);
state->id = self->next_state_id++;
array_push(&self->finished_states, *state);
did_match = true;
deleted_count++;
continue;
}
@ -2233,10 +2338,6 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
}
}
self->states.size -= deleted_count;
if (!did_move) {
return self->finished_states.size > 0;
}
} else {
// If this node is before the selected range, then avoid descending into it.
TSNode node = ts_tree_cursor_current_node(&self->cursor);
@ -2254,7 +2355,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
if (
self->end_byte <= ts_node_start_byte(node) ||
point_lte(self->end_point, ts_node_start_point(node))
) return false;
) {
self->halted = true;
continue;
}
// Get the properties of the current node.
TSSymbol symbol = ts_node_symbol(node);
@ -2286,7 +2390,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If this node matches the first step of the pattern, then add a new
// state at the start of this pattern.
if (step->field && field_id != step->field) continue;
if (!ts_query_cursor__add_state(self, pattern)) break;
ts_query_cursor__add_state(self, pattern);
}
// Add new states for any patterns whose root node matches this node.
@ -2298,7 +2402,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// If this node matches the first step of the pattern, then add a new
// state at the start of this pattern.
if (step->field && field_id != step->field) continue;
if (!ts_query_cursor__add_state(self, pattern)) break;
ts_query_cursor__add_state(self, pattern);
// Advance to the next pattern whose root node matches this node.
i++;
@ -2366,12 +2470,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
// parent, then this query state cannot simply be updated in place. It must be
// split into two states: one that matches this node, and one which skips over
// this node, to preserve the possibility of matching later siblings.
if (
later_sibling_can_match &&
!step->is_pattern_start &&
step->contains_captures
) {
if (ts_query__cursor_copy_state(self, state)) {
if (later_sibling_can_match && step->contains_captures) {
if (ts_query_cursor__copy_state(self, &state)) {
LOG(
" split state for capture. pattern:%u, step:%u\n",
state->pattern_index,
@ -2382,45 +2482,14 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
}
// If the current node is captured in this pattern, add it to the capture list.
// For the first capture in a pattern, lazily acquire a capture list.
if (step->capture_ids[0] != NONE) {
if (state->capture_list_id == NONE) {
state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
// If there are no capture lists left in the pool, then terminate whichever
// state has captured the earliest node in the document, and steal its
// capture list.
if (state->capture_list_id == NONE) {
uint32_t state_index, byte_offset, pattern_index;
if (ts_query_cursor__first_in_progress_capture(
self,
&state_index,
&byte_offset,
&pattern_index
)) {
LOG(
" abandon state. index:%u, pattern:%u, offset:%u.\n",
state_index, pattern_index, byte_offset
);
state->capture_list_id = self->states.contents[state_index].capture_list_id;
array_erase(&self->states, state_index);
if (state_index < i) {
i--;
state--;
}
} else {
LOG(" too many finished states.\n");
array_erase(&self->states, i);
i--;
continue;
}
}
CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX);
if (!capture_list) {
array_erase(&self->states, i);
i--;
continue;
}
CaptureList *capture_list = capture_list_pool_get_mut(
&self->capture_list_pool,
state->capture_list_id
);
for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) {
uint16_t capture_id = step->capture_ids[j];
if (step->capture_ids[j] == NONE) break;
@ -2443,10 +2512,9 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
state->step_index
);
// If this state's next step has an 'alternative' step (the step is either optional,
// or is the end of a repetition), then copy the state in order to pursue both
// alternatives. The alternative step itself may have an alternative, so this is
// an interative process.
// If this state's next step has an alternative step, then copy the state in order
// to pursue both alternatives. The alternative step itself may have an alternative,
// so this is an interative process.
unsigned end_index = i + 1;
for (unsigned j = i; j < end_index; j++) {
QueryState *state = &self->states.contents[j];
@ -2458,25 +2526,27 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
continue;
}
QueryState *copy = ts_query__cursor_copy_state(self, state);
if (next_step->is_pass_through) {
state->step_index++;
j--;
}
QueryState *copy = ts_query_cursor__copy_state(self, &state);
if (copy) {
copy_count++;
LOG(
" split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n",
copy->pattern_index,
copy->step_index,
next_step->alternative_index,
next_step->alternative_is_immediate,
capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size
);
end_index++;
copy_count++;
copy->step_index = next_step->alternative_index;
if (next_step->alternative_is_immediate) {
copy->seeking_immediate_match = true;
}
LOG(
" split state for branch. pattern:%u, step:%u, step:%u, immediate:%d\n",
copy->pattern_index,
state->step_index,
copy->step_index,
copy->seeking_immediate_match
);
}
}
}
@ -2484,59 +2554,77 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
for (unsigned i = 0; i < self->states.size; i++) {
QueryState *state = &self->states.contents[i];
bool did_remove = false;
if (state->dead) {
array_erase(&self->states, i);
i--;
continue;
}
// Enfore the longest-match criteria. When a query pattern contains optional or
// repeated nodes, this is necesssary to avoid multiple redundant states, where
// repeated nodes, this is necessary to avoid multiple redundant states, where
// one state has a strict subset of another state's captures.
bool did_remove = false;
for (unsigned j = i + 1; j < self->states.size; j++) {
QueryState *other_state = &self->states.contents[j];
// Query states are kept in ascending order of start_depth and pattern_index.
// Since the longest-match criteria is only used for deduping matches of the same
// pattern and root node, we only need to perform pairwise comparisons within a
// small slice of the states array.
if (
state->pattern_index == other_state->pattern_index &&
state->start_depth == other_state->start_depth
) {
bool left_contains_right, right_contains_left;
ts_query_cursor__compare_captures(
self,
state,
other_state,
&left_contains_right,
&right_contains_left
);
if (left_contains_right) {
if (state->step_index == other_state->step_index) {
LOG(
" drop shorter state. pattern: %u, step_index: %u\n",
state->pattern_index,
state->step_index
);
capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id);
array_erase(&self->states, j);
j--;
continue;
}
other_state->has_in_progress_alternatives = true;
other_state->start_depth != state->start_depth ||
other_state->pattern_index != state->pattern_index
) break;
bool left_contains_right, right_contains_left;
ts_query_cursor__compare_captures(
self,
state,
other_state,
&left_contains_right,
&right_contains_left
);
if (left_contains_right) {
if (state->step_index == other_state->step_index) {
LOG(
" drop shorter state. pattern: %u, step_index: %u\n",
state->pattern_index,
state->step_index
);
capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id);
array_erase(&self->states, j);
j--;
continue;
}
if (right_contains_left) {
if (state->step_index == other_state->step_index) {
LOG(
" drop shorter state. pattern: %u, step_index: %u\n",
state->pattern_index,
state->step_index
);
capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
array_erase(&self->states, i);
did_remove = true;
break;
}
state->has_in_progress_alternatives = true;
other_state->has_in_progress_alternatives = true;
}
if (right_contains_left) {
if (state->step_index == other_state->step_index) {
LOG(
" drop shorter state. pattern: %u, step_index: %u\n",
state->pattern_index,
state->step_index
);
capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
array_erase(&self->states, i);
i--;
did_remove = true;
break;
}
state->has_in_progress_alternatives = true;
}
}
// If there the state is at the end of its pattern, remove it from the list
// of in-progress states and add it to the list of finished states.
if (!did_remove) {
LOG(
" keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n",
state->pattern_index,
state->start_depth,
state->step_index,
capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size
);
QueryStep *next_step = &self->query->steps.contents[state->step_index];
if (next_step->depth == PATTERN_DONE_MARKER) {
if (state->has_in_progress_alternatives) {
@ -2546,6 +2634,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
state->id = self->next_state_id++;
array_push(&self->finished_states, *state);
array_erase(&self->states, state - self->states.contents);
did_match = true;
i--;
}
}
@ -2559,9 +2648,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
self->ascending = true;
}
}
} while (self->finished_states.size == 0);
return true;
}
}
bool ts_query_cursor_next_match(
@ -2701,7 +2788,10 @@ bool ts_query_cursor_next_capture(
// If there are no finished matches that are ready to be returned, then
// continue finding more matches.
if (!ts_query_cursor__advance(self)) return false;
if (
!ts_query_cursor__advance(self) &&
self->finished_states.size == 0
) return false;
}
}

View file

@ -16,18 +16,10 @@ typedef enum {
TSTagsInvalidUtf8,
TSTagsInvalidRegex,
TSTagsInvalidQuery,
TSTagsInvalidCapture,
} TSTagsError;
typedef enum {
TSTagKindFunction,
TSTagKindMethod,
TSTagKindClass,
TSTagKindModule,
TSTagKindCall,
} TSTagKind;
typedef struct {
TSTagKind kind;
uint32_t start_byte;
uint32_t end_byte;
uint32_t name_start_byte;
@ -36,8 +28,12 @@ typedef struct {
uint32_t line_end_byte;
TSPoint start_point;
TSPoint end_point;
uint32_t utf16_start_column;
uint32_t utf16_end_column;
uint32_t docs_start_byte;
uint32_t docs_end_byte;
uint32_t syntax_type_id;
bool is_definition;
} TSTag;
typedef struct TSTagger TSTagger;
@ -89,6 +85,12 @@ uint32_t ts_tags_buffer_tags_len(const TSTagsBuffer *);
const char *ts_tags_buffer_docs(const TSTagsBuffer *);
uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *);
// Get the syntax kinds for a scope.
const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len);
// Determine whether a parse error was encountered while tagging.
bool ts_tags_buffer_found_parse_error(const TSTagsBuffer*);
#ifdef __cplusplus
}
#endif

View file

@ -1,4 +1,4 @@
use super::{Error, TagKind, TagsConfiguration, TagsContext};
use super::{Error, TagsConfiguration, TagsContext};
use std::collections::HashMap;
use std::ffi::CStr;
use std::process::abort;
@ -6,6 +6,9 @@ use std::sync::atomic::AtomicUsize;
use std::{fmt, slice, str};
use tree_sitter::Language;
const BUFFER_TAGS_RESERVE_CAPACITY: usize = 100;
const BUFFER_DOCS_RESERVE_CAPACITY: usize = 1024;
#[repr(C)]
#[derive(Debug, PartialEq, Eq)]
pub enum TSTagsError {
@ -16,19 +19,10 @@ pub enum TSTagsError {
InvalidUtf8,
InvalidRegex,
InvalidQuery,
InvalidCapture,
Unknown,
}
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TSTagKind {
Function,
Method,
Class,
Module,
Call,
}
#[repr(C)]
pub struct TSPoint {
row: u32,
@ -37,7 +31,6 @@ pub struct TSPoint {
#[repr(C)]
pub struct TSTag {
pub kind: TSTagKind,
pub start_byte: u32,
pub end_byte: u32,
pub name_start_byte: u32,
@ -46,8 +39,12 @@ pub struct TSTag {
pub line_end_byte: u32,
pub start_point: TSPoint,
pub end_point: TSPoint,
pub utf16_start_colum: u32,
pub utf16_end_colum: u32,
pub docs_start_byte: u32,
pub docs_end_byte: u32,
pub syntax_type_id: u32,
pub is_definition: bool,
}
pub struct TSTagger {
@ -58,6 +55,7 @@ pub struct TSTagsBuffer {
context: TagsContext,
tags: Vec<TSTag>,
docs: Vec<u8>,
errors_present: bool,
}
#[no_mangle]
@ -102,7 +100,9 @@ pub extern "C" fn ts_tagger_add_language(
}
Err(Error::Query(_)) => TSTagsError::InvalidQuery,
Err(Error::Regex(_)) => TSTagsError::InvalidRegex,
Err(_) => TSTagsError::Unknown,
Err(Error::Cancelled) => TSTagsError::Timeout,
Err(Error::InvalidLanguage) => TSTagsError::InvalidLanguage,
Err(Error::InvalidCapture(_)) => TSTagsError::InvalidCapture,
}
}
@ -120,8 +120,9 @@ pub extern "C" fn ts_tagger_tag(
let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) };
if let Some(config) = tagger.languages.get(scope_name) {
buffer.tags.clear();
buffer.docs.clear();
shrink_and_clear(&mut buffer.tags, BUFFER_TAGS_RESERVE_CAPACITY);
shrink_and_clear(&mut buffer.docs, BUFFER_DOCS_RESERVE_CAPACITY);
let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) };
let cancellation_flag = unsafe { cancellation_flag.as_ref() };
@ -129,7 +130,10 @@ pub extern "C" fn ts_tagger_tag(
.context
.generate_tags(config, source_code, cancellation_flag)
{
Ok(tags) => tags,
Ok((tags, found_error)) => {
buffer.errors_present = found_error;
tags
}
Err(e) => {
return match e {
Error::InvalidLanguage => TSTagsError::InvalidLanguage,
@ -153,13 +157,6 @@ pub extern "C" fn ts_tagger_tag(
buffer.docs.extend_from_slice(docs.as_bytes());
}
buffer.tags.push(TSTag {
kind: match tag.kind {
TagKind::Function => TSTagKind::Function,
TagKind::Method => TSTagKind::Method,
TagKind::Class => TSTagKind::Class,
TagKind::Module => TSTagKind::Module,
TagKind::Call => TSTagKind::Call,
},
start_byte: tag.range.start as u32,
end_byte: tag.range.end as u32,
name_start_byte: tag.name_range.start as u32,
@ -174,8 +171,12 @@ pub extern "C" fn ts_tagger_tag(
row: tag.span.end.row as u32,
column: tag.span.end.column as u32,
},
utf16_start_colum: tag.utf16_column_range.start as u32,
utf16_end_colum: tag.utf16_column_range.end as u32,
docs_start_byte: prev_docs_len as u32,
docs_end_byte: buffer.docs.len() as u32,
syntax_type_id: tag.syntax_type_id,
is_definition: tag.is_definition,
});
}
@ -189,8 +190,9 @@ pub extern "C" fn ts_tagger_tag(
pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer {
Box::into_raw(Box::new(TSTagsBuffer {
context: TagsContext::new(),
tags: Vec::with_capacity(64),
docs: Vec::with_capacity(64),
tags: Vec::with_capacity(BUFFER_TAGS_RESERVE_CAPACITY),
docs: Vec::with_capacity(BUFFER_DOCS_RESERVE_CAPACITY),
errors_present: false,
}))
}
@ -223,6 +225,30 @@ pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 {
buffer.docs.len() as u32
}
#[no_mangle]
pub extern "C" fn ts_tags_buffer_found_parse_error(this: *const TSTagsBuffer) -> bool {
let buffer = unwrap_ptr(this);
buffer.errors_present
}
#[no_mangle]
pub extern "C" fn ts_tagger_syntax_kinds_for_scope_name(
this: *mut TSTagger,
scope_name: *const i8,
len: *mut u32,
) -> *const *const i8 {
let tagger = unwrap_mut_ptr(this);
let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) };
let len = unwrap_mut_ptr(len);
*len = 0;
if let Some(config) = tagger.languages.get(scope_name) {
*len = config.c_syntax_type_names.len() as u32;
return config.c_syntax_type_names.as_ptr() as *const *const i8;
}
std::ptr::null()
}
fn unwrap_ptr<'a, T>(result: *const T) -> &'a T {
unsafe { result.as_ref() }.unwrap_or_else(|| {
eprintln!("{}:{} - pointer must not be null", file!(), line!());
@ -243,3 +269,11 @@ fn unwrap<T, E: fmt::Display>(result: Result<T, E>) -> T {
abort();
})
}
fn shrink_and_clear<T>(vec: &mut Vec<T>, capacity: usize) {
if vec.len() > capacity {
vec.truncate(capacity);
vec.shrink_to_fit();
}
vec.clear();
}

View file

@ -1,10 +1,12 @@
pub mod c_lib;
use memchr::{memchr, memrchr};
use memchr::memchr;
use regex::Regex;
use std::collections::HashMap;
use std::ffi::{CStr, CString};
use std::ops::Range;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{fmt, mem, str};
use std::{char, fmt, mem, str};
use tree_sitter::{
Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree,
};
@ -18,19 +20,24 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100;
pub struct TagsConfiguration {
pub language: Language,
pub query: Query,
call_capture_index: Option<u32>,
class_capture_index: Option<u32>,
syntax_type_names: Vec<Box<[u8]>>,
c_syntax_type_names: Vec<*const u8>,
capture_map: HashMap<u32, NamedCapture>,
doc_capture_index: Option<u32>,
function_capture_index: Option<u32>,
method_capture_index: Option<u32>,
module_capture_index: Option<u32>,
name_capture_index: Option<u32>,
ignore_capture_index: Option<u32>,
local_scope_capture_index: Option<u32>,
local_definition_capture_index: Option<u32>,
tags_pattern_index: usize,
pattern_info: Vec<PatternInfo>,
}
#[derive(Debug)]
pub struct NamedCapture {
pub syntax_type_id: u32,
pub is_definition: bool,
}
pub struct TagsContext {
parser: Parser,
cursor: QueryCursor,
@ -38,21 +45,14 @@ pub struct TagsContext {
#[derive(Debug, Clone)]
pub struct Tag {
pub kind: TagKind,
pub range: Range<usize>,
pub name_range: Range<usize>,
pub line_range: Range<usize>,
pub span: Range<Point>,
pub utf16_column_range: Range<usize>,
pub docs: Option<String>,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TagKind {
Function,
Method,
Class,
Module,
Call,
pub is_definition: bool,
pub syntax_type_id: u32,
}
#[derive(Debug, PartialEq)]
@ -61,6 +61,7 @@ pub enum Error {
Regex(regex::Error),
Cancelled,
InvalidLanguage,
InvalidCapture(String),
}
#[derive(Debug, Default)]
@ -91,6 +92,7 @@ where
matches: I,
_tree: Tree,
source: &'a [u8],
prev_line_info: Option<LineInfo>,
config: &'a TagsConfiguration,
cancellation_flag: Option<&'a AtomicUsize>,
iter_count: usize,
@ -98,6 +100,18 @@ where
scopes: Vec<LocalScope<'a>>,
}
struct LineInfo {
utf8_position: Point,
utf8_byte: usize,
utf16_column: usize,
line_range: Range<usize>,
}
struct LossyUtf8<'a> {
bytes: &'a [u8],
in_replacement: bool,
}
impl TagsConfiguration {
pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result<Self, Error> {
let query = Query::new(language, &format!("{}{}", locals_query, tags_query))?;
@ -111,31 +125,57 @@ impl TagsConfiguration {
}
}
let mut call_capture_index = None;
let mut class_capture_index = None;
let mut capture_map = HashMap::new();
let mut syntax_type_names = Vec::new();
let mut doc_capture_index = None;
let mut function_capture_index = None;
let mut method_capture_index = None;
let mut module_capture_index = None;
let mut name_capture_index = None;
let mut ignore_capture_index = None;
let mut local_scope_capture_index = None;
let mut local_definition_capture_index = None;
for (i, name) in query.capture_names().iter().enumerate() {
let index = match name.as_str() {
"call" => &mut call_capture_index,
"class" => &mut class_capture_index,
"doc" => &mut doc_capture_index,
"function" => &mut function_capture_index,
"method" => &mut method_capture_index,
"module" => &mut module_capture_index,
"name" => &mut name_capture_index,
"local.scope" => &mut local_scope_capture_index,
"local.definition" => &mut local_definition_capture_index,
_ => continue,
};
*index = Some(i as u32);
match name.as_str() {
"" => continue,
"name" => name_capture_index = Some(i as u32),
"ignore" => ignore_capture_index = Some(i as u32),
"doc" => doc_capture_index = Some(i as u32),
"local.scope" => local_scope_capture_index = Some(i as u32),
"local.definition" => local_definition_capture_index = Some(i as u32),
"local.reference" => continue,
_ => {
let mut is_definition = false;
let kind = if name.starts_with("definition.") {
is_definition = true;
name.trim_start_matches("definition.")
} else if name.starts_with("reference.") {
name.trim_start_matches("reference.")
} else {
return Err(Error::InvalidCapture(name.to_string()));
};
if let Ok(cstr) = CString::new(kind) {
let c_kind = cstr.to_bytes_with_nul().to_vec().into_boxed_slice();
let syntax_type_id = syntax_type_names
.iter()
.position(|n| n == &c_kind)
.unwrap_or_else(|| {
syntax_type_names.push(c_kind);
syntax_type_names.len() - 1
}) as u32;
capture_map.insert(
i as u32,
NamedCapture {
syntax_type_id,
is_definition,
},
);
}
}
}
}
let c_syntax_type_names = syntax_type_names.iter().map(|s| s.as_ptr()).collect();
let pattern_info = (0..query.pattern_count())
.map(|pattern_index| {
let mut info = PatternInfo::default();
@ -180,19 +220,26 @@ impl TagsConfiguration {
Ok(TagsConfiguration {
language,
query,
function_capture_index,
class_capture_index,
method_capture_index,
module_capture_index,
syntax_type_names,
c_syntax_type_names,
capture_map,
doc_capture_index,
call_capture_index,
name_capture_index,
ignore_capture_index,
tags_pattern_index,
local_scope_capture_index,
local_definition_capture_index,
pattern_info,
})
}
pub fn syntax_type_name(&self, id: u32) -> &str {
unsafe {
let cstr = CStr::from_ptr(self.syntax_type_names[id as usize].as_ptr() as *const i8)
.to_bytes();
str::from_utf8(cstr).expect("syntax type name was not valid utf-8")
}
}
}
impl TagsContext {
@ -208,7 +255,7 @@ impl TagsContext {
config: &'a TagsConfiguration,
source: &'a [u8],
cancellation_flag: Option<&'a AtomicUsize>,
) -> Result<impl Iterator<Item = Result<Tag, Error>> + 'a, Error> {
) -> Result<(impl Iterator<Item = Result<Tag, Error>> + 'a, bool), Error> {
self.parser
.set_language(config.language)
.map_err(|_| Error::InvalidLanguage)?;
@ -224,12 +271,13 @@ impl TagsContext {
.matches(&config.query, tree_ref.root_node(), move |node| {
&source[node.byte_range()]
});
Ok(TagsIter {
Ok((TagsIter {
_tree: tree,
matches,
source,
config,
cancellation_flag,
prev_line_info: None,
tag_queue: Vec::new(),
iter_count: 0,
scopes: vec![LocalScope {
@ -237,7 +285,7 @@ impl TagsContext {
inherits: false,
local_defs: Vec::new(),
}],
})
}, tree_ref.root_node().has_error()))
}
}
@ -267,7 +315,12 @@ where
if self.tag_queue.len() > 1
&& self.tag_queue[0].0.name_range.end < last_entry.0.name_range.start
{
return Some(Ok(self.tag_queue.remove(0).0));
let tag = self.tag_queue.remove(0).0;
if tag.is_ignored() {
continue;
} else {
return Some(Ok(tag));
}
}
}
@ -300,141 +353,185 @@ where
continue;
}
let mut name_range = None;
let mut name_node = None;
let mut doc_nodes = Vec::new();
let mut tag_node = None;
let mut kind = TagKind::Call;
let mut syntax_type_id = 0;
let mut is_definition = false;
let mut docs_adjacent_node = None;
let mut is_ignored = false;
for capture in mat.captures {
let index = Some(capture.index);
if index == self.config.ignore_capture_index {
is_ignored = true;
name_node = Some(capture.node);
}
if index == self.config.pattern_info[mat.pattern_index].docs_adjacent_capture {
docs_adjacent_node = Some(capture.node);
}
if index == self.config.name_capture_index {
name_range = Some(capture.node.byte_range());
name_node = Some(capture.node);
} else if index == self.config.doc_capture_index {
doc_nodes.push(capture.node);
} else if index == self.config.call_capture_index {
}
if let Some(named_capture) = self.config.capture_map.get(&capture.index) {
tag_node = Some(capture.node);
kind = TagKind::Call;
} else if index == self.config.class_capture_index {
tag_node = Some(capture.node);
kind = TagKind::Class;
} else if index == self.config.function_capture_index {
tag_node = Some(capture.node);
kind = TagKind::Function;
} else if index == self.config.method_capture_index {
tag_node = Some(capture.node);
kind = TagKind::Method;
} else if index == self.config.module_capture_index {
tag_node = Some(capture.node);
kind = TagKind::Module;
syntax_type_id = named_capture.syntax_type_id;
is_definition = named_capture.is_definition;
}
}
if let (Some(tag_node), Some(name_range)) = (tag_node, name_range) {
if pattern_info.name_must_be_non_local {
let mut is_local = false;
for scope in self.scopes.iter().rev() {
if scope.range.start <= name_range.start
&& scope.range.end >= name_range.end
{
if scope
.local_defs
.iter()
.any(|d| d.name == &self.source[name_range.clone()])
{
is_local = true;
break;
}
if !scope.inherits {
break;
}
}
}
if is_local {
if let Some(name_node) = name_node {
let name_range = name_node.byte_range();
let tag;
if let Some(tag_node) = tag_node {
if name_node.has_error() {
continue;
}
}
// If needed, filter the doc nodes based on their ranges, selecting
// only the slice that are adjacent to some specified node.
let mut docs_start_index = 0;
if let (Some(docs_adjacent_node), false) =
(docs_adjacent_node, doc_nodes.is_empty())
{
docs_start_index = doc_nodes.len();
let mut start_row = docs_adjacent_node.start_position().row;
while docs_start_index > 0 {
let doc_node = &doc_nodes[docs_start_index - 1];
let prev_doc_end_row = doc_node.end_position().row;
if prev_doc_end_row + 1 >= start_row {
docs_start_index -= 1;
start_row = doc_node.start_position().row;
} else {
break;
if pattern_info.name_must_be_non_local {
let mut is_local = false;
for scope in self.scopes.iter().rev() {
if scope.range.start <= name_range.start
&& scope.range.end >= name_range.end
{
if scope
.local_defs
.iter()
.any(|d| d.name == &self.source[name_range.clone()])
{
is_local = true;
break;
}
if !scope.inherits {
break;
}
}
}
if is_local {
continue;
}
}
}
// Generate a doc string from all of the doc nodes, applying any strip regexes.
let mut docs = None;
for doc_node in &doc_nodes[docs_start_index..] {
if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) {
let content = if let Some(regex) = &pattern_info.doc_strip_regex {
regex.replace_all(content, "").to_string()
} else {
content.to_string()
};
match &mut docs {
None => docs = Some(content),
Some(d) => {
d.push('\n');
d.push_str(&content);
// If needed, filter the doc nodes based on their ranges, selecting
// only the slice that are adjacent to some specified node.
let mut docs_start_index = 0;
if let (Some(docs_adjacent_node), false) =
(docs_adjacent_node, doc_nodes.is_empty())
{
docs_start_index = doc_nodes.len();
let mut start_row = docs_adjacent_node.start_position().row;
while docs_start_index > 0 {
let doc_node = &doc_nodes[docs_start_index - 1];
let prev_doc_end_row = doc_node.end_position().row;
if prev_doc_end_row + 1 >= start_row {
docs_start_index -= 1;
start_row = doc_node.start_position().row;
} else {
break;
}
}
}
// Generate a doc string from all of the doc nodes, applying any strip regexes.
let mut docs = None;
for doc_node in &doc_nodes[docs_start_index..] {
if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()])
{
let content = if let Some(regex) = &pattern_info.doc_strip_regex {
regex.replace_all(content, "").to_string()
} else {
content.to_string()
};
match &mut docs {
None => docs = Some(content),
Some(d) => {
d.push('\n');
d.push_str(&content);
}
}
}
}
let rng = tag_node.byte_range();
let range = rng.start.min(name_range.start)..rng.end.max(name_range.end);
let span = name_node.start_position()..name_node.end_position();
// Compute tag properties that depend on the text of the containing line. If the
// previous tag occurred on the same line, then reuse results from the previous tag.
let line_range;
let mut prev_utf16_column = 0;
let mut prev_utf8_byte = name_range.start - span.start.column;
let line_info = self.prev_line_info.as_ref().and_then(|info| {
if info.utf8_position.row == span.start.row {
Some(info)
} else {
None
}
});
if let Some(line_info) = line_info {
line_range = line_info.line_range.clone();
if line_info.utf8_position.column <= span.start.column {
prev_utf8_byte = line_info.utf8_byte;
prev_utf16_column = line_info.utf16_column;
}
} else {
line_range = self::line_range(
self.source,
name_range.start,
span.start,
MAX_LINE_LEN,
);
}
let utf16_start_column = prev_utf16_column
+ utf16_len(&self.source[prev_utf8_byte..name_range.start]);
let utf16_end_column =
utf16_start_column + utf16_len(&self.source[name_range.clone()]);
let utf16_column_range = utf16_start_column..utf16_end_column;
self.prev_line_info = Some(LineInfo {
utf8_position: span.end,
utf8_byte: name_range.end,
utf16_column: utf16_end_column,
line_range: line_range.clone(),
});
tag = Tag {
line_range,
span,
utf16_column_range,
range,
name_range,
docs,
is_definition,
syntax_type_id,
};
} else if is_ignored {
tag = Tag::ignored(name_range);
} else {
continue;
}
// Only create one tag per node. The tag queue is sorted by node position
// to allow for fast lookup.
let range = tag_node.byte_range();
match self
.tag_queue
.binary_search_by_key(&(name_range.end, name_range.start), |(tag, _)| {
(tag.name_range.end, tag.name_range.start)
}) {
match self.tag_queue.binary_search_by_key(
&(tag.name_range.end, tag.name_range.start),
|(tag, _)| (tag.name_range.end, tag.name_range.start),
) {
Ok(i) => {
let (tag, pattern_index) = &mut self.tag_queue[i];
let (existing_tag, pattern_index) = &mut self.tag_queue[i];
if *pattern_index > mat.pattern_index {
*pattern_index = mat.pattern_index;
*tag = Tag {
line_range: line_range(self.source, range.start, MAX_LINE_LEN),
span: tag_node.start_position()..tag_node.end_position(),
kind,
range,
name_range,
docs,
};
*existing_tag = tag;
}
}
Err(i) => self.tag_queue.insert(
i,
(
Tag {
line_range: line_range(self.source, range.start, MAX_LINE_LEN),
span: tag_node.start_position()..tag_node.end_position(),
kind,
range,
name_range,
docs,
},
mat.pattern_index,
),
),
Err(i) => self.tag_queue.insert(i, (tag, mat.pattern_index)),
}
}
}
@ -448,16 +545,31 @@ where
}
}
impl fmt::Display for TagKind {
impl Tag {
fn ignored(name_range: Range<usize>) -> Self {
Tag {
name_range,
line_range: 0..0,
span: Point::new(0, 0)..Point::new(0, 0),
utf16_column_range: 0..0,
range: usize::MAX..usize::MAX,
docs: None,
is_definition: false,
syntax_type_id: 0,
}
}
fn is_ignored(&self) -> bool {
self.range.start == usize::MAX
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TagKind::Call => "Call",
TagKind::Module => "Module",
TagKind::Class => "Class",
TagKind::Method => "Method",
TagKind::Function => "Function",
Error::InvalidCapture(name) => write!(f, "Invalid capture @{}. Expected one of: @definition.*, @reference.*, @doc, @name, @local.(scope|definition|reference).", name),
_ => write!(f, "{:?}", self)
}
.fmt(f)
}
}
@ -473,11 +585,90 @@ impl From<QueryError> for Error {
}
}
fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range<usize> {
let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1);
let max_line_len = max_line_len.min(text.len() - start);
let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len);
start..end
// TODO: Remove this struct at at some point. If `core::str::lossy::Utf8Lossy`
// is ever stabilized, we should use that. Otherwise, this struct could be moved
// into some module that's shared between `tree-sitter-tags` and `tree-sitter-highlight`.
impl<'a> LossyUtf8<'a> {
fn new(bytes: &'a [u8]) -> Self {
LossyUtf8 {
bytes,
in_replacement: false,
}
}
}
impl<'a> Iterator for LossyUtf8<'a> {
type Item = &'a str;
fn next(&mut self) -> Option<&'a str> {
if self.bytes.is_empty() {
return None;
}
if self.in_replacement {
self.in_replacement = false;
return Some("\u{fffd}");
}
match str::from_utf8(self.bytes) {
Ok(valid) => {
self.bytes = &[];
Some(valid)
}
Err(error) => {
if let Some(error_len) = error.error_len() {
let error_start = error.valid_up_to();
if error_start > 0 {
let result =
unsafe { str::from_utf8_unchecked(&self.bytes[..error_start]) };
self.bytes = &self.bytes[(error_start + error_len)..];
self.in_replacement = true;
Some(result)
} else {
self.bytes = &self.bytes[error_len..];
Some("\u{fffd}")
}
} else {
None
}
}
}
}
}
fn line_range(
text: &[u8],
start_byte: usize,
start_point: Point,
max_line_len: usize,
) -> Range<usize> {
// Trim leading whitespace
let mut line_start_byte = start_byte - start_point.column;
while line_start_byte < text.len() && text[line_start_byte].is_ascii_whitespace() {
line_start_byte += 1;
}
let max_line_len = max_line_len.min(text.len() - line_start_byte);
let text_after_line_start = &text[line_start_byte..(line_start_byte + max_line_len)];
let line_len = if let Some(len) = memchr(b'\n', text_after_line_start) {
len
} else if let Err(e) = str::from_utf8(text_after_line_start) {
e.valid_up_to()
} else {
max_line_len
};
// Trim trailing whitespace
let mut line_end_byte = line_start_byte + line_len;
while line_end_byte > line_start_byte && text[line_end_byte - 1].is_ascii_whitespace() {
line_end_byte -= 1;
}
line_start_byte..line_end_byte
}
fn utf16_len(bytes: &[u8]) -> usize {
LossyUtf8::new(bytes)
.flat_map(|chunk| chunk.chars().map(char::len_utf16))
.sum()
}
#[cfg(test)]
@ -486,14 +677,27 @@ mod tests {
#[test]
fn test_get_line() {
let text = b"abc\ndefg\nhijkl";
assert_eq!(line_range(text, 0, 10), 0..3);
assert_eq!(line_range(text, 1, 10), 0..3);
assert_eq!(line_range(text, 2, 10), 0..3);
assert_eq!(line_range(text, 3, 10), 0..3);
assert_eq!(line_range(text, 1, 2), 0..2);
assert_eq!(line_range(text, 4, 10), 4..8);
assert_eq!(line_range(text, 5, 10), 4..8);
assert_eq!(line_range(text, 11, 10), 9..14);
let text = "abc\ndefg❤hij\nklmno".as_bytes();
assert_eq!(line_range(text, 5, Point::new(1, 1), 30), 4..14);
assert_eq!(line_range(text, 5, Point::new(1, 1), 6), 4..8);
assert_eq!(line_range(text, 17, Point::new(2, 2), 30), 15..20);
assert_eq!(line_range(text, 17, Point::new(2, 2), 4), 15..19);
}
#[test]
fn test_get_line_trims() {
let text = b" foo\nbar\n";
assert_eq!(line_range(text, 0, Point::new(0, 0), 10), 3..6);
let text = b"\t func foo \nbar\n";
assert_eq!(line_range(text, 0, Point::new(0, 0), 10), 2..10);
let r = line_range(text, 0, Point::new(0, 0), 14);
assert_eq!(r, 2..10);
assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "func foo");
let r = line_range(text, 12, Point::new(1, 0), 14);
assert_eq!(r, 12..15);
assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "bar");
}
}

View file

@ -0,0 +1,19 @@
==========================
Heredocs with errors
==========================
joins(<<~SQL(
b
SQL
c
---
(program
(method_call
method: (identifier)
(ERROR (heredoc_beginning))
arguments: (argument_list
(heredoc_body (heredoc_end))
(identifier)
(MISSING ")"))))

View file

@ -0,0 +1,23 @@
=====
Extras
=====
;
%;
%foo:;
;
bar: baz:;
;
---
(program
(statement)
(macro_statement (statement))
(macro_statement (statement
(label_declaration (identifier))))
(statement)
(statement
(label_declaration (identifier))
(label_declaration (identifier)))
(statement))

View file

@ -0,0 +1,68 @@
{
"name": "extra_non_terminals_with_shared_rules",
"extras": [
{ "type": "PATTERN", "value": "\\s+" },
{ "type": "SYMBOL", "name": "macro_statement" }
],
"rules": {
"program": {
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "SEQ",
"members": [
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "label_declaration"
}
},
{
"type": "STRING",
"value": ";"
}
]
},
"macro_statement": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "%"
},
{
"type": "SYMBOL",
"name": "statement"
}
]
},
"label_declaration": {
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "identifier"
},
{
"type": "STRING",
"value": ":"
}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-zA-Z]+"
}
},
"conflicts": [],
"externals": [],
"inline": [],
"supertypes": []
}

View file

@ -22,10 +22,10 @@ The fuzzers can then be built with:
export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin
CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \
LIB_FUZZER_PATH=$HOME/src/compiler-rt/lib/fuzzer/libFuzzer.a \
./script/build_fuzzers
./script/build-fuzzers
```
This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build_fuzzers python ruby`.
This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`.
The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments:
```