From 456b1f6771de9ec689ea350eb4cbdfcf14baa283 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Aug 2020 16:28:54 -0700 Subject: [PATCH] Fix handling of alternations and optional nodes in query analysis --- cli/src/tests/query_test.rs | 139 +++++++++++++++++++++++++++++++++--- lib/src/query.c | 91 ++++++++++++----------- script/test | 12 ++-- 3 files changed, 190 insertions(+), 52 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index e7231ef0..816c3aee 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1,11 +1,17 @@ use super::helpers::allocations; use super::helpers::fixtures::get_language; +use lazy_static::lazy_static; +use std::env; use std::fmt::Write; use tree_sitter::{ Language, Node, Parser, Query, QueryCapture, QueryCursor, QueryError, QueryMatch, QueryPredicate, QueryPredicateArg, QueryProperty, }; +lazy_static! { + static ref EXAMPLE_FILTER: Option = env::var("TREE_SITTER_TEST_EXAMPLE_FILTER").ok(); +} + #[test] fn test_query_errors_on_invalid_syntax() { allocations::record(|| { @@ -234,6 +240,34 @@ fn test_query_errors_on_impossible_patterns() { .join("\n") )) ); + + Query::new( + js_lang, + "[ + (function (identifier)) + (function_declaration (identifier)) + (generator_function_declaration (identifier)) + ]", + ) + .unwrap(); + assert_eq!( + Query::new( + js_lang, + "[ + (function (identifier)) + (function_declaration (object)) + (generator_function_declaration (identifier)) + ]", + ), + Err(QueryError::Structure( + 3, + [ + " (function_declaration (object))", // + " ^", + ] + .join("\n") + )) + ); }); } @@ -2322,37 +2356,92 @@ fn test_query_alternative_predicate_prefix() { fn test_query_step_is_definite() { struct Row { language: Language, + description: &'static str, pattern: &'static str, results_by_substring: &'static [(&'static str, bool)], } let rows = &[ Row { + description: "no definite steps", language: get_language("python"), pattern: r#"(expression_statement (string))"#, results_by_substring: &[("expression_statement", false), ("string", false)], }, Row { - language: get_language("javascript"), - pattern: r#"(expression_statement (string))"#, - results_by_substring: &[("expression_statement", false), ("string", false)], - }, - Row { + description: "all definite steps", language: get_language("javascript"), pattern: r#"(object "{" "}")"#, results_by_substring: &[("object", false), ("{", true), ("}", true)], }, Row { + description: "an indefinite step that is optional", + language: get_language("javascript"), + pattern: r#"(object "{" (identifier)? @foo "}")"#, + results_by_substring: &[ + ("object", false), + ("{", true), + ("(identifier)?", false), + ("}", true), + ], + }, + Row { + description: "multiple indefinite steps that are optional", + language: get_language("javascript"), + pattern: r#"(object "{" (identifier)? @id1 ("," (identifier) @id2)? "}")"#, + results_by_substring: &[ + ("object", false), + ("{", true), + ("(identifier)? @id1", false), + ("\",\"", false), + ("}", true), + ], + }, + Row { + description: "definite step after indefinite step", language: get_language("javascript"), pattern: r#"(pair (property_identifier) ":")"#, results_by_substring: &[("pair", false), ("property_identifier", false), (":", true)], }, Row { + description: "indefinite step in between two definite steps", language: get_language("javascript"), - pattern: r#"(object "{" (_) "}")"#, - results_by_substring: &[("object", false), ("{", false), ("", false), ("}", true)], + pattern: r#"(ternary_expression + condition: (_) + "?" + consequence: (call_expression) + ":" + alternative: (_))"#, + results_by_substring: &[ + ("condition:", false), + ("\"?\"", false), + ("consequence:", false), + ("\":\"", true), + ("alternative:", true), + ], }, Row { + description: "one definite step after a repetition", + language: get_language("javascript"), + pattern: r#"(object "{" (_) "}")"#, + results_by_substring: &[("object", false), ("{", false), ("(_)", false), ("}", true)], + }, + Row { + description: "definite steps after multiple repetitions", + language: get_language("json"), + pattern: r#"(object "{" (pair) "," (pair) "," (_) "}")"#, + results_by_substring: &[ + ("object", false), + ("{", false), + ("(pair) \",\" (pair)", false), + ("(pair) \",\" (_)", false), + ("\",\" (_)", false), + ("(_)", true), + ("}", true), + ], + }, + Row { + description: "a definite with a field", language: get_language("javascript"), pattern: r#"(binary_expression left: (identifier) right: (_))"#, results_by_substring: &[ @@ -2362,6 +2451,7 @@ fn test_query_step_is_definite() { ], }, Row { + description: "multiple definite steps with fields", language: get_language("javascript"), pattern: r#"(function_declaration name: (identifier) body: (statement_block))"#, results_by_substring: &[ @@ -2371,6 +2461,7 @@ fn test_query_step_is_definite() { ], }, Row { + description: "nesting, one definite step", language: get_language("javascript"), pattern: r#" (function_declaration @@ -2386,6 +2477,7 @@ fn test_query_step_is_definite() { ], }, Row { + description: "definite step after some deeply nested hidden nodes", language: get_language("ruby"), pattern: r#" (singleton_class @@ -2399,6 +2491,7 @@ fn test_query_step_is_definite() { ], }, Row { + description: "nesting, no definite steps", language: get_language("javascript"), pattern: r#" (call_expression @@ -2409,6 +2502,7 @@ fn test_query_step_is_definite() { results_by_substring: &[("property_identifier", false), ("template_string", false)], }, Row { + description: "a definite step after a nested node", language: get_language("javascript"), pattern: r#" (subscript_expression @@ -2424,6 +2518,7 @@ fn test_query_step_is_definite() { ], }, Row { + description: "a step that is indefinite due to a predicate", language: get_language("javascript"), pattern: r#" (subscript_expression @@ -2439,17 +2534,45 @@ fn test_query_step_is_definite() { ("[", true), ], }, + Row { + description: "alternation where one branch has definite steps", + language: get_language("javascript"), + pattern: r#" + [ + (unary_expression (identifier)) + (call_expression + function: (_) + arguments: (_)) + (binary_expression right:(call_expression)) + ] + "#, + results_by_substring: &[ + ("identifier", false), + ("right:", false), + ("function:", true), + ("arguments:", true), + ], + }, ]; allocations::record(|| { + eprintln!(""); + for row in rows.iter() { + if let Some(filter) = EXAMPLE_FILTER.as_ref() { + if !row.description.contains(filter.as_str()) { + continue; + } + } + eprintln!(" query example: {:?}", row.description); let query = Query::new(row.language, row.pattern).unwrap(); for (substring, is_definite) in row.results_by_substring { let offset = row.pattern.find(substring).unwrap(); assert_eq!( query.step_is_definite(offset), *is_definite, - "Pattern: {:?}, substring: {:?}, expected is_definite to be {}", + "Description: {}, Pattern: {:?}, substring: {:?}, expected is_definite to be {}", + row.description, row.pattern .split_ascii_whitespace() .collect::>() diff --git a/lib/src/query.c b/lib/src/query.c index 60c892d3..8464a691 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -1144,34 +1144,18 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { next_states = _states; } - // A query step is definite if the containing pattern will definitely match - // once the step is reached. In other words, a step is *not* definite if - // it's possible to create a syntax node that matches up to until that step, - // but does not match the entire pattern. - uint32_t child_step_index = parent_step_index + 1; - QueryStep *child_step = &self->steps.contents[child_step_index]; - while (child_step->depth == parent_depth + 1) { - // Check if there is any way for the pattern to reach this step, but fail - // to reach the end of the sub-pattern. - for (unsigned k = 0; k < final_step_indices.size; k++) { - uint32_t final_step_index = final_step_indices.contents[k]; - if ( - final_step_index >= child_step_index && - self->steps.contents[final_step_index].depth == child_step->depth - ) { - child_step->is_definite = false; - break; - } + // Mark as indefinite any step where a match terminated. + // Later, this property will be propagated to all of the step's predecessors. + for (unsigned j = 0; j < final_step_indices.size; j++) { + uint32_t final_step_index = final_step_indices.contents[j]; + QueryStep *step = &self->steps.contents[final_step_index]; + if ( + step->depth != PATTERN_DONE_MARKER && + step->depth > parent_depth && + !step->is_dead_end + ) { + step->is_definite = false; } - - // Advance to the next child step in this sub-pattern. - do { - child_step_index++; - child_step++; - } while ( - child_step->depth != PATTERN_DONE_MARKER && - child_step->depth > parent_depth + 1 - ); } // If this pattern cannot match, store the pattern index so that it can be @@ -1187,9 +1171,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } } - // In order for a step to be definite, all of its child steps must be definite, - // and all of its later sibling steps must be definite. Propagate any indefiniteness - // upward and backward through the pattern trees. + // Mark as indefinite any step with captures that are used in predicates. Array(uint16_t) predicate_capture_ids = array_new(); for (unsigned i = 0; i < self->patterns.size; i++) { QueryPattern *pattern = &self->patterns.contents[i]; @@ -1207,16 +1189,13 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } } - bool all_later_children_definite = true; + // Find all of the steps that have these captures. for ( unsigned start = pattern->steps.offset, end = start + pattern->steps.length, - j = end - 1; j + 1 > start; j-- + j = start; j < end; j++ ) { QueryStep *step = &self->steps.contents[j]; - - // If this step has a capture that is used in a predicate, - // then it is not definite. for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) { uint16_t capture_id = step->capture_ids[k]; if (capture_id == NONE) break; @@ -1227,10 +1206,41 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { break; } } + } + } - // If a step is not definite, then none of its predecessors can be definite. - if (!all_later_children_definite) step->is_definite = false; - if (!step->is_definite) all_later_children_definite = false; + // Propagate indefiniteness backwards. + bool done = self->steps.size == 0; + while (!done) { + done = true; + for (unsigned i = self->steps.size - 1; i > 0; i--) { + QueryStep *step = &self->steps.contents[i]; + + // Determine if this step is definite or has definite alternatives. + bool is_definite = false; + for (;;) { + if (step->is_definite) { + is_definite = true; + break; + } + if (step->alternative_index == NONE || step->alternative_index < i) { + break; + } + step = &self->steps.contents[step->alternative_index]; + } + + // If not, mark its predecessor as indefinite. + if (!is_definite) { + QueryStep *prev_step = &self->steps.contents[i - 1]; + if ( + !prev_step->is_dead_end && + prev_step->depth != PATTERN_DONE_MARKER && + prev_step->is_definite + ) { + prev_step->is_definite = false; + done = false; + } + } } } @@ -1242,11 +1252,12 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { printf(" %u: DONE\n", i); } else { printf( - " %u: {symbol: %s, is_definite: %d}\n", + " %u: {symbol: %s, field: %s, is_definite: %d}\n", i, (step->symbol == WILDCARD_SYMBOL || step->symbol == NAMED_WILDCARD_SYMBOL) ? "ANY" : ts_language_symbol_name(self->language, step->symbol), + (step->field ? ts_language_field_name_for_id(self->language, step->field) : "-"), step->is_definite ); } @@ -1979,7 +1990,7 @@ bool ts_query_step_is_definite( uint32_t step_index = UINT32_MAX; for (unsigned i = 0; i < self->step_offsets.size; i++) { StepOffset *step_offset = &self->step_offsets.contents[i]; - if (step_offset->byte_offset >= byte_offset) break; + if (step_offset->byte_offset > byte_offset) break; step_index = step_offset->step_index; } if (step_index < self->steps.size) { diff --git a/script/test b/script/test index bcc88e24..31e90226 100755 --- a/script/test +++ b/script/test @@ -83,10 +83,14 @@ done shift $(expr $OPTIND - 1) -if [[ -n $TREE_SITTER_TEST_LANGUAGE_FILTER || -n $TREE_SITTER_TEST_EXAMPLE_FILTER || -n $TREE_SITTER_TEST_TRIAL_FILTER ]]; then - top_level_filter=corpus -else - top_level_filter=$1 +top_level_filter=$1 + +if [[ \ + -n $TREE_SITTER_TEST_LANGUAGE_FILTER || \ + -n $TREE_SITTER_TEST_EXAMPLE_FILTER || \ + -n $TREE_SITTER_TEST_TRIAL_FILTER \ +]]; then + echo ${top_level_filter:=corpus} fi if [[ "${mode}" == "debug" ]]; then