From 7f955419a88caada8455f8f73230a7b32712b30c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 22 Jun 2020 16:20:49 -0700 Subject: [PATCH] Start work on recognizing impossible patterns --- cli/src/error.rs | 4 + cli/src/tests/query_test.rs | 46 +++++++- lib/binding_rust/bindings.rs | 1 + lib/binding_rust/lib.rs | 56 ++++++---- lib/binding_web/binding.js | 3 + lib/include/tree_sitter/api.h | 1 + lib/src/query.c | 190 ++++++++++++++++++++++------------ 7 files changed, 212 insertions(+), 89 deletions(-) diff --git a/cli/src/error.rs b/cli/src/error.rs index 824bd92f..4b493019 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -70,6 +70,10 @@ impl<'a> From for Error { "Query error on line {}. Invalid syntax:\n{}", row, l )), + QueryError::Pattern(row, l) => Error::new(format!( + "Query error on line {}. Impossible pattern:\n{}", + row, l + )), QueryError::Predicate(p) => Error::new(format!("Query error: {}", p)), } } diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 92aff5fb..cc42a70d 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -12,7 +12,11 @@ fn test_query_errors_on_invalid_syntax() { let language = get_language("javascript"); assert!(Query::new(language, "(if_statement)").is_ok()); - assert!(Query::new(language, "(if_statement condition:(identifier))").is_ok()); + assert!(Query::new( + language, + "(if_statement condition:(parenthesized_expression (identifier)))" + ) + .is_ok()); // Mismatched parens assert_eq!( @@ -180,6 +184,28 @@ fn test_query_errors_on_invalid_conditions() { }); } +#[test] +fn test_query_errors_on_impossible_patterns() { + allocations::record(|| { + let language = get_language("javascript"); + + assert_eq!( + Query::new( + language, + "(binary_expression left:(identifier) left:(identifier))" + ), + Err(QueryError::Pattern( + 1, + [ + "(binary_expression left:(identifier) left:(identifier))", // + "^" + ] + .join("\n") + )) + ); + }); +} + #[test] fn test_query_matches_with_simple_pattern() { allocations::record(|| { @@ -1946,10 +1972,10 @@ fn test_query_capture_names() { language, r#" (if_statement - condition: (binary_expression + condition: (parenthesized_expression (binary_expression left: _ @left-operand operator: "||" - right: _ @right-operand) + right: _ @right-operand)) consequence: (statement_block) @body) (while_statement @@ -2051,12 +2077,14 @@ fn test_query_disable_pattern() { #[test] fn test_query_is_definite() { struct Row { + language: Language, pattern: &'static str, results_by_step_index: &'static [(usize, bool)], } let rows = &[ Row { + language: get_language("javascript"), pattern: r#"(object "{" "}")"#, results_by_step_index: &[ (0, false), @@ -2065,6 +2093,7 @@ fn test_query_is_definite() { ], }, Row { + language: get_language("javascript"), pattern: r#"(pair (property_identifier) ":")"#, results_by_step_index: &[ (0, false), @@ -2073,6 +2102,7 @@ fn test_query_is_definite() { ], }, Row { + language: get_language("javascript"), pattern: r#"(object "{" (_) "}")"#, results_by_step_index: &[ (0, false), @@ -2083,6 +2113,7 @@ fn test_query_is_definite() { }, Row { // Named wildcards, fields + language: get_language("javascript"), pattern: r#"(binary_expression left: (identifier) right: (_))"#, results_by_step_index: &[ (0, false), @@ -2091,6 +2122,7 @@ fn test_query_is_definite() { ], }, Row { + language: get_language("javascript"), pattern: r#"(function_declaration name: (identifier) body: (statement_block))"#, results_by_step_index: &[ (0, false), @@ -2098,12 +2130,16 @@ fn test_query_is_definite() { (2, true), // statement_block ], }, + Row { + language: get_language("javascript"), + pattern: r#""#, + results_by_step_index: &[], + }, ]; allocations::record(|| { - let language = get_language("javascript"); for row in rows.iter() { - let query = Query::new(language, row.pattern).unwrap(); + let query = Query::new(row.language, row.pattern).unwrap(); for (step_index, is_definite) in row.results_by_step_index { assert_eq!( query.pattern_is_definite(0, *step_index), diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index 7dc48660..167edebf 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -132,6 +132,7 @@ pub const TSQueryError_TSQueryErrorSyntax: TSQueryError = 1; pub const TSQueryError_TSQueryErrorNodeType: TSQueryError = 2; pub const TSQueryError_TSQueryErrorField: TSQueryError = 3; pub const TSQueryError_TSQueryErrorCapture: TSQueryError = 4; +pub const TSQueryError_TSQueryErrorPattern: TSQueryError = 5; pub type TSQueryError = u32; extern "C" { #[doc = " Create a new parser."] diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 453cb8e7..d3284974 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -163,6 +163,7 @@ pub enum QueryError { Field(usize, String), Capture(usize, String), Predicate(String), + Pattern(usize, String), } #[derive(Debug)] @@ -1175,27 +1176,42 @@ impl Query { } }); - let message = if let Some(line) = line_containing_error { - line.to_string() + "\n" + &" ".repeat(offset - line_start) + "^" - } else { - "Unexpected EOF".to_string() - }; - - // if line_containing_error - return if error_type != ffi::TSQueryError_TSQueryErrorSyntax { - let suffix = source.split_at(offset).1; - let end_offset = suffix - .find(|c| !char::is_alphanumeric(c) && c != '_' && c != '-') - .unwrap_or(source.len()); - let name = suffix.split_at(end_offset).0.to_string(); - match error_type { - ffi::TSQueryError_TSQueryErrorNodeType => Err(QueryError::NodeType(row, name)), - ffi::TSQueryError_TSQueryErrorField => Err(QueryError::Field(row, name)), - ffi::TSQueryError_TSQueryErrorCapture => Err(QueryError::Capture(row, name)), - _ => Err(QueryError::Syntax(row, message)), + return match error_type { + // Error types that report names + ffi::TSQueryError_TSQueryErrorNodeType + | ffi::TSQueryError_TSQueryErrorField + | ffi::TSQueryError_TSQueryErrorCapture => { + let suffix = source.split_at(offset).1; + let end_offset = suffix + .find(|c| !char::is_alphanumeric(c) && c != '_' && c != '-') + .unwrap_or(source.len()); + let name = suffix.split_at(end_offset).0.to_string(); + match error_type { + ffi::TSQueryError_TSQueryErrorNodeType => { + Err(QueryError::NodeType(row, name)) + } + ffi::TSQueryError_TSQueryErrorField => Err(QueryError::Field(row, name)), + ffi::TSQueryError_TSQueryErrorCapture => { + Err(QueryError::Capture(row, name)) + } + _ => unreachable!(), + } + } + + // Error types that report positions + _ => { + let message = if let Some(line) = line_containing_error { + line.to_string() + "\n" + &" ".repeat(offset - line_start) + "^" + } else { + "Unexpected EOF".to_string() + }; + match error_type { + ffi::TSQueryError_TSQueryErrorPattern => { + Err(QueryError::Pattern(row, message)) + } + _ => Err(QueryError::Syntax(row, message)), + } } - } else { - Err(QueryError::Syntax(row, message)) }; } diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index 567b7eb3..cd8bec75 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -680,6 +680,9 @@ class Language { case 4: error = new RangeError(`Bad capture name @${word}`); break; + case 5: + error = new SyntaxError(`Impossible pattern at offset ${errorIndex}: '${suffix}'...`); + break; default: error = new SyntaxError(`Bad syntax at offset ${errorIndex}: '${suffix}'...`); break; diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index 1b2533fc..1abbf28c 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -130,6 +130,7 @@ typedef enum { TSQueryErrorNodeType, TSQueryErrorField, TSQueryErrorCapture, + TSQueryErrorPattern, } TSQueryError; /********************/ diff --git a/lib/src/query.c b/lib/src/query.c index 10ab5371..0b7530da 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -156,7 +156,8 @@ typedef struct { TSStateId state; TSSymbol parent_symbol; uint16_t child_index; - TSFieldId field; + TSFieldId field_id: 15; + bool done: 1; } WalkStateEntry; typedef struct { @@ -165,6 +166,19 @@ typedef struct { uint16_t step_index; } WalkState; +typedef struct { + TSStateId state; + uint8_t production_id; + uint8_t child_index: 7; + bool done: 1; +} SubgraphNode; + +typedef struct { + TSSymbol symbol; + Array(TSStateId) start_states; + Array(SubgraphNode) nodes; +} SymbolSubgraph; + /* * StatePredecessorMap - A map that stores the predecessors of each parse state. */ @@ -571,6 +585,16 @@ static inline int walk_state__compare(WalkState *self, WalkState *other) { return 0; } +static inline int subgraph_node__compare(SubgraphNode *self, SubgraphNode *other) { + if (self->state < other->state) return -1; + if (self->state > other->state) return 1; + if (self->child_index < other->child_index) return -1; + if (self->child_index > other->child_index) return 1; + if (self->production_id < other->production_id) return -1; + if (self->production_id > other->production_id) return 1; + return 0; +} + static inline WalkStateEntry *walk_state__top(WalkState *self) { return &self->stack[self->depth - 1]; } @@ -647,28 +671,17 @@ static inline void ts_query__pattern_map_insert( })); } -static void ts_query__analyze_patterns(TSQuery *self) { +static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index) { typedef struct { TSSymbol parent_symbol; uint32_t parent_step_index; Array(uint32_t) child_step_indices; } ParentPattern; - typedef struct { - TSStateId state; - uint8_t child_index; - uint8_t production_id; - bool done; - } SubgraphNode; - - typedef struct { - TSSymbol symbol; - Array(TSStateId) start_states; - Array(SubgraphNode) nodes; - } SymbolSubgraph; - typedef Array(WalkState) WalkStateList; + bool result = true; + // Identify all of the patterns in the query that have child patterns. This // includes both top-level patterns and patterns that are nested within some // larger pattern. For each of these, record the parent symbol, the step index @@ -846,7 +859,11 @@ static void ts_query__analyze_patterns(TSQuery *self) { .done = false, }; unsigned index, exists; - array_search_sorted_by(&subgraph->nodes, 0, .state, predecessor_node.state, &index, &exists); + array_search_sorted_with( + &subgraph->nodes, 0, + subgraph_node__compare, &predecessor_node, + &index, &exists + ); if (!exists) { array_insert(&subgraph->nodes, index, predecessor_node); array_push(&next_nodes, predecessor_node); @@ -897,7 +914,8 @@ static void ts_query__analyze_patterns(TSQuery *self) { .state = state, .child_index = 0, .parent_symbol = subgraph->symbol, - .field = 0, + .field_id = 0, + .done = false, }, }, .depth = 1, @@ -923,20 +941,14 @@ static void ts_query__analyze_patterns(TSQuery *self) { // ts_language_symbol_name(self->language, walk_state->stack[walk_state->depth - 1].parent_symbol) // ); // } - - // printf("\nFinished step indices for %u %s:", i, ts_language_symbol_name(self->language, parent_pattern->parent_symbol)); - // for (unsigned j = 0; j < finished_step_indices.size; j++) { - // printf(" %u", finished_step_indices.contents[j]); - // } - // printf("\n\n"); // } array_clear(&next_walk_states); for (unsigned j = 0; j < walk_states.size; j++) { WalkState *walk_state = &walk_states.contents[j]; - TSStateId state = walk_state->stack[walk_state->depth - 1].state; - unsigned child_index = walk_state->stack[walk_state->depth - 1].child_index; - TSSymbol parent_symbol = walk_state->stack[walk_state->depth - 1].parent_symbol; + TSStateId state = walk_state__top(walk_state)->state; + unsigned child_index = walk_state__top(walk_state)->child_index; + TSSymbol parent_symbol = walk_state__top(walk_state)->parent_symbol; unsigned subgraph_index, exists; array_search_sorted_by(&subgraphs, 0, .symbol, parent_symbol, &subgraph_index, &exists); @@ -948,15 +960,14 @@ static void ts_query__analyze_patterns(TSQuery *self) { if (successor_state && successor_state != state) { unsigned node_index; array_search_sorted_by(&subgraph->nodes, 0, .state, successor_state, &node_index, &exists); - if (exists) { - SubgraphNode *node = &subgraph->nodes.contents[node_index]; - if (node->child_index != child_index + 1) continue; + while (exists && node_index < subgraph->nodes.size) { + SubgraphNode *node = &subgraph->nodes.contents[node_index++]; + if (node->state != successor_state || node->child_index != child_index + 1) continue; WalkState next_walk_state = *walk_state; walk_state__top(&next_walk_state)->child_index++; walk_state__top(&next_walk_state)->state = successor_state; - bool does_match = true; unsigned step_index = parent_pattern->child_step_indices.contents[walk_state->step_index]; QueryStep *step = &self->steps.contents[step_index]; TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); @@ -965,21 +976,6 @@ static void ts_query__analyze_patterns(TSQuery *self) { : self->language->symbol_metadata[sym].visible ? self->language->public_symbol_map[sym] : 0; - if (visible_symbol) { - if (step->symbol == NAMED_WILDCARD_SYMBOL) { - if (!ts_language_symbol_metadata(self->language, visible_symbol).named) does_match = false; - } else if (step->symbol != WILDCARD_SYMBOL) { - if (step->symbol != visible_symbol) does_match = false; - } - } else if (next_walk_state.depth < MAX_WALK_STATE_DEPTH) { - does_match = false; - next_walk_state.depth++; - walk_state__top(&next_walk_state)->state = state; - walk_state__top(&next_walk_state)->child_index = 0; - walk_state__top(&next_walk_state)->parent_symbol = sym; - } else { - continue; - } TSFieldId field_id = 0; const TSFieldMapEntry *field_map, *field_map_end; @@ -991,18 +987,45 @@ static void ts_query__analyze_patterns(TSQuery *self) { } } + if (node->done) { + walk_state__top(&next_walk_state)->done = true; + } + + bool does_match = true; + if (visible_symbol) { + if (step->symbol == NAMED_WILDCARD_SYMBOL) { + if (!self->language->symbol_metadata[visible_symbol].named) does_match = false; + } else if (step->symbol != WILDCARD_SYMBOL) { + if (step->symbol != visible_symbol) does_match = false; + } + + if (step->field) { + bool does_match_field = step->field == field_id; + if (!does_match_field) { + for (unsigned i = 0; i < walk_state->depth; i++) { + if (walk_state->stack[i].field_id == step->field) { + does_match_field = true; + } + } + } + does_match &= does_match_field; + } + } else if (next_walk_state.depth < MAX_WALK_STATE_DEPTH) { + does_match = false; + next_walk_state.depth++; + walk_state__top(&next_walk_state)->state = state; + walk_state__top(&next_walk_state)->child_index = 0; + walk_state__top(&next_walk_state)->parent_symbol = sym; + walk_state__top(&next_walk_state)->field_id = field_id; + } else { + continue; + } + if (does_match) { next_walk_state.step_index++; } - if (node->done) { - next_walk_state.depth--; - } - - if ( - next_walk_state.depth == 0 || - next_walk_state.step_index == parent_pattern->child_step_indices.size - ) { + if (next_walk_state.step_index == parent_pattern->child_step_indices.size) { unsigned index, exists; array_search_sorted_by(&finished_step_indices, 0, , next_walk_state.step_index, &index, &exists); if (!exists) array_insert(&finished_step_indices, index, next_walk_state.step_index); @@ -1011,19 +1034,39 @@ static void ts_query__analyze_patterns(TSQuery *self) { unsigned index, exists; array_search_sorted_with( - &next_walk_states, - 0, - walk_state__compare, - &next_walk_state, - &index, - &exists + &next_walk_states, 0, + walk_state__compare, &next_walk_state, + &index, &exists ); - if (!exists) { - array_insert(&next_walk_states, index, next_walk_state); - } + if (!exists) array_insert(&next_walk_states, index, next_walk_state); } } } + + bool did_pop = false; + while (walk_state->depth > 0 && walk_state__top(walk_state)->done) { + walk_state->depth--; + did_pop = true; + } + + if (did_pop) { + if (walk_state->depth == 0) { + unsigned index, exists; + array_search_sorted_by(&finished_step_indices, 0, , walk_state->step_index, &index, &exists); + if (!exists) array_insert(&finished_step_indices, index, walk_state->step_index); + } else { + unsigned index, exists; + array_search_sorted_with( + &next_walk_states, + 0, + walk_state__compare, + walk_state, + &index, + &exists + ); + if (!exists) array_insert(&next_walk_states, index, *walk_state); + } + } } WalkStateList _walk_states = walk_states; @@ -1037,7 +1080,7 @@ static void ts_query__analyze_patterns(TSQuery *self) { // for (unsigned j = 0; j < finished_step_indices.size; j++) { // printf(" %u", finished_step_indices.contents[j]); // } - // printf("\n\n"); + // printf(". Length: %u\n\n", parent_pattern->child_step_indices.size); // } // A query step is definite if the containing pattern will definitely match @@ -1055,6 +1098,16 @@ static void ts_query__analyze_patterns(TSQuery *self) { } } } + + if (finished_step_indices.size == 0 || *array_back(&finished_step_indices) < parent_pattern->child_step_indices.size) { + unsigned exists; + array_search_sorted_by( + &self->patterns, 0, + .start_step, + parent_pattern->parent_step_index, impossible_index, &exists); + result = false; + goto cleanup; + } } // In order for a parent step to be definite, all of its child steps must @@ -1090,6 +1143,7 @@ static void ts_query__analyze_patterns(TSQuery *self) { // } // Cleanup +cleanup: for (unsigned i = 0; i < parent_patterns.size; i++) { array_delete(&parent_patterns.contents[i].child_step_indices); } @@ -1105,6 +1159,8 @@ static void ts_query__analyze_patterns(TSQuery *self) { array_delete(&next_walk_states); array_delete(&finished_step_indices); state_predecessor_map_delete(&predecessor_map); + + return result; } static void ts_query__finalize_steps(TSQuery *self) { @@ -1731,7 +1787,13 @@ TSQuery *ts_query_new( } if (self->language->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT) { - ts_query__analyze_patterns(self); + unsigned impossible_pattern_index = 0; + if (!ts_query__analyze_patterns(self, &impossible_pattern_index)) { + *error_type = TSQueryErrorPattern; + *error_offset = self->patterns.contents[impossible_pattern_index].start_byte; + ts_query_delete(self); + return NULL; + } } ts_query__finalize_steps(self);