Start work on recognizing impossible patterns

This commit is contained in:
Max Brunsfeld 2020-06-22 16:20:49 -07:00
parent 4c2f36a07b
commit 7f955419a8
7 changed files with 212 additions and 89 deletions

View file

@ -70,6 +70,10 @@ impl<'a> From<QueryError> for Error {
"Query error on line {}. Invalid syntax:\n{}",
row, l
)),
QueryError::Pattern(row, l) => Error::new(format!(
"Query error on line {}. Impossible pattern:\n{}",
row, l
)),
QueryError::Predicate(p) => Error::new(format!("Query error: {}", p)),
}
}

View file

@ -12,7 +12,11 @@ fn test_query_errors_on_invalid_syntax() {
let language = get_language("javascript");
assert!(Query::new(language, "(if_statement)").is_ok());
assert!(Query::new(language, "(if_statement condition:(identifier))").is_ok());
assert!(Query::new(
language,
"(if_statement condition:(parenthesized_expression (identifier)))"
)
.is_ok());
// Mismatched parens
assert_eq!(
@ -180,6 +184,28 @@ fn test_query_errors_on_invalid_conditions() {
});
}
/// A pattern that constrains the `left` field of a `binary_expression` twice
/// must be rejected with `QueryError::Pattern`. The expected error carries the
/// row value `1` and a message made of the offending source line followed by a
/// caret line with no leading padding.
#[test]
fn test_query_errors_on_impossible_patterns() {
    allocations::record(|| {
        let source = "(binary_expression left:(identifier) left:(identifier))";
        let outcome = Query::new(get_language("javascript"), source);

        // Offending line, newline, then the caret marker.
        let expected_message = format!("{}\n^", source);
        assert_eq!(outcome, Err(QueryError::Pattern(1, expected_message)));
    });
}
#[test]
fn test_query_matches_with_simple_pattern() {
allocations::record(|| {
@ -1946,10 +1972,10 @@ fn test_query_capture_names() {
language,
r#"
(if_statement
condition: (binary_expression
condition: (parenthesized_expression (binary_expression
left: _ @left-operand
operator: "||"
right: _ @right-operand)
right: _ @right-operand))
consequence: (statement_block) @body)
(while_statement
@ -2051,12 +2077,14 @@ fn test_query_disable_pattern() {
#[test]
fn test_query_is_definite() {
struct Row {
language: Language,
pattern: &'static str,
results_by_step_index: &'static [(usize, bool)],
}
let rows = &[
Row {
language: get_language("javascript"),
pattern: r#"(object "{" "}")"#,
results_by_step_index: &[
(0, false),
@ -2065,6 +2093,7 @@ fn test_query_is_definite() {
],
},
Row {
language: get_language("javascript"),
pattern: r#"(pair (property_identifier) ":")"#,
results_by_step_index: &[
(0, false),
@ -2073,6 +2102,7 @@ fn test_query_is_definite() {
],
},
Row {
language: get_language("javascript"),
pattern: r#"(object "{" (_) "}")"#,
results_by_step_index: &[
(0, false),
@ -2083,6 +2113,7 @@ fn test_query_is_definite() {
},
Row {
// Named wildcards, fields
language: get_language("javascript"),
pattern: r#"(binary_expression left: (identifier) right: (_))"#,
results_by_step_index: &[
(0, false),
@ -2091,6 +2122,7 @@ fn test_query_is_definite() {
],
},
Row {
language: get_language("javascript"),
pattern: r#"(function_declaration name: (identifier) body: (statement_block))"#,
results_by_step_index: &[
(0, false),
@ -2098,12 +2130,16 @@ fn test_query_is_definite() {
(2, true), // statement_block
],
},
Row {
language: get_language("javascript"),
pattern: r#""#,
results_by_step_index: &[],
},
];
allocations::record(|| {
let language = get_language("javascript");
for row in rows.iter() {
let query = Query::new(language, row.pattern).unwrap();
let query = Query::new(row.language, row.pattern).unwrap();
for (step_index, is_definite) in row.results_by_step_index {
assert_eq!(
query.pattern_is_definite(0, *step_index),

View file

@ -132,6 +132,7 @@ pub const TSQueryError_TSQueryErrorSyntax: TSQueryError = 1;
pub const TSQueryError_TSQueryErrorNodeType: TSQueryError = 2;
pub const TSQueryError_TSQueryErrorField: TSQueryError = 3;
pub const TSQueryError_TSQueryErrorCapture: TSQueryError = 4;
pub const TSQueryError_TSQueryErrorPattern: TSQueryError = 5;
pub type TSQueryError = u32;
extern "C" {
#[doc = " Create a new parser."]

View file

@ -163,6 +163,7 @@ pub enum QueryError {
Field(usize, String),
Capture(usize, String),
Predicate(String),
Pattern(usize, String),
}
#[derive(Debug)]
@ -1175,27 +1176,42 @@ impl Query {
}
});
let message = if let Some(line) = line_containing_error {
line.to_string() + "\n" + &" ".repeat(offset - line_start) + "^"
} else {
"Unexpected EOF".to_string()
};
// if line_containing_error
return if error_type != ffi::TSQueryError_TSQueryErrorSyntax {
let suffix = source.split_at(offset).1;
let end_offset = suffix
.find(|c| !char::is_alphanumeric(c) && c != '_' && c != '-')
.unwrap_or(source.len());
let name = suffix.split_at(end_offset).0.to_string();
match error_type {
ffi::TSQueryError_TSQueryErrorNodeType => Err(QueryError::NodeType(row, name)),
ffi::TSQueryError_TSQueryErrorField => Err(QueryError::Field(row, name)),
ffi::TSQueryError_TSQueryErrorCapture => Err(QueryError::Capture(row, name)),
_ => Err(QueryError::Syntax(row, message)),
return match error_type {
// Error types that report names
ffi::TSQueryError_TSQueryErrorNodeType
| ffi::TSQueryError_TSQueryErrorField
| ffi::TSQueryError_TSQueryErrorCapture => {
let suffix = source.split_at(offset).1;
let end_offset = suffix
.find(|c| !char::is_alphanumeric(c) && c != '_' && c != '-')
.unwrap_or(source.len());
let name = suffix.split_at(end_offset).0.to_string();
match error_type {
ffi::TSQueryError_TSQueryErrorNodeType => {
Err(QueryError::NodeType(row, name))
}
ffi::TSQueryError_TSQueryErrorField => Err(QueryError::Field(row, name)),
ffi::TSQueryError_TSQueryErrorCapture => {
Err(QueryError::Capture(row, name))
}
_ => unreachable!(),
}
}
// Error types that report positions
_ => {
let message = if let Some(line) = line_containing_error {
line.to_string() + "\n" + &" ".repeat(offset - line_start) + "^"
} else {
"Unexpected EOF".to_string()
};
match error_type {
ffi::TSQueryError_TSQueryErrorPattern => {
Err(QueryError::Pattern(row, message))
}
_ => Err(QueryError::Syntax(row, message)),
}
}
} else {
Err(QueryError::Syntax(row, message))
};
}

View file

@ -680,6 +680,9 @@ class Language {
case 4:
error = new RangeError(`Bad capture name @${word}`);
break;
case 5:
error = new SyntaxError(`Impossible pattern at offset ${errorIndex}: '${suffix}'...`);
break;
default:
error = new SyntaxError(`Bad syntax at offset ${errorIndex}: '${suffix}'...`);
break;

View file

@ -130,6 +130,7 @@ typedef enum {
TSQueryErrorNodeType,
TSQueryErrorField,
TSQueryErrorCapture,
TSQueryErrorPattern,
} TSQueryError;
/********************/

View file

@ -156,7 +156,8 @@ typedef struct {
TSStateId state;
TSSymbol parent_symbol;
uint16_t child_index;
TSFieldId field;
TSFieldId field_id: 15;
bool done: 1;
} WalkStateEntry;
typedef struct {
@ -165,6 +166,19 @@ typedef struct {
uint16_t step_index;
} WalkState;
// SubgraphNode - One entry in a SymbolSubgraph's `nodes` array, used by the
// pattern analysis. Entries are ordered by `subgraph_node__compare`: first by
// `state`, then by `child_index`, then by `production_id`.
typedef struct {
// Parse state this node refers to.
TSStateId state;
// Production id; the analysis passes it to `ts_language_alias_at` to look up
// the alias of a child.
uint8_t production_id;
// Child position (7-bit field); the analysis only follows a node whose
// `child_index` is one greater than the current walk position.
uint8_t child_index: 7;
// When set, the analysis marks the top walk-state entry as done after
// stepping onto this node.
bool done: 1;
} SubgraphNode;
// SymbolSubgraph - Per-symbol data used by the pattern analysis. The list of
// subgraphs is looked up with a sorted search keyed on `symbol`.
typedef struct {
// The symbol this subgraph belongs to; it becomes the `parent_symbol` of the
// initial walk states created for the subgraph.
TSSymbol symbol;
// NOTE(review): presumably the parse states in which a node with this symbol
// can begin - confirm against the code that populates it.
Array(TSStateId) start_states;
// Nodes of the subgraph, inserted at the position found by a sorted search
// using `subgraph_node__compare`.
Array(SubgraphNode) nodes;
} SymbolSubgraph;
/*
* StatePredecessorMap - A map that stores the predecessors of each parse state.
*/
@ -571,6 +585,16 @@ static inline int walk_state__compare(WalkState *self, WalkState *other) {
return 0;
}
// Three-way comparison of two subgraph nodes. Nodes are ordered by parse
// state first, then by child index, then by production id. Returns -1, 0 or 1.
static inline int subgraph_node__compare(SubgraphNode *self, SubgraphNode *other) {
  if (self->state != other->state) {
    return self->state < other->state ? -1 : 1;
  }
  if (self->child_index != other->child_index) {
    return self->child_index < other->child_index ? -1 : 1;
  }
  if (self->production_id != other->production_id) {
    return self->production_id < other->production_id ? -1 : 1;
  }
  return 0;
}
// Returns a pointer to the last entry of the walk state's stack, i.e. the
// entry at index `depth - 1`. No bounds check is performed.
static inline WalkStateEntry *walk_state__top(WalkState *self) {
  WalkStateEntry *entries = self->stack;
  return &entries[self->depth - 1];
}
@ -647,28 +671,17 @@ static inline void ts_query__pattern_map_insert(
}));
}
static void ts_query__analyze_patterns(TSQuery *self) {
static bool ts_query__analyze_patterns(TSQuery *self, unsigned *impossible_index) {
typedef struct {
TSSymbol parent_symbol;
uint32_t parent_step_index;
Array(uint32_t) child_step_indices;
} ParentPattern;
typedef struct {
TSStateId state;
uint8_t child_index;
uint8_t production_id;
bool done;
} SubgraphNode;
typedef struct {
TSSymbol symbol;
Array(TSStateId) start_states;
Array(SubgraphNode) nodes;
} SymbolSubgraph;
typedef Array(WalkState) WalkStateList;
bool result = true;
// Identify all of the patterns in the query that have child patterns. This
// includes both top-level patterns and patterns that are nested within some
// larger pattern. For each of these, record the parent symbol, the step index
@ -846,7 +859,11 @@ static void ts_query__analyze_patterns(TSQuery *self) {
.done = false,
};
unsigned index, exists;
array_search_sorted_by(&subgraph->nodes, 0, .state, predecessor_node.state, &index, &exists);
array_search_sorted_with(
&subgraph->nodes, 0,
subgraph_node__compare, &predecessor_node,
&index, &exists
);
if (!exists) {
array_insert(&subgraph->nodes, index, predecessor_node);
array_push(&next_nodes, predecessor_node);
@ -897,7 +914,8 @@ static void ts_query__analyze_patterns(TSQuery *self) {
.state = state,
.child_index = 0,
.parent_symbol = subgraph->symbol,
.field = 0,
.field_id = 0,
.done = false,
},
},
.depth = 1,
@ -923,20 +941,14 @@ static void ts_query__analyze_patterns(TSQuery *self) {
// ts_language_symbol_name(self->language, walk_state->stack[walk_state->depth - 1].parent_symbol)
// );
// }
// printf("\nFinished step indices for %u %s:", i, ts_language_symbol_name(self->language, parent_pattern->parent_symbol));
// for (unsigned j = 0; j < finished_step_indices.size; j++) {
// printf(" %u", finished_step_indices.contents[j]);
// }
// printf("\n\n");
// }
array_clear(&next_walk_states);
for (unsigned j = 0; j < walk_states.size; j++) {
WalkState *walk_state = &walk_states.contents[j];
TSStateId state = walk_state->stack[walk_state->depth - 1].state;
unsigned child_index = walk_state->stack[walk_state->depth - 1].child_index;
TSSymbol parent_symbol = walk_state->stack[walk_state->depth - 1].parent_symbol;
TSStateId state = walk_state__top(walk_state)->state;
unsigned child_index = walk_state__top(walk_state)->child_index;
TSSymbol parent_symbol = walk_state__top(walk_state)->parent_symbol;
unsigned subgraph_index, exists;
array_search_sorted_by(&subgraphs, 0, .symbol, parent_symbol, &subgraph_index, &exists);
@ -948,15 +960,14 @@ static void ts_query__analyze_patterns(TSQuery *self) {
if (successor_state && successor_state != state) {
unsigned node_index;
array_search_sorted_by(&subgraph->nodes, 0, .state, successor_state, &node_index, &exists);
if (exists) {
SubgraphNode *node = &subgraph->nodes.contents[node_index];
if (node->child_index != child_index + 1) continue;
while (exists && node_index < subgraph->nodes.size) {
SubgraphNode *node = &subgraph->nodes.contents[node_index++];
if (node->state != successor_state || node->child_index != child_index + 1) continue;
WalkState next_walk_state = *walk_state;
walk_state__top(&next_walk_state)->child_index++;
walk_state__top(&next_walk_state)->state = successor_state;
bool does_match = true;
unsigned step_index = parent_pattern->child_step_indices.contents[walk_state->step_index];
QueryStep *step = &self->steps.contents[step_index];
TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
@ -965,21 +976,6 @@ static void ts_query__analyze_patterns(TSQuery *self) {
: self->language->symbol_metadata[sym].visible
? self->language->public_symbol_map[sym]
: 0;
if (visible_symbol) {
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
if (!ts_language_symbol_metadata(self->language, visible_symbol).named) does_match = false;
} else if (step->symbol != WILDCARD_SYMBOL) {
if (step->symbol != visible_symbol) does_match = false;
}
} else if (next_walk_state.depth < MAX_WALK_STATE_DEPTH) {
does_match = false;
next_walk_state.depth++;
walk_state__top(&next_walk_state)->state = state;
walk_state__top(&next_walk_state)->child_index = 0;
walk_state__top(&next_walk_state)->parent_symbol = sym;
} else {
continue;
}
TSFieldId field_id = 0;
const TSFieldMapEntry *field_map, *field_map_end;
@ -991,18 +987,45 @@ static void ts_query__analyze_patterns(TSQuery *self) {
}
}
if (node->done) {
walk_state__top(&next_walk_state)->done = true;
}
bool does_match = true;
if (visible_symbol) {
if (step->symbol == NAMED_WILDCARD_SYMBOL) {
if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
} else if (step->symbol != WILDCARD_SYMBOL) {
if (step->symbol != visible_symbol) does_match = false;
}
if (step->field) {
bool does_match_field = step->field == field_id;
if (!does_match_field) {
for (unsigned i = 0; i < walk_state->depth; i++) {
if (walk_state->stack[i].field_id == step->field) {
does_match_field = true;
}
}
}
does_match &= does_match_field;
}
} else if (next_walk_state.depth < MAX_WALK_STATE_DEPTH) {
does_match = false;
next_walk_state.depth++;
walk_state__top(&next_walk_state)->state = state;
walk_state__top(&next_walk_state)->child_index = 0;
walk_state__top(&next_walk_state)->parent_symbol = sym;
walk_state__top(&next_walk_state)->field_id = field_id;
} else {
continue;
}
if (does_match) {
next_walk_state.step_index++;
}
if (node->done) {
next_walk_state.depth--;
}
if (
next_walk_state.depth == 0 ||
next_walk_state.step_index == parent_pattern->child_step_indices.size
) {
if (next_walk_state.step_index == parent_pattern->child_step_indices.size) {
unsigned index, exists;
array_search_sorted_by(&finished_step_indices, 0, , next_walk_state.step_index, &index, &exists);
if (!exists) array_insert(&finished_step_indices, index, next_walk_state.step_index);
@ -1011,19 +1034,39 @@ static void ts_query__analyze_patterns(TSQuery *self) {
unsigned index, exists;
array_search_sorted_with(
&next_walk_states,
0,
walk_state__compare,
&next_walk_state,
&index,
&exists
&next_walk_states, 0,
walk_state__compare, &next_walk_state,
&index, &exists
);
if (!exists) {
array_insert(&next_walk_states, index, next_walk_state);
}
if (!exists) array_insert(&next_walk_states, index, next_walk_state);
}
}
}
bool did_pop = false;
while (walk_state->depth > 0 && walk_state__top(walk_state)->done) {
walk_state->depth--;
did_pop = true;
}
if (did_pop) {
if (walk_state->depth == 0) {
unsigned index, exists;
array_search_sorted_by(&finished_step_indices, 0, , walk_state->step_index, &index, &exists);
if (!exists) array_insert(&finished_step_indices, index, walk_state->step_index);
} else {
unsigned index, exists;
array_search_sorted_with(
&next_walk_states,
0,
walk_state__compare,
walk_state,
&index,
&exists
);
if (!exists) array_insert(&next_walk_states, index, *walk_state);
}
}
}
WalkStateList _walk_states = walk_states;
@ -1037,7 +1080,7 @@ static void ts_query__analyze_patterns(TSQuery *self) {
// for (unsigned j = 0; j < finished_step_indices.size; j++) {
// printf(" %u", finished_step_indices.contents[j]);
// }
// printf("\n\n");
// printf(". Length: %u\n\n", parent_pattern->child_step_indices.size);
// }
// A query step is definite if the containing pattern will definitely match
@ -1055,6 +1098,16 @@ static void ts_query__analyze_patterns(TSQuery *self) {
}
}
}
if (finished_step_indices.size == 0 || *array_back(&finished_step_indices) < parent_pattern->child_step_indices.size) {
unsigned exists;
array_search_sorted_by(
&self->patterns, 0,
.start_step,
parent_pattern->parent_step_index, impossible_index, &exists);
result = false;
goto cleanup;
}
}
// In order for a parent step to be definite, all of its child steps must
@ -1090,6 +1143,7 @@ static void ts_query__analyze_patterns(TSQuery *self) {
// }
// Cleanup
cleanup:
for (unsigned i = 0; i < parent_patterns.size; i++) {
array_delete(&parent_patterns.contents[i].child_step_indices);
}
@ -1105,6 +1159,8 @@ static void ts_query__analyze_patterns(TSQuery *self) {
array_delete(&next_walk_states);
array_delete(&finished_step_indices);
state_predecessor_map_delete(&predecessor_map);
return result;
}
static void ts_query__finalize_steps(TSQuery *self) {
@ -1731,7 +1787,13 @@ TSQuery *ts_query_new(
}
if (self->language->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT) {
ts_query__analyze_patterns(self);
unsigned impossible_pattern_index = 0;
if (!ts_query__analyze_patterns(self, &impossible_pattern_index)) {
*error_type = TSQueryErrorPattern;
*error_offset = self->patterns.contents[impossible_pattern_index].start_byte;
ts_query_delete(self);
return NULL;
}
}
ts_query__finalize_steps(self);