query: Fix escape sequence parsing in anonymous node patterns

Fixes #776
Fixes #760
This commit is contained in:
Max Brunsfeld 2020-10-28 13:55:06 -07:00
parent 071f4e40f1
commit c2c63baf5b
3 changed files with 86 additions and 115 deletions

View file

@ -1475,6 +1475,7 @@ fn test_query_matches_with_anonymous_tokens() {
r#"
";" @punctuation
"&&" @operator
"\"" @quote
"#,
)
.unwrap();
@ -1482,9 +1483,11 @@ fn test_query_matches_with_anonymous_tokens() {
assert_query_matches(
language,
&query,
"foo(a && b);",
r#"foo(a && "b");"#,
&[
(1, vec![("operator", "&&")]),
(2, vec![("quote", "\"")]),
(2, vec![("quote", "\"")]),
(0, vec![("punctuation", ";")]),
],
);

View file

@ -52,14 +52,24 @@ extern "C" {
(self)->size += (count))
#define array_push_all(self, other) \
array_splice((self), (self)->size, 0, (other)->size, (other)->contents)
array_extend((self), (other)->size, (other)->contents)
// Append `count` elements to the end of the array, reading their values from the
// `contents` pointer.
#define array_extend(self, count, contents) \
array__splice( \
(VoidArray *)(self), array__elem_size(self), (self)->size, \
0, count, contents \
)
// Remove `old_count` elements from the array starting at the given `index`. At
// the same index, insert `new_count` new elements, reading their values from the
// `new_contents` pointer.
#define array_splice(self, index, old_count, new_count, new_contents) \
array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \
new_count, new_contents)
#define array_splice(self, index, old_count, new_count, new_contents) \
array__splice( \
(VoidArray *)(self), array__elem_size(self), index, \
old_count, new_count, new_contents \
)
// Insert one `element` into the array at the given `index`.
#define array_insert(self, index, element) \

View file

@ -214,6 +214,7 @@ struct TSQuery {
Array(TSQueryPredicateStep) predicate_steps;
Array(QueryPattern) patterns;
Array(StepOffset) step_offsets;
Array(char) string_buffer;
const TSLanguage *language;
uint16_t wildcard_root_pattern_count;
TSSymbol *symbol_map;
@ -439,67 +440,6 @@ static uint16_t symbol_table_insert_name(
return self->slices.size - 1;
}
static uint16_t symbol_table_insert_name_with_escapes(
SymbolTable *self,
const char *escaped_name,
uint32_t escaped_length
) {
Slice slice = {
.offset = self->characters.size,
.length = 0,
};
array_grow_by(&self->characters, escaped_length + 1);
// Copy the contents of the literal into the characters buffer, processing escape
// sequences like \n and \". This needs to be done before checking if the literal
// is already present, in order to do the string comparison.
bool is_escaped = false;
for (unsigned i = 0; i < escaped_length; i++) {
const char *src = &escaped_name[i];
char *dest = &self->characters.contents[slice.offset + slice.length];
if (is_escaped) {
switch (*src) {
case 'n':
*dest = '\n';
break;
case 'r':
*dest = '\r';
break;
case 't':
*dest = '\t';
break;
case '0':
*dest = '\0';
break;
default:
*dest = *src;
break;
}
is_escaped = false;
slice.length++;
} else {
if (*src == '\\') {
is_escaped = true;
} else {
*dest = *src;
slice.length++;
}
}
}
// If the string is already present, remove the redundant content from the characters
// buffer and return the existing id.
int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length);
if (id >= 0) {
self->characters.size -= (escaped_length + 1);
return id;
}
self->characters.contents[slice.offset + slice.length] = 0;
array_push(&self->slices, slice);
return self->slices.size - 1;
}
/************
* QueryStep
************/
@ -1393,6 +1333,59 @@ static void ts_query__finalize_steps(TSQuery *self) {
}
}
static TSQueryError ts_query__parse_string_literal(
TSQuery *self,
Stream *stream
) {
const char *string_start = stream->input;
if (stream->next != '"') return TSQueryErrorSyntax;
stream_advance(stream);
const char *prev_position = stream->input;
bool is_escaped = false;
array_clear(&self->string_buffer);
for (;;) {
if (is_escaped) {
is_escaped = false;
switch (stream->next) {
case 'n':
array_push(&self->string_buffer, '\n');
break;
case 'r':
array_push(&self->string_buffer, '\r');
break;
case 't':
array_push(&self->string_buffer, '\t');
break;
case '0':
array_push(&self->string_buffer, '\0');
break;
default:
array_extend(&self->string_buffer, stream->next_size, stream->input);
break;
}
prev_position = stream->input + stream->next_size;
} else {
if (stream->next == '\\') {
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
prev_position = stream->input + 1;
is_escaped = true;
} else if (stream->next == '"') {
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
stream_advance(stream);
return TSQueryErrorNone;
} else if (stream->next == '\n') {
stream_reset(stream, string_start);
return TSQueryErrorSyntax;
}
}
if (!stream_advance(stream)) {
stream_reset(stream, string_start);
return TSQueryErrorSyntax;
}
}
}
// Parse a single predicate associated with a pattern, adding it to the
// query's internal `predicate_steps` array. Predicates are arbitrary
// S-expressions associated with a pattern which are meant to be handled at
@ -1458,44 +1451,17 @@ static TSQueryError ts_query__parse_predicate(
// Parse a string literal
else if (stream->next == '"') {
stream_advance(stream);
// Parse the string content
bool is_escaped = false;
const char *string_content = stream->input;
for (;;) {
if (is_escaped) {
is_escaped = false;
} else {
if (stream->next == '\\') {
is_escaped = true;
} else if (stream->next == '"') {
break;
} else if (stream->next == '\n') {
stream_reset(stream, string_content - 1);
return TSQueryErrorSyntax;
}
}
if (!stream_advance(stream)) {
stream_reset(stream, string_content - 1);
return TSQueryErrorSyntax;
}
}
uint32_t length = stream->input - string_content;
// Add a step for the node
uint16_t id = symbol_table_insert_name_with_escapes(
TSQueryError e = ts_query__parse_string_literal(self, stream);
if (e) return e;
uint16_t id = symbol_table_insert_name(
&self->predicate_values,
string_content,
length
self->string_buffer.contents,
self->string_buffer.size
);
array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
.type = TSQueryPredicateStepTypeString,
.value_id = id,
}));
if (stream->next != '"') return TSQueryErrorSyntax;
stream_advance(stream);
}
// Parse a bare symbol
@ -1761,33 +1727,22 @@ static TSQueryError ts_query__parse_pattern(
// Parse a double-quoted anonymous leaf node expression
else if (stream->next == '"') {
stream_advance(stream);
// Parse the string content
const char *string_content = stream->input;
while (stream->next != '"') {
if (!stream_advance(stream)) {
stream_reset(stream, string_content - 1);
return TSQueryErrorSyntax;
}
}
uint32_t length = stream->input - string_content;
const char *string_start = stream->input;
TSQueryError e = ts_query__parse_string_literal(self, stream);
if (e) return e;
// Add a step for the node
TSSymbol symbol = ts_language_symbol_for_name(
self->language,
string_content,
length,
self->string_buffer.contents,
self->string_buffer.size,
false
);
if (!symbol) {
stream_reset(stream, string_content);
stream_reset(stream, string_start + 1);
return TSQueryErrorNodeType;
}
array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
if (stream->next != '"') return TSQueryErrorSyntax;
stream_advance(stream);
}
// Parse a field-prefixed pattern
@ -1977,6 +1932,7 @@ TSQuery *ts_query_new(
.predicate_steps = array_new(),
.patterns = array_new(),
.step_offsets = array_new(),
.string_buffer = array_new(),
.symbol_map = symbol_map,
.wildcard_root_pattern_count = 0,
.language = language,
@ -2056,6 +2012,7 @@ TSQuery *ts_query_new(
}
ts_query__finalize_steps(self);
array_delete(&self->string_buffer);
return self;
}
@ -2066,6 +2023,7 @@ void ts_query_delete(TSQuery *self) {
array_delete(&self->predicate_steps);
array_delete(&self->patterns);
array_delete(&self->step_offsets);
array_delete(&self->string_buffer);
symbol_table_delete(&self->captures);
symbol_table_delete(&self->predicate_values);
ts_free(self->symbol_map);