Improve capture quantifier computation

Compute quantifiers in a bottom-up manner, which allows more precise
results for alternations, where the quantifiers are now precisly joined.
This commit is contained in:
Hendrik van Antwerpen 2021-12-02 19:04:49 +01:00
parent 9bac066330
commit 1f1a449c76
5 changed files with 506 additions and 78 deletions

View file

@ -118,9 +118,13 @@ typedef struct {
typedef struct {
Array(char) characters;
Array(Slice) slices;
Array(TSQuantifier) quantifiers;
} SymbolTable;
/**
* CaptureQuantififers - a data structure holding the quantifiers of pattern captures.
*/
typedef Array(TSQuantifier) CaptureQuantifiers;
/*
* PatternEntry - Information about the starting point for matching a particular
* pattern. These entries are stored in a 'pattern map' - a sorted array that
@ -263,6 +267,7 @@ typedef struct {
*/
struct TSQuery {
SymbolTable captures;
CaptureQuantifiers capture_quantifiers;
SymbolTable predicate_values;
Array(QueryStep) steps;
Array(PatternEntry) pattern_map;
@ -458,31 +463,104 @@ static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
* Quantifiers
**************/
static TSQuantifier quantifier_mul(
TSQuantifier left,
TSQuantifier right
) {
switch (left)
{
case Zero:
return Zero;
case One:
return right;
case OneOrMore:
switch (right) {
case Zero:
return Zero;
case ZeroOrOne:
case ZeroOrMore:
return ZeroOrMore;
case One:
case OneOrMore:
return OneOrMore;
};
break;
case ZeroOrOne:
switch (right) {
case Zero:
return Zero;
case ZeroOrOne:
case One:
return ZeroOrOne;
case ZeroOrMore:
case OneOrMore:
return ZeroOrMore;
};
break;
case ZeroOrMore:
switch (right) {
case Zero:
return Zero;
case ZeroOrOne:
case ZeroOrMore:
case One:
case OneOrMore:
return ZeroOrMore;
};
return ZeroOrMore;
}
}
static TSQuantifier quantifier_join(
TSQuantifier left,
TSQuantifier right
) {
switch (left)
{
case One:
return right;
case OneOrMore:
case Zero:
switch (right) {
case Zero:
return Zero;
case ZeroOrOne:
case One:
return ZeroOrOne;
case ZeroOrMore:
case OneOrMore:
return ZeroOrMore;
};
break;
case One:
switch (right) {
case Zero:
case ZeroOrOne:
return ZeroOrOne;
case ZeroOrMore:
return ZeroOrMore;
case One:
return One;
case OneOrMore:
return OneOrMore;
};
break;
case OneOrMore:
switch (right) {
case Zero:
case ZeroOrOne:
case ZeroOrMore:
return ZeroOrMore;
case One:
case OneOrMore:
return OneOrMore;
};
break;
case ZeroOrOne:
switch (right) {
case One:
case Zero:
case ZeroOrOne:
case One:
return ZeroOrOne;
case OneOrMore:
case ZeroOrMore:
case OneOrMore:
return ZeroOrMore;
};
break;
@ -491,6 +569,148 @@ static TSQuantifier quantifier_join(
}
}
static TSQuantifier quantifier_add(
TSQuantifier left,
TSQuantifier right
) {
switch (left)
{
case Zero:
return right;
case One:
switch (right) {
case Zero:
return One;
case ZeroOrOne:
case ZeroOrMore:
case One:
case OneOrMore:
return OneOrMore;
};
break;
case OneOrMore:
return OneOrMore;
case ZeroOrOne:
switch (right) {
case Zero:
return ZeroOrOne;
case ZeroOrOne:
case ZeroOrMore:
return ZeroOrMore;
case One:
case OneOrMore:
return OneOrMore;
};
break;
case ZeroOrMore:
switch (right) {
case Zero:
return ZeroOrMore;
case ZeroOrOne:
case ZeroOrMore:
return ZeroOrMore;
case One:
case OneOrMore:
return OneOrMore;
};
break;
}
}
// Create new capture quantifiers structure
static CaptureQuantifiers capture_quantifiers_new(void) {
return (CaptureQuantifiers) array_new();
}
// Delete capture quantifiers structure
static void capture_quantifiers_delete(
CaptureQuantifiers *self
) {
array_delete(self);
}
// Clear capture quantifiers structure
static void capture_quantifiers_clear(
CaptureQuantifiers *self
) {
array_clear(self);
}
// Replace capture quantifiers with the given quantifiers
static void capture_quantifiers_replace(
CaptureQuantifiers *self,
CaptureQuantifiers *quantifiers
) {
array_clear(self);
array_push_all(self, quantifiers);
}
// Return capture quantifier for the given capture id
static TSQuantifier capture_quantifier_for_id(
const CaptureQuantifiers *self,
uint16_t id
) {
return (self->size <= id) ? Zero : *array_get(self, id);
}
// Add the given quantifier to the current value for id
static void capture_quantifiers_add_for_id(
CaptureQuantifiers *self,
uint16_t id,
TSQuantifier quantifier
) {
if (self->size <= id) {
array_grow_by(self, id + 1 - self->size);
}
TSQuantifier *own_quantifier = array_get(self, id);
*own_quantifier = quantifier_add(*own_quantifier, quantifier);
}
// Point-wise add the given quantifiers to the current values
static void capture_quantifiers_add_all(
CaptureQuantifiers *self,
CaptureQuantifiers *quantifiers
) {
if (self->size < quantifiers->size) {
array_grow_by(self, quantifiers->size - self->size);
}
for (uint16_t id = 0; id < quantifiers->size; id++) {
TSQuantifier *quantifier = array_get(quantifiers, id);
TSQuantifier *own_quantifier = array_get(self, id);
*own_quantifier = quantifier_add(*own_quantifier, *quantifier);
}
}
// Join the given quantifier with the current values
static void capture_quantifiers_mul(
CaptureQuantifiers *self,
TSQuantifier quantifier
) {
for (uint16_t id = 0; id < self->size; id++) {
TSQuantifier *own_quantifier = array_get(self, id);
*own_quantifier = quantifier_mul(*own_quantifier, quantifier);
}
}
// Point-wise join the quantifiers from a list of alternatives with the current values
static void capture_quantifiers_join_all(
CaptureQuantifiers *self,
CaptureQuantifiers *quantifiers
) {
if (self->size < quantifiers->size) {
array_grow_by(self, quantifiers->size - self->size);
}
for (uint32_t id = 0; id < quantifiers->size; id++) {
TSQuantifier *quantifier = array_get(quantifiers, id);
TSQuantifier *own_quantifier = array_get(self, id);
*own_quantifier = quantifier_join(*own_quantifier, *quantifier);
}
for (uint32_t id = quantifiers->size; id < self->size; id++) {
TSQuantifier *own_quantifier = array_get(self, id);
*own_quantifier = quantifier_join(*own_quantifier, Zero);
}
}
/**************
* SymbolTable
**************/
@ -499,14 +719,12 @@ static SymbolTable symbol_table_new(void) {
return (SymbolTable) {
.characters = array_new(),
.slices = array_new(),
.quantifiers = array_new(),
};
}
static void symbol_table_delete(SymbolTable *self) {
array_delete(&self->characters);
array_delete(&self->slices);
array_delete(&self->quantifiers);
}
static int symbol_table_id_for_name(
@ -534,13 +752,6 @@ static const char *symbol_table_name_for_id(
return &self->characters.contents[slice.offset];
}
static TSQuantifier symbol_table_quantifier_for_id(
const SymbolTable *self,
uint16_t id
) {
return self->quantifiers.contents[id];
}
static uint16_t symbol_table_insert_name(
SymbolTable *self,
const char *name,
@ -556,22 +767,9 @@ static uint16_t symbol_table_insert_name(
memcpy(&self->characters.contents[slice.offset], name, length);
self->characters.contents[self->characters.size - 1] = 0;
array_push(&self->slices, slice);
array_push(&self->quantifiers, One);
return self->slices.size - 1;
}
static void symbol_table_quantifiers_join(
SymbolTable *self,
TSQuantifier quantifier,
uint32_t start_index,
uint32_t end_index
) {
for (uint32_t index = start_index; index < end_index; index++) {
TSQuantifier *joined_quantifier = &self->quantifiers.contents[index];
*joined_quantifier = quantifier_join(quantifier, *joined_quantifier);
}
}
/************
* QueryStep
************/
@ -1740,17 +1938,20 @@ static TSQueryError ts_query__parse_predicate(
// Read one S-expression pattern from the stream, and incorporate it into
// the query's internal state machine representation. For nested patterns,
// this function calls itself recursively.
//
// The caller is repsonsible for passing in a dedicated CaptureQuantifiers.
// These should not be shared between different calls to ts_query__parse_pattern!
static TSQueryError ts_query__parse_pattern(
TSQuery *self,
Stream *stream,
uint32_t depth,
bool is_immediate
bool is_immediate,
CaptureQuantifiers *capture_quantifiers
) {
if (stream->next == 0) return TSQueryErrorSyntax;
if (stream->next == ')' || stream->next == ']') return PARENT_DONE;
const uint32_t starting_step_index = self->steps.size;
const uint32_t starting_quantifier_index = self->captures.quantifiers.size;
// Store the byte offset of each step in the query.
if (
@ -1770,13 +1971,15 @@ static TSQueryError ts_query__parse_pattern(
// Parse each branch, and add a placeholder step in between the branches.
Array(uint32_t) branch_step_indices = array_new();
CaptureQuantifiers branch_capture_quantifiers = capture_quantifiers_new();
for (;;) {
uint32_t start_index = self->steps.size;
TSQueryError e = ts_query__parse_pattern(
self,
stream,
depth,
is_immediate
is_immediate,
&branch_capture_quantifiers
);
if (e == PARENT_DONE) {
@ -1787,12 +1990,20 @@ static TSQueryError ts_query__parse_pattern(
e = TSQueryErrorSyntax;
}
if (e) {
capture_quantifiers_delete(&branch_capture_quantifiers);
array_delete(&branch_step_indices);
return e;
}
if(start_index == 0) {
capture_quantifiers_replace(capture_quantifiers, &branch_capture_quantifiers);
} else {
capture_quantifiers_join_all(capture_quantifiers, &branch_capture_quantifiers);
}
array_push(&branch_step_indices, start_index);
array_push(&self->steps, query_step__new(0, depth, false));
capture_quantifiers_clear(&branch_capture_quantifiers);
}
(void)array_pop(&self->steps);
@ -1808,16 +2019,7 @@ static TSQueryError ts_query__parse_pattern(
end_step->is_dead_end = true;
}
if (branch_step_indices.size > 1) {
const uint32_t ending_quantifier_index = self->captures.quantifiers.size;
symbol_table_quantifiers_join(
&self->captures,
ZeroOrOne,
starting_quantifier_index,
ending_quantifier_index
);
}
capture_quantifiers_delete(&branch_capture_quantifiers);
array_delete(&branch_step_indices);
}
@ -1832,6 +2034,7 @@ static TSQueryError ts_query__parse_pattern(
// If this parenthesis is followed by a node, then it represents a grouped sequence.
if (stream->next == '(' || stream->next == '"' || stream->next == '[') {
bool child_is_immediate = false;
CaptureQuantifiers child_capture_quantifiers = capture_quantifiers_new();
for (;;) {
if (stream->next == '.') {
child_is_immediate = true;
@ -1842,7 +2045,8 @@ static TSQueryError ts_query__parse_pattern(
self,
stream,
depth,
child_is_immediate
child_is_immediate,
&child_capture_quantifiers
);
if (e == PARENT_DONE) {
if (stream->next == ')') {
@ -1851,10 +2055,17 @@ static TSQueryError ts_query__parse_pattern(
}
e = TSQueryErrorSyntax;
}
if (e) return e;
if (e) {
capture_quantifiers_delete(&child_capture_quantifiers);
return e;
}
capture_quantifiers_add_all(capture_quantifiers, &child_capture_quantifiers);
child_is_immediate = false;
capture_quantifiers_clear(&child_capture_quantifiers);
}
capture_quantifiers_delete(&child_capture_quantifiers);
}
// A dot/pound character indicates the start of a predicate.
@ -1943,12 +2154,16 @@ static TSQueryError ts_query__parse_pattern(
uint16_t last_child_step_index = 0;
uint16_t negated_field_count = 0;
TSFieldId negated_field_ids[MAX_NEGATED_FIELD_COUNT];
CaptureQuantifiers child_capture_quantifiers = capture_quantifiers_new();
for (;;) {
// Parse a negated field assertion
if (stream->next == '!') {
stream_advance(stream);
stream_skip_whitespace(stream);
if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
if (!stream_is_ident_start(stream)) {
capture_quantifiers_delete(&child_capture_quantifiers);
return TSQueryErrorSyntax;
}
const char *field_name = stream->input;
stream_scan_identifier(stream);
uint32_t length = stream->input - field_name;
@ -1961,6 +2176,7 @@ static TSQueryError ts_query__parse_pattern(
);
if (!field_id) {
stream->input = field_name;
capture_quantifiers_delete(&child_capture_quantifiers);
return TSQueryErrorField;
}
@ -1985,12 +2201,16 @@ static TSQueryError ts_query__parse_pattern(
self,
stream,
depth + 1,
child_is_immediate
child_is_immediate,
&child_capture_quantifiers
);
if (e == PARENT_DONE) {
if (stream->next == ')') {
if (child_is_immediate) {
if (last_child_step_index == 0) return TSQueryErrorSyntax;
if (last_child_step_index == 0) {
capture_quantifiers_delete(&child_capture_quantifiers);
return TSQueryErrorSyntax;
}
self->steps.contents[last_child_step_index].is_last_child = true;
}
@ -2008,11 +2228,18 @@ static TSQueryError ts_query__parse_pattern(
}
e = TSQueryErrorSyntax;
}
if (e) return e;
if (e) {
capture_quantifiers_delete(&child_capture_quantifiers);
return e;
}
capture_quantifiers_add_all(capture_quantifiers, &child_capture_quantifiers);
last_child_step_index = step_index;
child_is_immediate = false;
capture_quantifiers_clear(&child_capture_quantifiers);
}
capture_quantifiers_delete(&child_capture_quantifiers);
}
}
@ -2061,14 +2288,22 @@ static TSQueryError ts_query__parse_pattern(
stream_skip_whitespace(stream);
// Parse the pattern
CaptureQuantifiers field_capture_quantifiers = capture_quantifiers_new();
TSQueryError e = ts_query__parse_pattern(
self,
stream,
depth,
is_immediate
is_immediate,
&field_capture_quantifiers
);
if (e == PARENT_DONE) return TSQueryErrorSyntax;
if (e) return e;
if (e == PARENT_DONE) {
capture_quantifiers_delete(&field_capture_quantifiers);
return TSQueryErrorSyntax;
}
if (e) {
capture_quantifiers_delete(&field_capture_quantifiers);
return e;
}
// Add the field name to the first step of the pattern
TSFieldId field_id = ts_language_field_id_for_name(
@ -2096,6 +2331,9 @@ static TSQueryError ts_query__parse_pattern(
break;
}
}
capture_quantifiers_add_all(capture_quantifiers, &field_capture_quantifiers);
capture_quantifiers_delete(&field_capture_quantifiers);
}
else {
@ -2175,6 +2413,10 @@ static TSQueryError ts_query__parse_pattern(
for (;;) {
QueryStep *step = &self->steps.contents[step_index];
query_step__add_capture(step, capture_id);
// Add only once, not for every branch, lest the quantifier will be '+' instead of '1'
if (step_index == starting_step_index) {
capture_quantifiers_add_for_id(capture_quantifiers, capture_id, One);
}
if (
step->alternative_index != NONE &&
step->alternative_index > step_index &&
@ -2194,16 +2436,7 @@ static TSQueryError ts_query__parse_pattern(
}
}
// Patch capture quantifiers
if (quantifier != One) {
const uint32_t ending_quantifier_index = self->captures.quantifiers.size;
symbol_table_quantifiers_join(
&self->captures,
quantifier,
starting_quantifier_index,
ending_quantifier_index
);
}
capture_quantifiers_mul(capture_quantifiers, quantifier);
return 0;
}
@ -2229,6 +2462,7 @@ TSQuery *ts_query_new(
.steps = array_new(),
.pattern_map = array_new(),
.captures = symbol_table_new(),
.capture_quantifiers = capture_quantifiers_new(),
.predicate_values = symbol_table_new(),
.predicate_steps = array_new(),
.patterns = array_new(),
@ -2253,7 +2487,7 @@ TSQuery *ts_query_new(
.predicate_steps = (Slice) {.offset = start_predicate_step_index},
.start_byte = stream_offset(&stream),
}));
*error_type = ts_query__parse_pattern(self, &stream, 0, false);
*error_type = ts_query__parse_pattern(self, &stream, 0, false, &self->capture_quantifiers);
array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false));
QueryPattern *pattern = array_back(&self->patterns);
@ -2344,6 +2578,7 @@ void ts_query_delete(TSQuery *self) {
array_delete(&self->negated_fields);
symbol_table_delete(&self->captures);
symbol_table_delete(&self->predicate_values);
capture_quantifiers_delete(&self->capture_quantifiers);
ts_free(self);
}
}
@ -2372,7 +2607,7 @@ TSQuantifier ts_query_capture_quantifier_for_id(
const TSQuery *self,
uint32_t index
) {
return symbol_table_quantifier_for_id(&self->captures, index);
return capture_quantifier_for_id(&self->capture_quantifiers, index);
}
const char *ts_query_string_value_for_id(