Merge pull request #644 from tree-sitter/query-pattern-is-definite

Analyze queries on construction to identify impossible patterns, and patterns that will definitely match
This commit is contained in:
Max Brunsfeld 2020-09-02 10:28:21 -07:00 committed by GitHub
commit 18150a1573
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 2039 additions and 408 deletions

View file

@ -12,9 +12,9 @@ extern "C" {
#include <stdbool.h>
#include "./alloc.h"
#define Array(T) \
struct { \
T *contents; \
#define Array(T) \
struct { \
T *contents; \
uint32_t size; \
uint32_t capacity; \
}
@ -37,15 +37,15 @@ extern "C" {
#define array_reserve(self, new_capacity) \
array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity)
#define array_erase(self, index) \
array__erase((VoidArray *)(self), array__elem_size(self), index)
// Free any memory allocated for this array.
#define array_delete(self) array__delete((VoidArray *)self)
#define array_push(self, element) \
(array__grow((VoidArray *)(self), 1, array__elem_size(self)), \
(self)->contents[(self)->size++] = (element))
// Increase the array's size by a given number of elements, reallocating
// if necessary. New elements are zero-initialized.
#define array_grow_by(self, count) \
(array__grow((VoidArray *)(self), count, array__elem_size(self)), \
memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \
@ -54,18 +54,64 @@ extern "C" {
#define array_push_all(self, other) \
array_splice((self), (self)->size, 0, (other)->size, (other)->contents)
// Remove `old_count` elements from the array starting at the given `index`. At
// the same index, insert `new_count` new elements, reading their values from the
// `new_contents` pointer.
#define array_splice(self, index, old_count, new_count, new_contents) \
array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \
new_count, new_contents)
// Insert one `element` into the array at the given `index`.
#define array_insert(self, index, element) \
array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &element)
// Remove one `element` from the array at the given `index`.
#define array_erase(self, index) \
array__erase((VoidArray *)(self), array__elem_size(self), index)
#define array_pop(self) ((self)->contents[--(self)->size])
#define array_assign(self, other) \
array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self))
// Search a sorted array for a given `needle` value, using the given `compare`
// callback to determine the order.
//
// If an existing element is found to be equal to `needle`, then the `index`
// out-parameter is set to the existing value's index, and the `exists`
// out-parameter is set to true. Otherwise, `index` is set to an index where
// `needle` should be inserted in order to preserve the sorting, and `exists`
// is set to false.
#define array_search_sorted_with(self, compare, needle, index, exists) \
array__search_sorted(self, 0, compare, , needle, index, exists)
// Search a sorted array for a given `needle` value, using integer comparisons
// of a given struct field (specified with a leading dot) to determine the order.
//
// See also `array_search_sorted_with`.
#define array_search_sorted_by(self, field, needle, index, exists) \
array__search_sorted(self, 0, _compare_int, field, needle, index, exists)
// Insert a given `value` into a sorted array, using the given `compare`
// callback to determine the order.
#define array_insert_sorted_with(self, compare, value) \
do { \
unsigned index, exists; \
array_search_sorted_with(self, compare, &(value), &index, &exists); \
if (!exists) array_insert(self, index, value); \
} while (0)
// Insert a given `value` into a sorted array, using integer comparisons of
// a given struct field (specified with a leading dot) to determine the order.
//
// See also `array_search_sorted_by`.
#define array_insert_sorted_by(self, field, value) \
do { \
unsigned index, exists; \
array_search_sorted_by(self, field, (value) field, &index, &exists); \
if (!exists) array_insert(self, index, value); \
} while (0)
// Private
typedef Array(void) VoidArray;
@ -151,6 +197,30 @@ static inline void array__splice(VoidArray *self, size_t element_size,
self->size += new_count - old_count;
}
// A binary search routine, based on Rust's `std::slice::binary_search_by`.
#define array__search_sorted(self, start, compare, suffix, needle, index, exists) \
do { \
*(index) = start; \
*(exists) = false; \
uint32_t size = (self)->size - *(index); \
if (size == 0) break; \
int comparison; \
while (size > 1) { \
uint32_t half_size = size / 2; \
uint32_t mid_index = *(index) + half_size; \
comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \
if (comparison <= 0) *(index) = mid_index; \
size -= half_size; \
} \
comparison = compare(&((self)->contents[*(index)] suffix), (needle)); \
if (comparison == 0) *(exists) = true; \
else if (comparison < 0) *(index) += 1; \
} while (0)
// Helper macro for the `_sorted_by` routines below. This takes the left (existing)
// parameter by reference in order to work with the generic sorting function above.
#define _compare_int(a, b) ((int)*(a) - (int)(b))
#ifdef __cplusplus
}
#endif

View file

@ -146,17 +146,21 @@ static bool iterator_tree_is_visible(const Iterator *self) {
if (ts_subtree_visible(*entry.subtree)) return true;
if (self->cursor.stack.size > 1) {
Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
return ts_language_alias_at(
self->language,
parent.ptr->production_id
);
return alias_sequence && alias_sequence[entry.structural_child_index] != 0;
parent.ptr->production_id,
entry.structural_child_index
) != 0;
}
return false;
}
static void iterator_get_visible_state(const Iterator *self, Subtree *tree,
TSSymbol *alias_symbol, uint32_t *start_byte) {
static void iterator_get_visible_state(
const Iterator *self,
Subtree *tree,
TSSymbol *alias_symbol,
uint32_t *start_byte
) {
uint32_t i = self->cursor.stack.size - 1;
if (self->in_padding) {
@ -169,13 +173,11 @@ static void iterator_get_visible_state(const Iterator *self, Subtree *tree,
if (i > 0) {
const Subtree *parent = self->cursor.stack.contents[i - 1].subtree;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
*alias_symbol = ts_language_alias_at(
self->language,
parent->ptr->production_id
parent->ptr->production_id,
entry.structural_child_index
);
if (alias_sequence) {
*alias_symbol = alias_sequence[entry.structural_child_index];
}
}
if (ts_subtree_visible(*entry.subtree) || *alias_symbol) {

View file

@ -12,6 +12,8 @@ extern "C" {
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT 12
#define TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP 12
typedef struct {
const TSParseAction *actions;
@ -19,6 +21,22 @@ typedef struct {
bool is_reusable;
} TableEntry;
typedef struct {
const TSLanguage *language;
const uint16_t *data;
const uint16_t *group_end;
TSStateId state;
uint16_t table_value;
uint16_t section_index;
uint16_t group_count;
bool is_small_state;
const TSParseAction *actions;
TSSymbol symbol;
TSStateId next_state;
uint16_t action_count;
} LookaheadIterator;
void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *);
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
@ -41,22 +59,33 @@ static inline const TSParseAction *ts_language_actions(
return entry.actions;
}
static inline bool ts_language_has_actions(const TSLanguage *self,
TSStateId state,
TSSymbol symbol) {
static inline bool ts_language_has_actions(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
TableEntry entry;
ts_language_table_entry(self, state, symbol, &entry);
return entry.action_count > 0;
}
static inline bool ts_language_has_reduce_action(const TSLanguage *self,
TSStateId state,
TSSymbol symbol) {
static inline bool ts_language_has_reduce_action(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
TableEntry entry;
ts_language_table_entry(self, state, symbol, &entry);
return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce;
}
// Lookup the table value for a given symbol and state.
//
// For non-terminal symbols, the table value represents a successor state.
// For terminal symbols, it represents an index in the actions table.
// For 'large' parse states, this is a direct lookup. For 'small' parse
// states, this requires searching through the symbol groups to find
// the given symbol.
static inline uint16_t ts_language_lookup(
const TSLanguage *self,
TSStateId state,
@ -68,8 +97,8 @@ static inline uint16_t ts_language_lookup(
) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
const uint16_t *data = &self->small_parse_table[index];
uint16_t section_count = *(data++);
for (unsigned i = 0; i < section_count; i++) {
uint16_t group_count = *(data++);
for (unsigned i = 0; i < group_count; i++) {
uint16_t section_value = *(data++);
uint16_t symbol_count = *(data++);
for (unsigned i = 0; i < symbol_count; i++) {
@ -82,9 +111,90 @@ static inline uint16_t ts_language_lookup(
}
}
static inline TSStateId ts_language_next_state(const TSLanguage *self,
TSStateId state,
TSSymbol symbol) {
// Iterate over all of the symbols that are valid in the given state.
//
// For 'large' parse states, this just requires iterating through
// all possible symbols and checking the parse table for each one.
// For 'small' parse states, this exploits the structure of the
// table to only visit the valid symbols.
static inline LookaheadIterator ts_language_lookaheads(
const TSLanguage *self,
TSStateId state
) {
bool is_small_state =
self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
state >= self->large_state_count;
const uint16_t *data;
const uint16_t *group_end = NULL;
uint16_t group_count = 0;
if (is_small_state) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
data = &self->small_parse_table[index];
group_end = data + 1;
group_count = *data;
} else {
data = &self->parse_table[state * self->symbol_count] - 1;
}
return (LookaheadIterator) {
.language = self,
.data = data,
.group_end = group_end,
.group_count = group_count,
.is_small_state = is_small_state,
.symbol = UINT16_MAX,
.next_state = 0,
};
}
static inline bool ts_lookahead_iterator_next(LookaheadIterator *self) {
// For small parse states, valid symbols are listed explicitly,
// grouped by their value. There's no need to look up the actions
// again until moving to the next group.
if (self->is_small_state) {
self->data++;
if (self->data == self->group_end) {
if (self->group_count == 0) return false;
self->group_count--;
self->table_value = *(self->data++);
unsigned symbol_count = *(self->data++);
self->group_end = self->data + symbol_count;
self->symbol = *self->data;
} else {
self->symbol = *self->data;
return true;
}
}
// For large parse states, iterate through every symbol until one
// is found that has valid actions.
else {
do {
self->data++;
self->symbol++;
if (self->symbol >= self->language->symbol_count) return false;
self->table_value = *self->data;
} while (!self->table_value);
}
// Depending on if the symbols is terminal or non-terminal, the table value either
// represents a list of actions or a successor state.
if (self->symbol < self->language->token_count) {
const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value];
self->action_count = entry->entry.count;
self->actions = (const TSParseAction *)(entry + 1);
self->next_state = 0;
} else {
self->action_count = 0;
self->next_state = self->table_value;
}
return true;
}
static inline TSStateId ts_language_next_state(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
return 0;
} else if (symbol < self->token_count) {
@ -102,9 +212,10 @@ static inline TSStateId ts_language_next_state(const TSLanguage *self,
}
}
static inline const bool *
ts_language_enabled_external_tokens(const TSLanguage *self,
unsigned external_scanner_state) {
static inline const bool *ts_language_enabled_external_tokens(
const TSLanguage *self,
unsigned external_scanner_state
) {
if (external_scanner_state == 0) {
return NULL;
} else {
@ -112,13 +223,25 @@ ts_language_enabled_external_tokens(const TSLanguage *self,
}
}
static inline const TSSymbol *
ts_language_alias_sequence(const TSLanguage *self, uint32_t production_id) {
return production_id > 0 ?
self->alias_sequences + production_id * self->max_alias_sequence_length :
static inline const TSSymbol *ts_language_alias_sequence(
const TSLanguage *self,
uint32_t production_id
) {
return production_id ?
&self->alias_sequences[production_id * self->max_alias_sequence_length] :
NULL;
}
static inline TSSymbol ts_language_alias_at(
const TSLanguage *self,
uint32_t production_id,
uint32_t child_index
) {
return production_id ?
self->alias_sequences[production_id * self->max_alias_sequence_length + child_index] :
0;
}
static inline void ts_language_field_map(
const TSLanguage *self,
uint32_t production_id,
@ -136,6 +259,32 @@ static inline void ts_language_field_map(
*end = &self->field_map_entries[slice.index] + slice.length;
}
static inline void ts_language_aliases_for_symbol(
const TSLanguage *self,
TSSymbol original_symbol,
const TSSymbol **start,
const TSSymbol **end
) {
*start = &self->public_symbol_map[original_symbol];
*end = *start + 1;
if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP) return;
unsigned i = 0;
for (;;) {
TSSymbol symbol = self->alias_map[i++];
if (symbol == 0 || symbol > original_symbol) break;
uint16_t count = self->alias_map[i++];
if (symbol == original_symbol) {
*start = &self->alias_map[i];
*end = &self->alias_map[i + count];
break;
}
i += count;
}
}
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -360,7 +360,7 @@ void ts_subtree_set_children(
self.ptr->has_external_tokens = false;
self.ptr->dynamic_precedence = 0;
uint32_t non_extra_index = 0;
uint32_t structural_index = 0;
const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id);
uint32_t lookahead_end_byte = 0;
@ -387,9 +387,9 @@ void ts_subtree_set_children(
self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child);
self.ptr->node_count += ts_subtree_node_count(child);
if (alias_sequence && alias_sequence[non_extra_index] != 0 && !ts_subtree_extra(child)) {
if (alias_sequence && alias_sequence[structural_index] != 0 && !ts_subtree_extra(child)) {
self.ptr->visible_child_count++;
if (ts_language_symbol_metadata(language, alias_sequence[non_extra_index]).named) {
if (ts_language_symbol_metadata(language, alias_sequence[structural_index]).named) {
self.ptr->named_child_count++;
}
} else if (ts_subtree_visible(child)) {
@ -407,7 +407,7 @@ void ts_subtree_set_children(
self.ptr->parse_state = TS_TREE_STATE_NONE;
}
if (!ts_subtree_extra(child)) non_extra_index++;
if (!ts_subtree_extra(child)) structural_index++;
}
self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes;

View file

@ -205,19 +205,21 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) {
TreeCursor *self = (TreeCursor *)_self;
for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) {
TreeCursorEntry *entry = &self->stack.contents[i];
bool is_aliased = false;
if (i > 0) {
TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
is_aliased = alias_sequence && alias_sequence[entry->structural_child_index];
}
if (ts_subtree_visible(*entry->subtree) || is_aliased) {
if (ts_subtree_visible(*entry->subtree)) {
self->stack.size = i + 1;
return true;
}
if (i > 0 && !ts_subtree_extra(*entry->subtree)) {
TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
if (ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
entry->structural_child_index
)) {
self->stack.size = i + 1;
return true;
}
}
}
return false;
}
@ -226,15 +228,13 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
TreeCursorEntry *last_entry = array_back(&self->stack);
TSSymbol alias_symbol = 0;
if (self->stack.size > 1) {
if (self->stack.size > 1 && !ts_subtree_extra(*last_entry->subtree)) {
TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2];
const TSSymbol *alias_sequence = ts_language_alias_sequence(
alias_symbol = ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id
parent_entry->subtree->ptr->production_id,
last_entry->structural_child_index
);
if (alias_sequence && !ts_subtree_extra(*last_entry->subtree)) {
alias_symbol = alias_sequence[last_entry->structural_child_index];
}
}
return ts_node_new(
self->tree,
@ -263,13 +263,14 @@ TSFieldId ts_tree_cursor_current_status(
// Stop walking up when a visible ancestor is found.
if (i != self->stack.size - 1) {
if (ts_subtree_visible(*entry->subtree)) break;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
if (alias_sequence && alias_sequence[entry->structural_child_index]) {
break;
}
if (
!ts_subtree_extra(*entry->subtree) &&
ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
entry->structural_child_index
)
) break;
}
if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) {
@ -321,13 +322,14 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
// Stop walking up when another visible node is found.
if (i != self->stack.size - 1) {
if (ts_subtree_visible(*entry->subtree)) break;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
if (alias_sequence && alias_sequence[entry->structural_child_index]) {
break;
}
if (
!ts_subtree_extra(*entry->subtree) &&
ts_language_alias_at(
self->tree->language,
parent_entry->subtree->ptr->production_id,
entry->structural_child_index
)
) break;
}
if (ts_subtree_extra(*entry->subtree)) break;