Merge pull request #943 from tree-sitter/new-parse-table-repr

Clean up parse table representation, use 16 bits for production_id
This commit is contained in:
Max Brunsfeld 2021-02-25 20:21:22 -08:00 committed by GitHub
commit 48584c7cad
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 115 additions and 201 deletions

View file

@ -11,11 +11,8 @@ use std::collections::{HashMap, HashSet};
use std::fmt::Write;
use std::mem::swap;
// Currently, the library supports a new ABI version that has not yet been
// stabilized, and the parser generation does not use it by default.
const STABLE_LANGUAGE_VERSION: usize = tree_sitter::LANGUAGE_VERSION - 1;
const LARGE_CHARACTER_RANGE_COUNT: usize = 8;
const SMALL_STATE_THRESHOLD: usize = 64;
macro_rules! add {
($this: tt, $($arg: tt)*) => {{
@ -52,8 +49,6 @@ macro_rules! dedent {
};
}
const SMALL_STATE_THRESHOLD: usize = 64;
struct Generator {
buffer: String,
indent_level: usize,
@ -72,6 +67,8 @@ struct Generator {
unique_aliases: Vec<Alias>,
symbol_map: HashMap<Symbol, Symbol>,
field_names: Vec<String>,
#[allow(unused)]
next_abi: bool,
}
@ -109,9 +106,7 @@ impl Generator {
self.add_alias_sequences();
}
if self.next_abi {
self.add_non_terminal_alias_map();
}
self.add_non_terminal_alias_map();
let mut main_lex_table = LexTable::default();
swap(&mut main_lex_table, &mut self.main_lex_table);
@ -296,15 +291,11 @@ impl Generator {
})
.count();
if self.next_abi {
add_line!(
self,
"#define LANGUAGE_VERSION {}",
tree_sitter::LANGUAGE_VERSION
);
} else {
add_line!(self, "#define LANGUAGE_VERSION {}", STABLE_LANGUAGE_VERSION);
}
add_line!(
self,
"#define LANGUAGE_VERSION {}",
tree_sitter::LANGUAGE_VERSION
);
add_line!(
self,
@ -331,6 +322,11 @@ impl Generator {
"#define MAX_ALIAS_SEQUENCE_LENGTH {}",
self.parse_table.max_aliased_production_length
);
add_line!(
self,
"#define PRODUCTION_ID_COUNT {}",
self.parse_table.production_infos.len()
);
add_line!(self, "");
}
@ -488,8 +484,7 @@ impl Generator {
fn add_alias_sequences(&mut self) {
add_line!(
self,
"static TSSymbol ts_alias_sequences[{}][MAX_ALIAS_SEQUENCE_LENGTH] = {{",
self.parse_table.production_infos.len()
"static TSSymbol ts_alias_sequences[PRODUCTION_ID_COUNT][MAX_ALIAS_SEQUENCE_LENGTH] = {{",
);
indent!(self);
for (i, production_info) in self.parse_table.production_infos.iter().enumerate() {
@ -597,8 +592,7 @@ impl Generator {
add_line!(
self,
"static const TSFieldMapSlice ts_field_map_slices[{}] = {{",
self.parse_table.production_infos.len(),
"static const TSFieldMapSlice ts_field_map_slices[PRODUCTION_ID_COUNT] = {{",
);
indent!(self);
for (production_id, (row_id, length)) in field_map_ids.into_iter().enumerate() {
@ -1394,11 +1388,9 @@ impl Generator {
}
add_line!(self, ".public_symbol_map = ts_symbol_map,");
if self.next_abi {
add_line!(self, ".alias_map = ts_non_terminal_alias_map,");
add_line!(self, ".state_count = STATE_COUNT,");
}
add_line!(self, ".alias_map = ts_non_terminal_alias_map,");
add_line!(self, ".state_count = STATE_COUNT,");
add_line!(self, ".production_id_count = PRODUCTION_ID_COUNT,");
dedent!(self);
add_line!(self, "}};");

View file

@ -208,8 +208,8 @@ extern "C" {
#[doc = " following three fields:"]
#[doc = " 1. `read`: A function to retrieve a chunk of text at a given byte offset"]
#[doc = " and (row, column) position. The function should return a pointer to the"]
#[doc = " text and write its length to the the `bytes_read` pointer. The parser"]
#[doc = " does not take ownership of this buffer; it just borrows it until it has"]
#[doc = " text and write its length to the `bytes_read` pointer. The parser does"]
#[doc = " not take ownership of this buffer; it just borrows it until it has"]
#[doc = " finished reading it. The function should write a zero value to the"]
#[doc = " `bytes_read` pointer to indicate the end of the document."]
#[doc = " 2. `payload`: An arbitrary pointer that will be passed to each invocation"]
@ -697,7 +697,7 @@ extern "C" {
#[doc = " to start running a given query on a given syntax node. Then, there are"]
#[doc = " two options for consuming the results of the query:"]
#[doc = " 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the"]
#[doc = " the *matches* in the order that they were found. Each match contains the"]
#[doc = " *matches* in the order that they were found. Each match contains the"]
#[doc = " index of the pattern that matched, and an array of captures. Because"]
#[doc = " multiple patterns can match the same set of nodes, one match may contain"]
#[doc = " captures that appear *before* some of the captures from a previous match."]
@ -804,5 +804,5 @@ extern "C" {
pub fn ts_language_version(arg1: *const TSLanguage) -> u32;
}
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 12;
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 9;
pub const TREE_SITTER_LANGUAGE_VERSION: usize = 13;
pub const TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION: usize = 13;

View file

@ -21,13 +21,13 @@ extern "C" {
* The Tree-sitter library is generally backwards-compatible with languages
* generated using older CLI versions, but is not forwards-compatible.
*/
#define TREE_SITTER_LANGUAGE_VERSION 12
#define TREE_SITTER_LANGUAGE_VERSION 13
/**
* The earliest ABI version that is supported by the current version of the
* library.
*/
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 13
/*******************/
/* Section - Types */

View file

@ -13,6 +13,8 @@ extern "C" {
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
typedef uint16_t TSStateId;
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
@ -30,12 +32,10 @@ typedef struct {
uint16_t length;
} TSFieldMapSlice;
typedef uint16_t TSStateId;
typedef struct {
bool visible : 1;
bool named : 1;
bool supertype: 1;
bool visible;
bool named;
bool supertype;
} TSSymbolMetadata;
typedef struct TSLexer TSLexer;
@ -57,21 +57,21 @@ typedef enum {
TSParseActionTypeRecover,
} TSParseActionType;
typedef struct {
union {
struct {
TSStateId state;
bool extra : 1;
bool repetition : 1;
} shift;
struct {
TSSymbol symbol;
int16_t dynamic_precedence;
uint8_t child_count;
uint8_t production_id;
} reduce;
} params;
TSParseActionType type : 4;
typedef union {
struct {
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
int16_t dynamic_precedence;
uint16_t production_id;
} reduce;
uint8_t type;
} TSParseAction;
typedef struct {
@ -83,7 +83,7 @@ typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable : 1;
bool reusable;
} entry;
} TSParseActionEntry;
@ -122,6 +122,7 @@ struct TSLanguage {
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
uint32_t state_count;
uint32_t production_id_count;
};
/*
@ -170,66 +171,50 @@ struct TSLanguage {
#define ACTIONS(id) id
#define SHIFT(state_value) \
{ \
{ \
.params = { \
.shift = { \
.state = state_value \
} \
}, \
.type = TSParseActionTypeShift \
} \
}
#define SHIFT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value \
} \
}}
#define SHIFT_REPEAT(state_value) \
{ \
{ \
.params = { \
.shift = { \
.state = state_value, \
.repetition = true \
} \
}, \
.type = TSParseActionTypeShift \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value, \
.repetition = true \
} \
}
#define RECOVER() \
{ \
{ .type = TSParseActionTypeRecover } \
}
}}
#define SHIFT_EXTRA() \
{ \
{ \
.params = { \
.shift = { \
.extra = true \
} \
}, \
.type = TSParseActionTypeShift \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.extra = true \
} \
}
}}
#define REDUCE(symbol_val, child_count_val, ...) \
{ \
{ \
.params = { \
.reduce = { \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
}, \
}, \
.type = TSParseActionTypeReduce \
} \
}
{{ \
.reduce = { \
.type = TSParseActionTypeReduce, \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
}, \
}}
#define ACCEPT_INPUT() \
{ \
{ .type = TSParseActionTypeAccept } \
}
#define RECOVER() \
{{ \
.type = TSParseActionTypeRecover \
}}
#define ACCEPT_INPUT() \
{{ \
.type = TSParseActionTypeAccept \
}}
#ifdef __cplusplus
}

View file

@ -12,11 +12,7 @@ uint32_t ts_language_version(const TSLanguage *self) {
}
uint32_t ts_language_field_count(const TSLanguage *self) {
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
return self->field_count;
} else {
return 0;
}
return self->field_count;
}
void ts_language_table_entry(
@ -57,11 +53,7 @@ TSSymbol ts_language_public_symbol(
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) return symbol;
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
return self->public_symbol_map[symbol];
} else {
return symbol;
}
return self->public_symbol_map[symbol];
}
const char *ts_language_symbol_name(
@ -92,11 +84,7 @@ TSSymbol ts_language_symbol_for_name(
if ((!metadata.visible && !metadata.supertype) || metadata.named != is_named) continue;
const char *symbol_name = self->symbol_names[i];
if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
return self->public_symbol_map[i];
} else {
return i;
}
return self->public_symbol_map[i];
}
}
return 0;

View file

@ -9,11 +9,6 @@ extern "C" {
#include "tree_sitter/parser.h"
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT 12
#define TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP 12
typedef struct {
const TSParseAction *actions;
@ -91,10 +86,7 @@ static inline uint16_t ts_language_lookup(
TSStateId state,
TSSymbol symbol
) {
if (
self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
state >= self->large_state_count
) {
if (state >= self->large_state_count) {
uint32_t index = self->small_parse_table_map[state - self->large_state_count];
const uint16_t *data = &self->small_parse_table[index];
uint16_t group_count = *(data++);
@ -121,9 +113,7 @@ static inline LookaheadIterator ts_language_lookaheads(
const TSLanguage *self,
TSStateId state
) {
bool is_small_state =
self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES &&
state >= self->large_state_count;
bool is_small_state = state >= self->large_state_count;
const uint16_t *data;
const uint16_t *group_end = NULL;
uint16_t group_count = 0;
@ -203,7 +193,7 @@ static inline TSStateId ts_language_next_state(
if (count > 0) {
TSParseAction action = actions[count - 1];
if (action.type == TSParseActionTypeShift) {
return action.params.shift.extra ? state : action.params.shift.state;
return action.shift.extra ? state : action.shift.state;
}
}
return 0;
@ -248,7 +238,7 @@ static inline void ts_language_field_map(
const TSFieldMapEntry **start,
const TSFieldMapEntry **end
) {
if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS || self->field_count == 0) {
if (self->field_count == 0) {
*start = NULL;
*end = NULL;
return;
@ -268,8 +258,6 @@ static inline void ts_language_aliases_for_symbol(
*start = &self->public_symbol_map[original_symbol];
*end = *start + 1;
if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_ALIAS_MAP) return;
unsigned i = 0;
for (;;) {
TSSymbol symbol = self->alias_map[i++];

View file

@ -1012,15 +1012,15 @@ static bool ts_parser__do_all_potential_reductions(
switch (action.type) {
case TSParseActionTypeShift:
case TSParseActionTypeRecover:
if (!action.params.shift.extra && !action.params.shift.repetition) has_shift_action = true;
if (!action.shift.extra && !action.shift.repetition) has_shift_action = true;
break;
case TSParseActionTypeReduce:
if (action.params.reduce.child_count > 0)
if (action.reduce.child_count > 0)
ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){
.symbol = action.params.reduce.symbol,
.count = action.params.reduce.child_count,
.dynamic_precedence = action.params.reduce.dynamic_precedence,
.production_id = action.params.reduce.production_id,
.symbol = action.reduce.symbol,
.count = action.reduce.child_count,
.dynamic_precedence = action.reduce.dynamic_precedence,
.production_id = action.reduce.production_id,
});
break;
default:
@ -1311,7 +1311,7 @@ static void ts_parser__recover(
// be counted in error cost calculations.
unsigned n;
const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n);
if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.shift.extra) {
if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].shift.extra) {
MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead);
ts_subtree_set_extra(&mutable_lookahead);
lookahead = ts_subtree_from_mut(mutable_lookahead);
@ -1441,17 +1441,13 @@ static bool ts_parser__advance(
switch (action.type) {
case TSParseActionTypeShift: {
if (action.params.shift.repetition) break;
if (action.shift.repetition) break;
TSStateId next_state;
if (action.params.shift.extra) {
// TODO: remove when TREE_SITTER_LANGUAGE_VERSION 9 is out.
if (state == ERROR_STATE) continue;
if (action.shift.extra) {
next_state = state;
LOG("shift_extra");
} else {
next_state = action.params.shift.state;
next_state = action.shift.state;
LOG("shift state:%u", next_state);
}
@ -1460,7 +1456,7 @@ static bool ts_parser__advance(
next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead));
}
ts_parser__shift(self, version, next_state, lookahead, action.params.shift.extra);
ts_parser__shift(self, version, next_state, lookahead, action.shift.extra);
if (did_reuse) reusable_node_advance(&self->reusable_node);
return true;
}
@ -1468,10 +1464,10 @@ static bool ts_parser__advance(
case TSParseActionTypeReduce: {
bool is_fragile = table_entry.action_count > 1;
bool end_of_non_terminal_extra = lookahead.ptr == NULL;
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count);
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.reduce.symbol), action.reduce.child_count);
StackVersion reduction_version = ts_parser__reduce(
self, version, action.params.reduce.symbol, action.params.reduce.child_count,
action.params.reduce.dynamic_precedence, action.params.reduce.production_id,
self, version, action.reduce.symbol, action.reduce.child_count,
action.reduce.dynamic_precedence, action.reduce.production_id,
is_fragile, end_of_non_terminal_extra
);
if (reduction_version != STACK_VERSION_NONE) {

View file

@ -217,7 +217,6 @@ struct TSQuery {
Array(char) string_buffer;
const TSLanguage *language;
uint16_t wildcard_root_pattern_count;
TSSymbol *symbol_map;
};
/*
@ -755,7 +754,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
const TSSymbol *aliases, *aliases_end;
ts_language_aliases_for_symbol(
self->language,
action->params.reduce.symbol,
action->reduce.symbol,
&aliases,
&aliases_end
);
@ -772,15 +771,15 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
.state = state,
.production_id = action->params.reduce.production_id,
.child_index = action->params.reduce.child_count,
.production_id = action->reduce.production_id,
.child_index = action->reduce.child_count,
.done = true,
}));
}
}
}
} else if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
TSStateId next_state = action->params.shift.state;
} else if (action->type == TSParseActionTypeShift && !action->shift.extra) {
TSStateId next_state = action->shift.state;
state_predecessor_map_add(&predecessor_map, next_state, state);
}
}
@ -1019,8 +1018,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
TSStateId next_parse_state;
if (lookahead_iterator.action_count) {
const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
if (action->type == TSParseActionTypeShift && !action->params.shift.extra) {
next_parse_state = action->params.shift.state;
if (action->type == TSParseActionTypeShift && !action->shift.extra) {
next_parse_state = action->shift.state;
} else {
continue;
}
@ -1896,33 +1895,6 @@ TSQuery *ts_query_new(
uint32_t *error_offset,
TSQueryError *error_type
) {
TSSymbol *symbol_map;
if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
symbol_map = NULL;
} else {
// Work around the fact that multiple symbols can currently be
// associated with the same name, due to "simple aliases".
// In the next language ABI version, this map will be contained
// in the language's `public_symbol_map` field.
uint32_t symbol_count = ts_language_symbol_count(language);
symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count);
for (unsigned i = 0; i < symbol_count; i++) {
const char *name = ts_language_symbol_name(language, i);
const TSSymbolType symbol_type = ts_language_symbol_type(language, i);
symbol_map[i] = i;
for (unsigned j = 0; j < i; j++) {
if (ts_language_symbol_type(language, j) == symbol_type) {
if (!strcmp(name, ts_language_symbol_name(language, j))) {
symbol_map[i] = j;
break;
}
}
}
}
}
TSQuery *self = ts_malloc(sizeof(TSQuery));
*self = (TSQuery) {
.steps = array_new(),
@ -1933,7 +1905,6 @@ TSQuery *ts_query_new(
.patterns = array_new(),
.step_offsets = array_new(),
.string_buffer = array_new(),
.symbol_map = symbol_map,
.wildcard_root_pattern_count = 0,
.language = language,
};
@ -2003,12 +1974,10 @@ TSQuery *ts_query_new(
}
}
if (self->language->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_STATE_COUNT) {
if (!ts_query__analyze_patterns(self, error_offset)) {
*error_type = TSQueryErrorStructure;
ts_query_delete(self);
return NULL;
}
if (!ts_query__analyze_patterns(self, error_offset)) {
*error_type = TSQueryErrorStructure;
ts_query_delete(self);
return NULL;
}
ts_query__finalize_steps(self);
@ -2026,7 +1995,6 @@ void ts_query_delete(TSQuery *self) {
array_delete(&self->string_buffer);
symbol_table_delete(&self->captures);
symbol_table_delete(&self->predicate_values);
ts_free(self->symbol_map);
ts_free(self);
}
}
@ -2585,9 +2553,6 @@ static inline bool ts_query_cursor__advance(
// Get the properties of the current node.
TSSymbol symbol = ts_node_symbol(node);
bool is_named = ts_node_is_named(node);
if (symbol != ts_builtin_sym_error && self->query->symbol_map) {
symbol = self->query->symbol_map[symbol];
}
bool has_later_siblings;
bool has_later_named_siblings;
bool can_have_later_siblings_with_this_field;