Merge pull request #94 from tree-sitter/large-external-scanner-states

Large external scanner states
This commit is contained in:
Max Brunsfeld 2017-07-18 09:35:37 -07:00 committed by GitHub
commit 20b664969e
18 changed files with 257 additions and 113 deletions

View file

@ -11,10 +11,10 @@ extern "C" {
typedef uint16_t TSSymbol;
typedef uint16_t TSStateId;
typedef uint8_t TSExternalTokenState[16];
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
typedef struct {
bool visible : 1;
@ -86,10 +86,9 @@ typedef struct TSLanguage {
const TSSymbol *symbol_map;
void *(*create)();
void (*destroy)(void *);
void (*reset)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
bool (*serialize)(void *, TSExternalTokenState);
void (*deserialize)(void *, const TSExternalTokenState);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
} TSLanguage;

View file

@ -21,11 +21,12 @@ fetch_grammar() {
)
}
fetch_grammar 'javascript' 'origin/master'
fetch_grammar 'javascript' 'origin/new-external-scanner-api'
fetch_grammar 'json' 'origin/master'
fetch_grammar 'c' 'origin/master'
fetch_grammar 'cpp' 'origin/master'
fetch_grammar 'python' 'origin/master'
fetch_grammar 'python' 'origin/new-external-scanner-api'
fetch_grammar 'go' 'origin/master'
fetch_grammar 'ruby' 'origin/master'
fetch_grammar 'typescript' 'origin/master'
fetch_grammar 'bash' 'origin/new-external-scanner-api'

View file

@ -418,10 +418,9 @@ class CCodeGenerator {
if (!syntax_grammar.external_tokens.empty()) {
line("void *" + external_scanner_name + "_create();");
line("void " + external_scanner_name + "_destroy();");
line("void " + external_scanner_name + "_reset(void *);");
line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);");
line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);");
line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);");
line("unsigned " + external_scanner_name + "_serialize(void *, char *);");
line("void " + external_scanner_name + "_deserialize(void *, const char *, unsigned);");
line();
}
@ -436,7 +435,6 @@ class CCodeGenerator {
line("ts_external_scanner_symbol_map,");
line(external_scanner_name + "_create,");
line(external_scanner_name + "_destroy,");
line(external_scanner_name + "_reset,");
line(external_scanner_name + "_scan,");
line(external_scanner_name + "_serialize,");
line(external_scanner_name + "_deserialize,");

View file

@ -143,7 +143,8 @@ void ts_document_parse_with_options(TSDocument *self, TSParseOptions options) {
tree_path_init(&self->parser.tree_path1, old_tree);
tree_path_init(&self->parser.tree_path2, tree);
tree_path_get_changes(&self->parser.tree_path1, &self->parser.tree_path2,
options.changed_ranges, options.changed_range_count);
options.changed_ranges, options.changed_range_count,
self->parser.language);
}
ts_tree_release(old_tree);

View file

@ -5,9 +5,9 @@
#include "runtime/utf16.h"
#include "utf8proc.h"
#define LOG(...) \
if (self->logger.log) { \
snprintf(self->debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \
#define LOG(...) \
if (self->logger.log) { \
snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
}

View file

@ -10,8 +10,6 @@ extern "C" {
#include "runtime/length.h"
#include "runtime/tree.h"
#define TS_DEBUG_BUFFER_SIZE 512
typedef struct {
TSLexer data;
Length current_position;
@ -25,7 +23,7 @@ typedef struct {
TSInput input;
TSLogger logger;
char debug_buffer[TS_DEBUG_BUFFER_SIZE];
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
Tree *last_external_token;
} Lexer;

View file

@ -13,16 +13,16 @@
#include "runtime/reduce_action.h"
#include "runtime/error_costs.h"
#define LOG(...) \
if (self->lexer.logger.log) { \
snprintf(self->lexer.debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \
self->lexer.logger.log(self->lexer.logger.payload, TSLogTypeParse, \
self->lexer.debug_buffer); \
} \
if (self->print_debugging_graphs) { \
fprintf(stderr, "graph {\nlabel=\""); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\"\n}\n\n"); \
#define LOG(...) \
if (self->lexer.logger.log) { \
snprintf(self->lexer.debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
self->lexer.logger.log(self->lexer.logger.payload, TSLogTypeParse, \
self->lexer.debug_buffer); \
} \
if (self->print_debugging_graphs) { \
fprintf(stderr, "graph {\nlabel=\""); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\"\n}\n\n"); \
}
#define LOG_STACK() \
@ -233,10 +233,11 @@ static void parser__restore_external_scanner(Parser *self, Tree *external_token)
if (external_token) {
self->language->external_scanner.deserialize(
self->external_scanner_payload,
external_token->external_token_state
ts_external_token_state_data(&external_token->external_token_state),
external_token->external_token_state.length
);
} else {
self->language->external_scanner.reset(self->external_scanner_payload);
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
}
}
@ -351,11 +352,11 @@ static Tree *parser__lex(Parser *self, StackVersion version) {
if (found_external_token) {
result->has_external_tokens = true;
memset(result->external_token_state, 0, sizeof(TSExternalTokenState));
self->language->external_scanner.serialize(
unsigned length = self->language->external_scanner.serialize(
self->external_scanner_payload,
result->external_token_state
self->lexer.debug_buffer
);
ts_external_token_state_init(&result->external_token_state, self->lexer.debug_buffer, length);
ts_lexer_set_last_external_token(&self->lexer, result);
}
}
@ -876,8 +877,8 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) {
LOG("new_parse");
}
if (self->language->external_scanner.reset) {
self->language->external_scanner.reset(self->external_scanner_payload);
if (self->language->external_scanner.deserialize) {
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
}
ts_lexer_set_input(&self->lexer, input);

View file

@ -612,13 +612,10 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) {
i, head->node, i, head->push_count, head->depth);
if (head->last_external_token) {
const TSExternalTokenState *s = &head->last_external_token->external_token_state;
fprintf(f,
"\nexternal_token_state: "
"%2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X",
(*s)[0], (*s)[1], (*s)[2], (*s)[3], (*s)[4], (*s)[5], (*s)[6], (*s)[7],
(*s)[8], (*s)[9], (*s)[10], (*s)[11], (*s)[12], (*s)[13], (*s)[14], (*s)[15]
);
TSExternalTokenState *state = &head->last_external_token->external_token_state;
const char *data = ts_external_token_state_data(state);
fprintf(f, "\nexternal_token_state:");
for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]);
}
fprintf(f, "\"]\n");

View file

@ -12,6 +12,36 @@
TSStateId TS_TREE_STATE_NONE = USHRT_MAX;
// Stores a copy of the first `length` bytes of `content` in `self`.
//
// A state that fits in `short_data` is kept inline in the struct; a longer one
// is copied into a buffer obtained from `ts_malloc`, which
// `ts_external_token_state_delete` releases. `content` may be NULL when
// `length` is zero (the empty state).
void ts_external_token_state_init(TSExternalTokenState *self, const char *content, unsigned length) {
  self->length = length;
  if (length > sizeof(self->short_data)) {
    self->long_data = ts_malloc(length);
    memcpy(self->long_data, content, length);
  } else if (length > 0) {
    // The empty state is created with a NULL `content`; memcpy requires valid
    // pointers even for a zero-byte copy, so skip the call in that case.
    memcpy(self->short_data, content, length);
  }
}
// Releases the storage owned by `self`. Only states longer than the inline
// `short_data` buffer own a separate allocation; shorter ones need no cleanup.
void ts_external_token_state_delete(TSExternalTokenState *self) {
  if (self->length <= sizeof(self->short_data)) return;
  ts_free(self->long_data);
}
// Returns a pointer to the state's bytes: the separately allocated buffer
// when the state is longer than `short_data`, the inline buffer otherwise.
const char *ts_external_token_state_data(const TSExternalTokenState *self) {
  return self->length > sizeof(self->short_data)
    ? self->long_data
    : self->short_data;
}
// Reports whether two states hold the same bytes. A state always equals
// itself; otherwise the lengths must match and the contents must compare
// equal byte for byte.
bool ts_external_token_state_eq(const TSExternalTokenState *a, const TSExternalTokenState *b) {
  if (a == b) return true;
  if (a->length != b->length) return false;

  const char *lhs = ts_external_token_state_data(a);
  const char *rhs = ts_external_token_state_data(b);
  return memcmp(lhs, rhs, a->length) == 0;
}
Tree *ts_tree_make_leaf(TSSymbol sym, Length padding, Length size,
TSSymbolMetadata metadata) {
Tree *result = ts_malloc(sizeof(Tree));
@ -258,9 +288,10 @@ recur:
Tree *last_child = self->children[self->child_count - 1];
ts_free(self->children);
ts_free(self);
self = last_child;
goto recur;
} else if (self->has_external_tokens) {
ts_external_token_state_delete(&self->external_token_state);
}
ts_free(self);
@ -553,19 +584,12 @@ void ts_tree_print_dot_graph(const Tree *self, const TSLanguage *language,
fprintf(f, "}\n");
}
// The state attributed to a tree that carries no external token state:
// zero bytes long, stored inline.
TSExternalTokenState empty_state = {.length = 0, .short_data = {0}};

// Reports whether two trees carry equal external scanner states.
//
// A NULL tree, or a tree without `has_external_tokens` set, is treated as
// holding the empty state. The comparison is by length and content (via
// `ts_external_token_state_eq`), not by the raw bytes of the struct, because a
// long state stores a pointer rather than its data inline.
bool ts_tree_external_token_state_eq(const Tree *self, const Tree *other) {
  const TSExternalTokenState *state1 = &empty_state;
  const TSExternalTokenState *state2 = &empty_state;
  if (self && self->has_external_tokens) state1 = &self->external_token_state;
  if (other && other->has_external_tokens) state2 = &other->external_token_state;
  return ts_external_token_state_eq(state1, state2);
}

View file

@ -14,6 +14,17 @@ extern "C" {
extern TSStateId TS_TREE_STATE_NONE;
// Serialized external scanner state, as stored on a tree.
//
// `length` is the number of bytes of state. When it is at most
// `sizeof(short_data)` the bytes live inline in `short_data`; otherwise they
// live in a separately allocated buffer pointed to by `long_data`. Read the
// bytes through `ts_external_token_state_data`, which selects the active
// union member from `length`.
typedef struct {
union {
// Active when `length > sizeof(short_data)`.
char *long_data;
// Active otherwise; shares storage with `long_data`.
char short_data[sizeof(char *) + sizeof(unsigned)];
};
unsigned length;
} TSExternalTokenState;
// Initializes the state with a copy of the given bytes (buffer, byte count).
void ts_external_token_state_init(TSExternalTokenState *, const char *, unsigned);
// Returns a pointer to the state's bytes, wherever they are stored.
const char *ts_external_token_state_data(const TSExternalTokenState *);
typedef struct Tree {
struct {
struct Tree *parent;
@ -25,10 +36,10 @@ typedef struct Tree {
uint32_t child_count;
union {
struct {
struct Tree **children;
uint32_t visible_child_count;
uint32_t named_child_count;
unsigned short rename_sequence_id;
struct Tree **children;
};
TSExternalTokenState external_token_state;
int32_t lookahead_char;

View file

@ -2,6 +2,7 @@
#define RUNTIME_TREE_PATH_H_
#include "runtime/tree.h"
#include "runtime/language.h"
#include "runtime/error_costs.h"
typedef Array(TSRange) RangeArray;
@ -122,21 +123,79 @@ Length tree_path_end_position(TreePath *self) {
return length_add(length_add(entry.position, entry.tree->padding), entry.tree->size);
}
static bool tree_must_eq(Tree *old_tree, Tree *new_tree) {
return old_tree == new_tree || (
!old_tree->has_changes &&
old_tree->symbol == new_tree->symbol &&
old_tree->symbol != ts_builtin_sym_error &&
old_tree->size.bytes == new_tree->size.bytes &&
old_tree->parse_state != TS_TREE_STATE_NONE &&
new_tree->parse_state != TS_TREE_STATE_NONE &&
(old_tree->parse_state == ERROR_STATE) ==
(new_tree->parse_state == ERROR_STATE)
);
// Result of comparing the visible trees at the ends of an old and a new tree
// path (see `tree_path_compare`).
typedef enum {
// The rename symbols or the tree symbols differ; `tree_path_get_changes`
// marks the range as changed.
TreePathNotEq,
// Same rename symbol and same tree symbol, but equality is not established;
// `tree_path_get_changes` descends into the children to decide.
TreePathCanEq,
// Same rename symbol, start byte, symbol and byte size, the old tree has no
// changes, and the parse states are compatible; `tree_path_get_changes`
// advances past the old tree without marking a change.
TreePathMustEq,
} TreePathComparison;
// Locates the deepest visible tree on `path`, scanning from the last entry
// toward the first.
//
// On return, `*start` holds that entry's position and `*rename_symbol` holds
// the symbol that the parent entry's rename sequence assigns to this entry's
// child index (0 when the entry is the first on the path or the parent has no
// rename sequence). If no entry is visible, the first entry's tree is returned
// with `*start` zero and `*rename_symbol` 0. An empty path yields NULL.
static Tree *tree_path_compare_visible_entry(const TreePath *path,
                                             const TSLanguage *language,
                                             Length *start,
                                             TSSymbol *rename_symbol) {
  Tree *tree = NULL;
  *start = length_zero();
  *rename_symbol = 0;

  // `i + 1 > 0` ends the unsigned countdown once index 0 has been visited.
  for (uint32_t i = path->size - 1; i + 1 > 0; i--) {
    tree = path->contents[i].tree;
    if (!tree->visible) continue;

    *start = path->contents[i].position;
    if (i > 0) {
      const TSSymbol *rename_sequence = ts_language_rename_sequence(
        language,
        path->contents[i - 1].tree->rename_sequence_id
      );
      if (rename_sequence) {
        *rename_symbol = rename_sequence[path->contents[i].child_index];
      }
    }
    break;
  }

  return tree;
}

// Compares the deepest visible trees of an old and a new tree path.
//
// Returns:
//   TreePathMustEq - the rename symbols match, both trees start at the same
//                    byte, and the old tree is unchanged with the same symbol
//                    and byte size and a compatible, known parse state.
//   TreePathCanEq  - the rename symbols and tree symbols match, but the
//                    stronger conditions above do not all hold.
//   TreePathNotEq  - anything else, including an empty path.
//
// Both paths are inspected with the same helper so that each tree's start
// position and rename symbol are always read from its own path.
TreePathComparison tree_path_compare(const TreePath *old_path,
                                     const TreePath *new_path,
                                     const TSLanguage *language) {
  Length old_start, new_start;
  TSSymbol old_rename_symbol, new_rename_symbol;
  Tree *old_tree = tree_path_compare_visible_entry(
    old_path, language, &old_start, &old_rename_symbol
  );
  Tree *new_tree = tree_path_compare_visible_entry(
    new_path, language, &new_start, &new_rename_symbol
  );

  // An empty path has no tree to compare against.
  if (!old_tree || !new_tree) return TreePathNotEq;

  if (old_rename_symbol != new_rename_symbol) return TreePathNotEq;

  if (old_start.bytes == new_start.bytes &&
      !old_tree->has_changes &&
      old_tree->symbol == new_tree->symbol &&
      old_tree->symbol != ts_builtin_sym_error &&
      old_tree->size.bytes == new_tree->size.bytes &&
      old_tree->parse_state != TS_TREE_STATE_NONE &&
      new_tree->parse_state != TS_TREE_STATE_NONE &&
      (old_tree->parse_state == ERROR_STATE) == (new_tree->parse_state == ERROR_STATE)) {
    return TreePathMustEq;
  }

  if (old_tree->symbol == new_tree->symbol) return TreePathCanEq;

  return TreePathNotEq;
}
static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
TSRange **ranges, uint32_t *range_count) {
TSRange **ranges, uint32_t *range_count,
const TSLanguage *language) {
Length position = length_zero();
RangeArray results = array_new();
@ -144,8 +203,6 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
bool is_changed = false;
Length next_position = position;
Tree *old_tree = tree_path_visible_tree(old_path);
Tree *new_tree = tree_path_visible_tree(new_path);
Length old_start = tree_path_start_position(old_path);
Length new_start = tree_path_start_position(new_path);
Length old_end = tree_path_end_position(old_path);
@ -167,25 +224,33 @@ static void tree_path_get_changes(TreePath *old_path, TreePath *new_path,
} else if (position.bytes < new_start.bytes) {
is_changed = true;
next_position = new_start;
} else if (old_start.bytes == new_start.bytes && tree_must_eq(old_tree, new_tree)) {
next_position = old_end;
} else if (old_tree->symbol == new_tree->symbol) {
if (tree_path_descend(old_path, position)) {
if (!tree_path_descend(new_path, position)) {
tree_path_ascend(old_path, 1);
is_changed = true;
next_position = new_end;
}
} else if (tree_path_descend(new_path, position)) {
tree_path_ascend(new_path, 1);
is_changed = true;
next_position = old_end;
} else {
next_position = length_min(old_end, new_end);
}
} else {
is_changed = true;
next_position = length_min(old_end, new_end);
switch (tree_path_compare(old_path, new_path, language)) {
case TreePathMustEq:
next_position = old_end;
break;
case TreePathCanEq:
if (tree_path_descend(old_path, position)) {
if (!tree_path_descend(new_path, position)) {
tree_path_ascend(old_path, 1);
is_changed = true;
next_position = new_end;
}
} else if (tree_path_descend(new_path, position)) {
tree_path_ascend(new_path, 1);
is_changed = true;
next_position = old_end;
} else {
next_position = length_min(old_end, new_end);
}
break;
case TreePathNotEq:
is_changed = true;
next_position = length_min(old_end, new_end);
break;
}
}
bool at_old_end = old_end.bytes <= next_position.bytes;

View file

@ -17,6 +17,7 @@ vector<string> language_names({
"cpp",
"javascript",
"python",
"bash",
});
size_t mean(const vector<size_t> &values) {

View file

@ -1,14 +1,27 @@
#include "../external_and_internal_tokens/scanner.c"
void *tree_sitter_external_and_internal_anonymous_tokens_external_scanner_create() { return NULL; }
void *tree_sitter_external_and_internal_anonymous_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_destroy(void *payload) {}
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_destroy(
void *payload
) {}
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_reset(void *payload) {}
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_reset(
void *payload
) {}
bool tree_sitter_external_and_internal_anonymous_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
unsigned tree_sitter_external_and_internal_anonymous_tokens_external_scanner_serialize(
void *payload,
char *buffer
) { return 0; }
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
void tree_sitter_external_and_internal_anonymous_tokens_external_scanner_deserialize(
void *payload,
const char *buffer,
unsigned length
) {}
bool tree_sitter_external_and_internal_anonymous_tokens_external_scanner_scan(
void *payload,
@ -20,4 +33,4 @@ bool tree_sitter_external_and_internal_anonymous_tokens_external_scanner_scan(
lexer,
whitelist
);
}
}

View file

@ -5,19 +5,34 @@ enum {
LINE_BREAK
};
void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; }
void *tree_sitter_external_and_internal_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {}
void tree_sitter_external_and_internal_tokens_external_scanner_destroy(
void *payload
) {}
void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {}
void tree_sitter_external_and_internal_tokens_external_scanner_reset(
void *payload
) {}
bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
unsigned tree_sitter_external_and_internal_tokens_external_scanner_serialize(
void *payload,
char *buffer
) { return 0; }
void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(
void *payload,
const char *buffer,
unsigned length
) {}
bool tree_sitter_external_and_internal_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
void *payload,
TSLexer *lexer,
const bool *whitelist
) {
// If a line-break is a valid lookahead token, only skip spaces.
if (whitelist[LINE_BREAK]) {
while (lexer->lookahead == ' ') {

View file

@ -4,18 +4,32 @@ enum {
COMMENT,
};
void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; }
void *tree_sitter_external_extra_tokens_external_scanner_create() {
return NULL;
}
void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {}
void tree_sitter_external_extra_tokens_external_scanner_destroy(
void *payload) {}
void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {}
void tree_sitter_external_extra_tokens_external_scanner_reset(
void *payload) {}
bool tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
unsigned tree_sitter_external_extra_tokens_external_scanner_serialize(
void *payload,
char *buffer
) { return 0; }
void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
void tree_sitter_external_extra_tokens_external_scanner_deserialize(
void *payload,
const char *buffer,
unsigned length
) {}
bool tree_sitter_external_extra_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
void *payload,
TSLexer *lexer,
const bool *valid_symbols
) {
while (lexer->lookahead == ' ') {
lexer->advance(lexer, true);
@ -33,4 +47,3 @@ bool tree_sitter_external_extra_tokens_external_scanner_scan(
return false;
}

View file

@ -28,9 +28,16 @@ void tree_sitter_external_tokens_external_scanner_destroy(void *payload) {
void tree_sitter_external_tokens_external_scanner_reset(void *payload) {}
bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; }
// Serializes the scanner's state into `buffer` and returns the number of
// bytes written. This fixture writes nothing, so it reports a length of 0;
// a nonzero length would make the runtime copy bytes that were never written.
unsigned tree_sitter_external_tokens_external_scanner_serialize(
  void *payload,
  char *buffer
) { return 0; }
void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {}
// Restores scanner state from `length` bytes at `buffer`. This fixture
// ignores the serialized bytes entirely, so the call has no effect.
void tree_sitter_external_tokens_external_scanner_deserialize(
  void *payload, const char *buffer, unsigned length
) {
  (void)payload;
  (void)buffer;
  (void)length;
}
bool tree_sitter_external_tokens_external_scanner_scan(
void *payload, TSLexer *lexer, const bool *whitelist) {
@ -111,4 +118,3 @@ bool tree_sitter_external_tokens_external_scanner_scan(
return false;
}

View file

@ -38,6 +38,7 @@ vector<string> test_languages({
"c",
"cpp",
"python",
"bash",
});
for (auto &language_name : test_languages) {

View file

@ -527,8 +527,8 @@ describe("Stack", [&]() {
before_each([&]() {
trees[1]->has_external_tokens = true;
trees[2]->has_external_tokens = true;
memset(&trees[1]->external_token_state, 0, sizeof(TSExternalTokenState));
memset(&trees[2]->external_token_state, 0, sizeof(TSExternalTokenState));
ts_external_token_state_init(&trees[1]->external_token_state, NULL, 0);
ts_external_token_state_init(&trees[2]->external_token_state, NULL, 0);
});
it("allows the state to be retrieved", [&]() {
@ -545,8 +545,8 @@ describe("Stack", [&]() {
});
it("does not merge stack versions with different external token states", [&]() {
trees[1]->external_token_state[5] = 'a';
trees[2]->external_token_state[5] = 'b';
ts_external_token_state_init(&trees[1]->external_token_state, "abcd", 2);
ts_external_token_state_init(&trees[2]->external_token_state, "ABCD", 2);
ts_stack_copy_version(stack, 0);
ts_stack_push(stack, 0, trees[0], false, 5);
@ -559,8 +559,8 @@ describe("Stack", [&]() {
});
it("merges stack versions with identical external token states", [&]() {
trees[1]->external_token_state[5] = 'a';
trees[2]->external_token_state[5] = 'a';
ts_external_token_state_init(&trees[1]->external_token_state, "abcd", 2);
ts_external_token_state_init(&trees[2]->external_token_state, "abcd", 2);
ts_stack_copy_version(stack, 0);
ts_stack_push(stack, 0, trees[0], false, 5);