From 9a04231ab14ad1ca55bfd8b31f15d5d35c35aece Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 17 Jul 2017 17:12:36 -0700 Subject: [PATCH] Remove length restriction in external scanner serialization API --- include/tree_sitter/parser.h | 7 ++--- script/fetch-fixtures | 1 + src/compiler/generate_code/c_code.cc | 6 ++-- src/runtime/document.c | 3 +- src/runtime/lexer.c | 6 ++-- src/runtime/lexer.h | 4 +-- src/runtime/parser.c | 35 +++++++++++----------- src/runtime/stack.c | 11 +++---- src/runtime/tree.c | 44 +++++++++++++++++++++------- src/runtime/tree.h | 13 +++++++- test/integration/real_grammars.cc | 1 + test/runtime/stack_test.cc | 10 +++---- 12 files changed, 85 insertions(+), 56 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index b1682101..18df7722 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -11,10 +11,10 @@ extern "C" { typedef uint16_t TSSymbol; typedef uint16_t TSStateId; -typedef uint8_t TSExternalTokenState[16]; #define ts_builtin_sym_error ((TSSymbol)-1) #define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 typedef struct { bool visible : 1; @@ -86,10 +86,9 @@ typedef struct TSLanguage { const TSSymbol *symbol_map; void *(*create)(); void (*destroy)(void *); - void (*reset)(void *); bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); - bool (*serialize)(void *, TSExternalTokenState); - void (*deserialize)(void *, const TSExternalTokenState); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); } external_scanner; } TSLanguage; diff --git a/script/fetch-fixtures b/script/fetch-fixtures index aa0465c7..2575e2b6 100755 --- a/script/fetch-fixtures +++ b/script/fetch-fixtures @@ -29,3 +29,4 @@ fetch_grammar 'python' 'origin/master' fetch_grammar 'go' 'origin/master' fetch_grammar 'ruby' 'origin/master' fetch_grammar 'typescript' 'origin/master' +fetch_grammar 'bash' 'origin/master' diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 379d467e..4b1b30f7 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -418,10 +418,9 @@ class CCodeGenerator { if (!syntax_grammar.external_tokens.empty()) { line("void *" + external_scanner_name + "_create();"); line("void " + external_scanner_name + "_destroy();"); - line("void " + external_scanner_name + "_reset(void *);"); line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);"); - line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);"); - line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);"); + line("unsigned " + external_scanner_name + "_serialize(void *, char *);"); + line("void " + external_scanner_name + "_deserialize(void *, const char *, unsigned);"); line(); } @@ -436,7 +435,6 @@ class CCodeGenerator { line("ts_external_scanner_symbol_map,"); line(external_scanner_name + "_create,"); line(external_scanner_name + "_destroy,"); - line(external_scanner_name + "_reset,"); line(external_scanner_name + "_scan,"); line(external_scanner_name + "_serialize,"); line(external_scanner_name + "_deserialize,"); diff --git a/src/runtime/document.c b/src/runtime/document.c index bae6ca6f..1311cb49 100644 --- a/src/runtime/document.c +++ b/src/runtime/document.c @@ -143,7 +143,8 @@ void ts_document_parse_with_options(TSDocument *self, TSParseOptions options) { tree_path_init(&self->parser.tree_path1, old_tree); tree_path_init(&self->parser.tree_path2, tree); tree_path_get_changes(&self->parser.tree_path1, &self->parser.tree_path2, - options.changed_ranges, options.changed_range_count); + options.changed_ranges, options.changed_range_count, + self->parser.language); } ts_tree_release(old_tree); diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 3e9f9f70..5d76f3b2 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -5,9 +5,9 @@ #include "runtime/utf16.h" #include "utf8proc.h" -#define LOG(...) \ - if (self->logger.log) { \ - snprintf(self->debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \ +#define LOG(...) \ + if (self->logger.log) { \ + snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \ } diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 5ae977ad..cf0a1981 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -10,8 +10,6 @@ extern "C" { #include "runtime/length.h" #include "runtime/tree.h" -#define TS_DEBUG_BUFFER_SIZE 512 - typedef struct { TSLexer data; Length current_position; @@ -25,7 +23,7 @@ typedef struct { TSInput input; TSLogger logger; - char debug_buffer[TS_DEBUG_BUFFER_SIZE]; + char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; Tree *last_external_token; } Lexer; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 74dfe609..68d667b5 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -13,16 +13,16 @@ #include "runtime/reduce_action.h" #include "runtime/error_costs.h" -#define LOG(...) \ - if (self->lexer.logger.log) { \ - snprintf(self->lexer.debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \ - self->lexer.logger.log(self->lexer.logger.payload, TSLogTypeParse, \ - self->lexer.debug_buffer); \ - } \ - if (self->print_debugging_graphs) { \ - fprintf(stderr, "graph {\nlabel=\""); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\"\n}\n\n"); \ +#define LOG(...) \ + if (self->lexer.logger.log) { \ + snprintf(self->lexer.debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ + self->lexer.logger.log(self->lexer.logger.payload, TSLogTypeParse, \ + self->lexer.debug_buffer); \ + } \ + if (self->print_debugging_graphs) { \ + fprintf(stderr, "graph {\nlabel=\""); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\"\n}\n\n"); \ } #define LOG_STACK() \ @@ -233,10 +233,11 @@ static void parser__restore_external_scanner(Parser *self, Tree *external_token) if (external_token) { self->language->external_scanner.deserialize( self->external_scanner_payload, - external_token->external_token_state + ts_external_token_state_data(&external_token->external_token_state), + external_token->external_token_state.length ); } else { - self->language->external_scanner.reset(self->external_scanner_payload); + self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); } } @@ -351,11 +352,11 @@ static Tree *parser__lex(Parser *self, StackVersion version) { if (found_external_token) { result->has_external_tokens = true; - memset(result->external_token_state, 0, sizeof(TSExternalTokenState)); - self->language->external_scanner.serialize( + unsigned length = self->language->external_scanner.serialize( self->external_scanner_payload, - result->external_token_state + self->lexer.debug_buffer ); + ts_external_token_state_init(&result->external_token_state, self->lexer.debug_buffer, length); ts_lexer_set_last_external_token(&self->lexer, result); } } @@ -876,8 +877,8 @@ static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { LOG("new_parse"); } - if (self->language->external_scanner.reset) { - self->language->external_scanner.reset(self->external_scanner_payload); + if (self->language->external_scanner.deserialize) { + self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); } ts_lexer_set_input(&self->lexer, input); diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 92e90661..e353153a 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -612,13 +612,10 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { i, head->node, i, head->push_count, head->depth); if (head->last_external_token) { - const TSExternalTokenState *s = &head->last_external_token->external_token_state; - fprintf(f, - "\nexternal_token_state: " - "%2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X %2X", - (*s)[0], (*s)[1], (*s)[2], (*s)[3], (*s)[4], (*s)[5], (*s)[6], (*s)[7], - (*s)[8], (*s)[9], (*s)[10], (*s)[11], (*s)[12], (*s)[13], (*s)[14], (*s)[15] - ); + TSExternalTokenState *state = &head->last_external_token->external_token_state; + const char *data = ts_external_token_state_data(state); + fprintf(f, "\nexternal_token_state:"); + for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]); } fprintf(f, "\"]\n"); diff --git a/src/runtime/tree.c b/src/runtime/tree.c index f3163c26..ddbab2cd 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -12,6 +12,36 @@ TSStateId TS_TREE_STATE_NONE = USHRT_MAX; +void ts_external_token_state_init(TSExternalTokenState *self, const char *content, unsigned length) { + self->length = length; + if (length > sizeof(self->short_data)) { + self->long_data = ts_malloc(length); + memcpy(self->long_data, content, length); + } else { + memcpy(self->short_data, content, length); + } +} + +void ts_external_token_state_delete(TSExternalTokenState *self) { + if (self->length > sizeof(self->short_data)) { + ts_free(self->long_data); + } +} + +const char *ts_external_token_state_data(const TSExternalTokenState *self) { + if (self->length > sizeof(self->short_data)) { + return self->long_data; + } else { + return self->short_data; + } +} + +bool ts_external_token_state_eq(const TSExternalTokenState *a, const TSExternalTokenState *b) { + return a == b || + (a->length == b->length && + memcmp(ts_external_token_state_data(a), ts_external_token_state_data(b), a->length) == 0); +} + Tree *ts_tree_make_leaf(TSSymbol sym, Length padding, Length size, TSSymbolMetadata metadata) { Tree *result = ts_malloc(sizeof(Tree)); @@ -258,9 +288,10 @@ recur: Tree *last_child = self->children[self->child_count - 1]; ts_free(self->children); ts_free(self); - self = last_child; goto recur; + } else if (self->has_external_tokens) { + ts_external_token_state_delete(&self->external_token_state); } ts_free(self); @@ -553,19 +584,12 @@ void ts_tree_print_dot_graph(const Tree *self, const TSLanguage *language, fprintf(f, "}\n"); } -TSExternalTokenState empty_state = { - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, -}; +TSExternalTokenState empty_state = {.length = 0, .short_data = {}}; bool ts_tree_external_token_state_eq(const Tree *self, const Tree *other) { const TSExternalTokenState *state1 = &empty_state; const TSExternalTokenState *state2 = &empty_state; if (self && self->has_external_tokens) state1 = &self->external_token_state; if (other && other->has_external_tokens) state2 = &other->external_token_state; - return - state1 == state2 || - memcmp(state1, state2, sizeof(TSExternalTokenState)) == 0; + return ts_external_token_state_eq(state1, state2); } diff --git a/src/runtime/tree.h b/src/runtime/tree.h index 3b58d86b..63ff9e9f 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -14,6 +14,17 @@ extern "C" { extern TSStateId TS_TREE_STATE_NONE; +typedef struct { + union { + char *long_data; + char short_data[sizeof(char *) + sizeof(unsigned)]; + }; + unsigned length; +} TSExternalTokenState; + +void ts_external_token_state_init(TSExternalTokenState *, const char *, unsigned); +const char *ts_external_token_state_data(const TSExternalTokenState *); + typedef struct Tree { struct { struct Tree *parent; @@ -25,10 +36,10 @@ typedef struct Tree { uint32_t child_count; union { struct { + struct Tree **children; uint32_t visible_child_count; uint32_t named_child_count; unsigned short rename_sequence_id; - struct Tree **children; }; TSExternalTokenState external_token_state; int32_t lookahead_char; diff --git a/test/integration/real_grammars.cc b/test/integration/real_grammars.cc index dc8f82dd..3647795c 100644 --- a/test/integration/real_grammars.cc +++ b/test/integration/real_grammars.cc @@ -38,6 +38,7 @@ vector test_languages({ "c", "cpp", "python", + "bash", }); for (auto &language_name : test_languages) { diff --git a/test/runtime/stack_test.cc b/test/runtime/stack_test.cc index 8d1d47f5..49986314 100644 --- a/test/runtime/stack_test.cc +++ b/test/runtime/stack_test.cc @@ -527,8 +527,8 @@ describe("Stack", [&]() { before_each([&]() { trees[1]->has_external_tokens = true; trees[2]->has_external_tokens = true; - memset(&trees[1]->external_token_state, 0, sizeof(TSExternalTokenState)); - memset(&trees[2]->external_token_state, 0, sizeof(TSExternalTokenState)); + ts_external_token_state_init(&trees[1]->external_token_state, NULL, 0); + ts_external_token_state_init(&trees[2]->external_token_state, NULL, 0); }); it("allows the state to be retrieved", [&]() { @@ -545,8 +545,7 @@ describe("Stack", [&]() { }); it("does not merge stack versions with different external token states", [&]() { - trees[1]->external_token_state[5] = 'a'; - trees[2]->external_token_state[5] = 'b'; + ts_external_token_state_init(&trees[1]->external_token_state, "ab", 2); ts_stack_copy_version(stack, 0); ts_stack_push(stack, 0, trees[0], false, 5); @@ -559,8 +558,7 @@ describe("Stack", [&]() { }); it("merges stack versions with identical external token states", [&]() { - trees[1]->external_token_state[5] = 'a'; - trees[2]->external_token_state[5] = 'a'; + ts_external_token_state_init(&trees[1]->external_token_state, "aa", 2); ts_stack_copy_version(stack, 0); ts_stack_push(stack, 0, trees[0], false, 5);