From aaba7cd2f9cb8683d36fd2b5bf82f07643af7cd4 Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Mon, 14 Aug 2023 18:22:28 -0400 Subject: [PATCH] feat: implement a cache for `get_column` --- lib/src/lexer.c | 98 +++++++++++++++++++++++++++++++++++++----------- lib/src/lexer.h | 6 +++ lib/src/parser.c | 2 + 3 files changed, 84 insertions(+), 22 deletions(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index 76cdc7f3..21448a2e 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -37,6 +37,35 @@ static const TSRange DEFAULT_RANGE = { .end_byte = UINT32_MAX }; +/** + * Sets the column data to the given value and marks it valid. + * @param self The lexer state. + * @param val The new value of the column data. + */ +static void ts_lexer__set_column_data(Lexer *self, uint32_t val) { + self->column_data.valid = true; + self->column_data.value = val; +} + +/** + * Increments the value of the column data; no-op if invalid. + * @param self The lexer state. + */ +static void ts_lexer__increment_column_data(Lexer *self) { + if (self->column_data.valid) { + self->column_data.value++; + } +} + +/** + * Marks the column data as invalid. + * @param self The lexer state. + */ +static void ts_lexer__invalidate_column_data(Lexer *self) { + self->column_data.valid = false; + self->column_data.value = 0; +} + // Check if the lexer has reached EOF. This state is stored // by setting the lexer's `current_included_range_index` such that // it has consumed all of its available ranges. @@ -104,6 +133,10 @@ static void ts_lexer__get_lookahead(Lexer *self) { } static void ts_lexer_goto(Lexer *self, Length position) { + if (position.bytes != self->current_position.bytes) { + ts_lexer__invalidate_column_data(self); + } + self->current_position = position; // Move to the first valid position at or after the given position. @@ -156,16 +189,24 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } -// Intended to be called only from functions that control logging. +/** + * Actually advances the lexer. Does not log anything. + * @param self The lexer state. + * @param skip Whether to mark the consumed codepoint as whitespace. + */ static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { - self->current_position.bytes += self->lookahead_size; if (self->data.lookahead == '\n') { self->current_position.extent.row++; self->current_position.extent.column = 0; + ts_lexer__set_column_data(self, 0); } else { + bool is_bom = self->current_position.bytes == 0 && + self->data.lookahead == BYTE_ORDER_MARK; + if (!is_bom) ts_lexer__increment_column_data(self); self->current_position.extent.column += self->lookahead_size; } + self->current_position.bytes += self->lookahead_size; } const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; @@ -249,27 +290,33 @@ static void ts_lexer__mark_end(TSLexer *_self) { static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; - uint32_t goal_byte = self->current_position.bytes; - self->did_get_column = true; - Length start_of_col = { - self->current_position.bytes - self->current_position.extent.column, - {self->current_position.extent.row, 0}, - }; - ts_lexer_goto(self, start_of_col); - ts_lexer__get_chunk(self); - uint32_t result = 0; - if (!ts_lexer__eof(_self)) { - ts_lexer__get_lookahead(self); - while (self->current_position.bytes < goal_byte && self->chunk) { - result++; - ts_lexer__do_advance(self, false); - if (ts_lexer__eof(_self)) break; + if (!self->column_data.valid) { + // Record current position + uint32_t goal_byte = self->current_position.bytes; + + // Back up to the beginning of the line + Length start_of_col = { + self->current_position.bytes - self->current_position.extent.column, + {self->current_position.extent.row, 0}, + }; + ts_lexer_goto(self, start_of_col); + ts_lexer__set_column_data(self, 0); + ts_lexer__get_chunk(self); + + if (!ts_lexer__eof(_self)) { + ts_lexer__get_lookahead(self); + + // Advance to the recorded position + while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) { + ts_lexer__do_advance(self, false); + if (ts_lexer__eof(_self)) break; + } } } - return result; + return self->column_data.value; } // Is the lexer at a boundary between two disjoint included ranges of @@ -322,6 +369,11 @@ void ts_lexer_init(Lexer *self) { .included_ranges = NULL, .included_range_count = 0, .current_included_range_index = 0, + .did_get_column = false, + .column_data = { + .valid = false, + .value = 0 + } }; ts_lexer_set_included_ranges(self, NULL, 0); } @@ -352,10 +404,12 @@ void ts_lexer_start(Lexer *self) { if (!ts_lexer__eof(&self->data)) { if (!self->chunk_size) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); - if ( - self->current_position.bytes == 0 && - self->data.lookahead == BYTE_ORDER_MARK - ) ts_lexer__advance(&self->data, true); + if (self->current_position.bytes == 0) { + if (self->data.lookahead == BYTE_ORDER_MARK) { + ts_lexer__advance(&self->data, true); + } + ts_lexer__set_column_data(self, 0); + } } } diff --git a/lib/src/lexer.h b/lib/src/lexer.h index fb6e6260..6ad663fa 100644 --- a/lib/src/lexer.h +++ b/lib/src/lexer.h @@ -10,6 +10,11 @@ extern "C" { #include "tree_sitter/api.h" #include "./parser.h" +typedef struct { + uint32_t value; + bool valid; +} ColumnData; + typedef struct { TSLexer data; Length current_position; @@ -27,6 +32,7 @@ typedef struct { uint32_t chunk_size; uint32_t lookahead_size; bool did_get_column; + ColumnData column_data; char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; } Lexer; diff --git a/lib/src/parser.c b/lib/src/parser.c index ce2db366..b82be47e 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -529,6 +529,7 @@ static Subtree ts_parser__lex( for (;;) { bool found_token = false; Length current_position = self->lexer.current_position; + ColumnData column_data = self->lexer.column_data; if (lex_mode.external_lex_state != 0) { LOG( @@ -582,6 +583,7 @@ static Subtree ts_parser__lex( } ts_lexer_reset(&self->lexer, current_position); + self->lexer.column_data = column_data; } LOG(