feat: implement a cache for get_column

This commit is contained in:
Amaan Qureshi 2023-08-14 18:22:28 -04:00
parent a767f89967
commit aaba7cd2f9
3 changed files with 84 additions and 22 deletions

View file

@ -37,6 +37,35 @@ static const TSRange DEFAULT_RANGE = {
.end_byte = UINT32_MAX
};
/**
* Sets the column data to the given value and marks it valid.
* @param self The lexer state.
* @param val The new value of the column data.
*/
static void ts_lexer__set_column_data(Lexer *self, uint32_t val) {
self->column_data.valid = true;
self->column_data.value = val;
}
/**
* Increments the value of the column data; no-op if invalid.
* @param self The lexer state.
*/
static void ts_lexer__increment_column_data(Lexer *self) {
if (self->column_data.valid) {
self->column_data.value++;
}
}
/**
* Marks the column data as invalid.
* @param self The lexer state.
*/
static void ts_lexer__invalidate_column_data(Lexer *self) {
self->column_data.valid = false;
self->column_data.value = 0;
}
// Check if the lexer has reached EOF. This state is stored
// by setting the lexer's `current_included_range_index` such that
// it has consumed all of its available ranges.
@ -104,6 +133,10 @@ static void ts_lexer__get_lookahead(Lexer *self) {
}
static void ts_lexer_goto(Lexer *self, Length position) {
if (position.bytes != self->current_position.bytes) {
ts_lexer__invalidate_column_data(self);
}
self->current_position = position;
// Move to the first valid position at or after the given position.
@ -156,16 +189,24 @@ static void ts_lexer_goto(Lexer *self, Length position) {
}
}
// Intended to be called only from functions that control logging.
/**
* Actually advances the lexer. Does not log anything.
* @param self The lexer state.
* @param skip Whether to mark the consumed codepoint as whitespace.
*/
static void ts_lexer__do_advance(Lexer *self, bool skip) {
if (self->lookahead_size) {
self->current_position.bytes += self->lookahead_size;
if (self->data.lookahead == '\n') {
self->current_position.extent.row++;
self->current_position.extent.column = 0;
ts_lexer__set_column_data(self, 0);
} else {
bool is_bom = self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK;
if (!is_bom) ts_lexer__increment_column_data(self);
self->current_position.extent.column += self->lookahead_size;
}
self->current_position.bytes += self->lookahead_size;
}
const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
@ -249,27 +290,33 @@ static void ts_lexer__mark_end(TSLexer *_self) {
static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
uint32_t goal_byte = self->current_position.bytes;
self->did_get_column = true;
Length start_of_col = {
self->current_position.bytes - self->current_position.extent.column,
{self->current_position.extent.row, 0},
};
ts_lexer_goto(self, start_of_col);
ts_lexer__get_chunk(self);
uint32_t result = 0;
if (!ts_lexer__eof(_self)) {
ts_lexer__get_lookahead(self);
while (self->current_position.bytes < goal_byte && self->chunk) {
result++;
ts_lexer__do_advance(self, false);
if (ts_lexer__eof(_self)) break;
if (!self->column_data.valid) {
// Record current position
uint32_t goal_byte = self->current_position.bytes;
// Back up to the beginning of the line
Length start_of_col = {
self->current_position.bytes - self->current_position.extent.column,
{self->current_position.extent.row, 0},
};
ts_lexer_goto(self, start_of_col);
ts_lexer__set_column_data(self, 0);
ts_lexer__get_chunk(self);
if (!ts_lexer__eof(_self)) {
ts_lexer__get_lookahead(self);
// Advance to the recorded position
while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
ts_lexer__do_advance(self, false);
if (ts_lexer__eof(_self)) break;
}
}
}
return result;
return self->column_data.value;
}
// Is the lexer at a boundary between two disjoint included ranges of
@ -322,6 +369,11 @@ void ts_lexer_init(Lexer *self) {
.included_ranges = NULL,
.included_range_count = 0,
.current_included_range_index = 0,
.did_get_column = false,
.column_data = {
.valid = false,
.value = 0
}
};
ts_lexer_set_included_ranges(self, NULL, 0);
}
@ -352,10 +404,12 @@ void ts_lexer_start(Lexer *self) {
if (!ts_lexer__eof(&self->data)) {
if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
if (
self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK
) ts_lexer__advance(&self->data, true);
if (self->current_position.bytes == 0) {
if (self->data.lookahead == BYTE_ORDER_MARK) {
ts_lexer__advance(&self->data, true);
}
ts_lexer__set_column_data(self, 0);
}
}
}

View file

@ -10,6 +10,11 @@ extern "C" {
#include "tree_sitter/api.h"
#include "./parser.h"
typedef struct {
uint32_t value;
bool valid;
} ColumnData;
typedef struct {
TSLexer data;
Length current_position;
@ -27,6 +32,7 @@ typedef struct {
uint32_t chunk_size;
uint32_t lookahead_size;
bool did_get_column;
ColumnData column_data;
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
} Lexer;

View file

@ -529,6 +529,7 @@ static Subtree ts_parser__lex(
for (;;) {
bool found_token = false;
Length current_position = self->lexer.current_position;
ColumnData column_data = self->lexer.column_data;
if (lex_mode.external_lex_state != 0) {
LOG(
@ -582,6 +583,7 @@ static Subtree ts_parser__lex(
}
ts_lexer_reset(&self->lexer, current_position);
self->lexer.column_data = column_data;
}
LOG(