feat: implement a cache for get_column

This commit is contained in:
Amaan Qureshi 2023-08-14 18:22:28 -04:00
parent a767f89967
commit aaba7cd2f9
3 changed files with 84 additions and 22 deletions

View file

@ -37,6 +37,35 @@ static const TSRange DEFAULT_RANGE = {
.end_byte = UINT32_MAX .end_byte = UINT32_MAX
}; };
/**
* Sets the column data to the given value and marks it valid.
* @param self The lexer state.
* @param val The new value of the column data.
*/
static void ts_lexer__set_column_data(Lexer *self, uint32_t val) {
self->column_data.valid = true;
self->column_data.value = val;
}
/**
* Increments the value of the column data; no-op if invalid.
* @param self The lexer state.
*/
static void ts_lexer__increment_column_data(Lexer *self) {
if (self->column_data.valid) {
self->column_data.value++;
}
}
/**
* Marks the column data as invalid.
* @param self The lexer state.
*/
static void ts_lexer__invalidate_column_data(Lexer *self) {
self->column_data.valid = false;
self->column_data.value = 0;
}
// Check if the lexer has reached EOF. This state is stored // Check if the lexer has reached EOF. This state is stored
// by setting the lexer's `current_included_range_index` such that // by setting the lexer's `current_included_range_index` such that
// it has consumed all of its available ranges. // it has consumed all of its available ranges.
@ -104,6 +133,10 @@ static void ts_lexer__get_lookahead(Lexer *self) {
} }
static void ts_lexer_goto(Lexer *self, Length position) { static void ts_lexer_goto(Lexer *self, Length position) {
if (position.bytes != self->current_position.bytes) {
ts_lexer__invalidate_column_data(self);
}
self->current_position = position; self->current_position = position;
// Move to the first valid position at or after the given position. // Move to the first valid position at or after the given position.
@ -156,16 +189,24 @@ static void ts_lexer_goto(Lexer *self, Length position) {
} }
} }
// Intended to be called only from functions that control logging. /**
* Actually advances the lexer. Does not log anything.
* @param self The lexer state.
* @param skip Whether to mark the consumed codepoint as whitespace.
*/
static void ts_lexer__do_advance(Lexer *self, bool skip) { static void ts_lexer__do_advance(Lexer *self, bool skip) {
if (self->lookahead_size) { if (self->lookahead_size) {
self->current_position.bytes += self->lookahead_size;
if (self->data.lookahead == '\n') { if (self->data.lookahead == '\n') {
self->current_position.extent.row++; self->current_position.extent.row++;
self->current_position.extent.column = 0; self->current_position.extent.column = 0;
ts_lexer__set_column_data(self, 0);
} else { } else {
bool is_bom = self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK;
if (!is_bom) ts_lexer__increment_column_data(self);
self->current_position.extent.column += self->lookahead_size; self->current_position.extent.column += self->lookahead_size;
} }
self->current_position.bytes += self->lookahead_size;
} }
const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
@ -249,27 +290,33 @@ static void ts_lexer__mark_end(TSLexer *_self) {
static uint32_t ts_lexer__get_column(TSLexer *_self) { static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self; Lexer *self = (Lexer *)_self;
uint32_t goal_byte = self->current_position.bytes;
self->did_get_column = true; self->did_get_column = true;
Length start_of_col = {
self->current_position.bytes - self->current_position.extent.column,
{self->current_position.extent.row, 0},
};
ts_lexer_goto(self, start_of_col);
ts_lexer__get_chunk(self);
uint32_t result = 0; if (!self->column_data.valid) {
if (!ts_lexer__eof(_self)) { // Record current position
ts_lexer__get_lookahead(self); uint32_t goal_byte = self->current_position.bytes;
while (self->current_position.bytes < goal_byte && self->chunk) {
result++; // Back up to the beginning of the line
ts_lexer__do_advance(self, false); Length start_of_col = {
if (ts_lexer__eof(_self)) break; self->current_position.bytes - self->current_position.extent.column,
{self->current_position.extent.row, 0},
};
ts_lexer_goto(self, start_of_col);
ts_lexer__set_column_data(self, 0);
ts_lexer__get_chunk(self);
if (!ts_lexer__eof(_self)) {
ts_lexer__get_lookahead(self);
// Advance to the recorded position
while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
ts_lexer__do_advance(self, false);
if (ts_lexer__eof(_self)) break;
}
} }
} }
return result; return self->column_data.value;
} }
// Is the lexer at a boundary between two disjoint included ranges of // Is the lexer at a boundary between two disjoint included ranges of
@ -322,6 +369,11 @@ void ts_lexer_init(Lexer *self) {
.included_ranges = NULL, .included_ranges = NULL,
.included_range_count = 0, .included_range_count = 0,
.current_included_range_index = 0, .current_included_range_index = 0,
.did_get_column = false,
.column_data = {
.valid = false,
.value = 0
}
}; };
ts_lexer_set_included_ranges(self, NULL, 0); ts_lexer_set_included_ranges(self, NULL, 0);
} }
@ -352,10 +404,12 @@ void ts_lexer_start(Lexer *self) {
if (!ts_lexer__eof(&self->data)) { if (!ts_lexer__eof(&self->data)) {
if (!self->chunk_size) ts_lexer__get_chunk(self); if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self);
if ( if (self->current_position.bytes == 0) {
self->current_position.bytes == 0 && if (self->data.lookahead == BYTE_ORDER_MARK) {
self->data.lookahead == BYTE_ORDER_MARK ts_lexer__advance(&self->data, true);
) ts_lexer__advance(&self->data, true); }
ts_lexer__set_column_data(self, 0);
}
} }
} }

View file

@ -10,6 +10,11 @@ extern "C" {
#include "tree_sitter/api.h" #include "tree_sitter/api.h"
#include "./parser.h" #include "./parser.h"
typedef struct {
uint32_t value;
bool valid;
} ColumnData;
typedef struct { typedef struct {
TSLexer data; TSLexer data;
Length current_position; Length current_position;
@ -27,6 +32,7 @@ typedef struct {
uint32_t chunk_size; uint32_t chunk_size;
uint32_t lookahead_size; uint32_t lookahead_size;
bool did_get_column; bool did_get_column;
ColumnData column_data;
char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE];
} Lexer; } Lexer;

View file

@ -529,6 +529,7 @@ static Subtree ts_parser__lex(
for (;;) { for (;;) {
bool found_token = false; bool found_token = false;
Length current_position = self->lexer.current_position; Length current_position = self->lexer.current_position;
ColumnData column_data = self->lexer.column_data;
if (lex_mode.external_lex_state != 0) { if (lex_mode.external_lex_state != 0) {
LOG( LOG(
@ -582,6 +583,7 @@ static Subtree ts_parser__lex(
} }
ts_lexer_reset(&self->lexer, current_position); ts_lexer_reset(&self->lexer, current_position);
self->lexer.column_data = column_data;
} }
LOG( LOG(