feat: add 'reserved word' construct

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
Max Brunsfeld 2024-12-23 00:06:32 -08:00 committed by GitHub
parent 2a63077cac
commit 201b41cf11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 2367 additions and 1628 deletions

View file

@ -29,7 +29,7 @@ uint32_t ts_language_version(const TSLanguage *self) {
}
const char *ts_language_name(const TSLanguage *self) {
return self->version >= LANGUAGE_VERSION_WITH_METADATA ? self->name : NULL;
return self->version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? self->name : NULL;
}
uint32_t ts_language_field_count(const TSLanguage *self) {
@ -56,6 +56,39 @@ void ts_language_table_entry(
}
}
// Fetch the lexer mode associated with a parse state, always in the
// three-field `TSLexerMode` form.
//
// Languages whose version is below 15 store their `lex_modes` table using the
// older two-field `TSLexMode` layout, so for those the entry is read through
// that layout and widened, with `reserved_word_set_id` set to 0. Languages at
// version 15 or above already store `TSLexerMode` entries, which are returned
// as they are.
TSLexerMode ts_language_lex_mode_for_state(
  const TSLanguage *self,
  TSStateId state
) {
  if (self->version >= 15) {
    return self->lex_modes[state];
  }

  // Older table: reinterpret the array with the element type it was built with.
  const TSLexMode *legacy_modes = (const TSLexMode *)self->lex_modes;
  TSLexMode legacy = legacy_modes[state];
  TSLexerMode widened = {
    .lex_state = legacy.lex_state,
    .external_lex_state = legacy.external_lex_state,
    .reserved_word_set_id = 0,
  };
  return widened;
}
// Report whether `symbol` belongs to the reserved-word set that applies in
// parse state `state`.
//
// The state's lexer mode names a set by `reserved_word_set_id`; an id of 0 is
// never searched and always yields `false`. Sets are laid out back to back in
// `self->reserved_words`, each occupying `self->max_reserved_word_set_size`
// slots. Within a set, a 0 entry ends the search early.
//
// Returns `true` if `symbol` is found in the set, `false` otherwise.
bool ts_language_is_reserved_word(
  const TSLanguage *self,
  TSStateId state,
  TSSymbol symbol
) {
  TSLexerMode lex_mode = ts_language_lex_mode_for_state(self, state);
  if (lex_mode.reserved_word_set_id == 0) return false;

  // Both factors are `uint16_t` and would be promoted to signed `int` before
  // multiplying; widen one to `unsigned` first so that a large product wraps
  // in unsigned arithmetic instead of overflowing a signed int.
  unsigned start = (unsigned)lex_mode.reserved_word_set_id * self->max_reserved_word_set_size;
  unsigned end = start + self->max_reserved_word_set_size;
  for (unsigned i = start; i < end; i++) {
    TSSymbol reserved = self->reserved_words[i];
    // The match is tested before the terminator, as in the original ordering.
    if (reserved == symbol) return true;
    if (reserved == 0) break;
  }
  return false;
}
TSSymbolMetadata ts_language_symbol_metadata(
const TSLanguage *self,
TSSymbol symbol

View file

@ -10,7 +10,7 @@ extern "C" {
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define LANGUAGE_VERSION_WITH_METADATA 15
#define LANGUAGE_VERSION_WITH_RESERVED_WORDS 15
#define LANGUAGE_VERSION_WITH_PRIMARY_STATES 14
typedef struct {
@ -36,9 +36,9 @@ typedef struct {
} LookaheadIterator;
void ts_language_table_entry(const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result);
TSLexerMode ts_language_lex_mode_for_state(const TSLanguage *self, TSStateId state);
bool ts_language_is_reserved_word(const TSLanguage *self, TSStateId state, TSSymbol symbol);
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *self, TSSymbol symbol);
TSSymbol ts_language_public_symbol(const TSLanguage *self, TSSymbol symbol);
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {

View file

@ -80,7 +80,7 @@
static const unsigned MAX_VERSION_COUNT = 6;
static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4;
static const unsigned MAX_SUMMARY_DEPTH = 16;
static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE;
static const unsigned MAX_COST_DIFFERENCE = 18 * ERROR_COST_PER_SKIPPED_TREE;
static const unsigned OP_COUNT_PER_PARSER_TIMEOUT_CHECK = 100;
typedef struct {
@ -342,7 +342,7 @@ static bool ts_parser__better_version_exists(
return false;
}
static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) {
static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexerMode lex_mode) {
if (ts_language_is_wasm(self->language)) {
return ts_wasm_store_call_lex_main(self->wasm_store, lex_mode.lex_state);
} else {
@ -473,10 +473,10 @@ static bool ts_parser__can_reuse_first_leaf(
Subtree tree,
TableEntry *table_entry
) {
TSLexMode current_lex_mode = self->language->lex_modes[state];
TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree);
TSStateId leaf_state = ts_subtree_leaf_parse_state(tree);
TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state];
TSLexerMode current_lex_mode = ts_language_lex_mode_for_state(self->language, state);
TSLexerMode leaf_lex_mode = ts_language_lex_mode_for_state(self->language, leaf_state);
// At the end of a non-terminal extra node, the lexer normally returns
// NULL, which indicates that the parser should look for a reduce action
@ -487,7 +487,7 @@ static bool ts_parser__can_reuse_first_leaf(
// If the token was created in a state with the same set of lookaheads, it is reusable.
if (
table_entry->action_count > 0 &&
memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexMode)) == 0 &&
memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexerMode)) == 0 &&
(
leaf_symbol != self->language->keyword_capture_token ||
(!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state)
@ -507,7 +507,7 @@ static Subtree ts_parser__lex(
StackVersion version,
TSStateId parse_state
) {
TSLexMode lex_mode = self->language->lex_modes[parse_state];
TSLexerMode lex_mode = ts_language_lex_mode_for_state(self->language, parse_state);
if (lex_mode.lex_state == (uint16_t)-1) {
LOG("no_lookahead_after_non_terminal_extra");
return NULL_SUBTREE;
@ -601,7 +601,7 @@ static Subtree ts_parser__lex(
if (!error_mode) {
error_mode = true;
lex_mode = self->language->lex_modes[ERROR_STATE];
lex_mode = ts_language_lex_mode_for_state(self->language, ERROR_STATE);
ts_lexer_reset(&self->lexer, start_position);
continue;
}
@ -658,7 +658,10 @@ static Subtree ts_parser__lex(
if (
is_keyword &&
self->lexer.token_end_position.bytes == end_byte &&
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol)
(
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) ||
ts_language_is_reserved_word(self->language, parse_state, self->lexer.data.result_symbol)
)
) {
symbol = self->lexer.data.result_symbol;
}
@ -1684,15 +1687,20 @@ static bool ts_parser__advance(
return true;
}
// If there were no parse actions for the current lookahead token, then
// it is not valid in this state. If the current lookahead token is a
// keyword, then switch to treating it as the normal word token if that
// token is valid in this state.
// If the current lookahead token is a keyword that is not valid, but the
// default word token *is* valid, then treat the lookahead token as the word
// token instead.
if (
ts_subtree_is_keyword(lookahead) &&
ts_subtree_symbol(lookahead) != self->language->keyword_capture_token
ts_subtree_symbol(lookahead) != self->language->keyword_capture_token &&
!ts_language_is_reserved_word(self->language, state, ts_subtree_symbol(lookahead))
) {
ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry);
ts_language_table_entry(
self->language,
state,
self->language->keyword_capture_token,
&table_entry
);
if (table_entry.action_count > 0) {
LOG(
"switch from_keyword:%s, to_word_token:%s",
@ -1707,19 +1715,10 @@ static bool ts_parser__advance(
}
}
// If the current lookahead token is not valid and the parser is
// already in the error state, restart the error recovery process.
// TODO - can this be unified with the other `RECOVER` case above?
if (state == ERROR_STATE) {
ts_parser__recover(self, version, lookahead);
return true;
}
// If the current lookahead token is not valid and the previous
// subtree on the stack was reused from an old tree, it isn't actually
// valid to reuse it. Remove it from the stack, and in its place,
// push each of its children. Then try again to process the current
// lookahead.
// If the current lookahead token is not valid and the previous subtree on
// the stack was reused from an old tree, then it wasn't actually valid to
// reuse that previous subtree. Remove it from the stack, and in its place,
// push each of its children. Then try again to process the current lookahead.
if (ts_parser__breakdown_top_of_stack(self, version)) {
state = ts_stack_state(self->stack, version);
ts_subtree_release(&self->tree_pool, lookahead);
@ -1727,11 +1726,11 @@ static bool ts_parser__advance(
continue;
}
// At this point, the current lookahead token is definitely not valid
// for this parse stack version. Mark this version as paused and continue
// processing any other stack versions that might exist. If some other
// version advances successfully, then this version can simply be removed.
// But if all versions end up paused, then error recovery is needed.
// Otherwise, there is definitely an error in this version of the parse stack.
// Mark this version as paused and continue processing any other stack
// versions that exist. If some other version advances successfully, then
// this version can simply be removed. But if all versions end up paused,
// then error recovery is needed.
LOG("detect_error");
ts_stack_pause(self->stack, version, lookahead);
return true;

View file

@ -79,6 +79,12 @@ typedef struct {
uint16_t external_lex_state;
} TSLexMode;
// Per-parse-state lexer configuration. It carries the same two leading fields
// as `TSLexMode` plus a reserved-word set id.
typedef struct {
// State handed to the main lex function. The parser treats the value
// `(uint16_t)-1` as "no lookahead" and does not lex at all.
uint16_t lex_state;
// NOTE(review): presumably selects the external scanner's state for this
// parse state; its use is not visible here -- confirm.
uint16_t external_lex_state;
// Identifies which reserved-word set applies in this state. 0 means no set is
// consulted; a nonzero id is multiplied by the language's
// `max_reserved_word_set_size` to find the set's first slot in `reserved_words`.
uint16_t reserved_word_set_id;
} TSLexerMode;
typedef union {
TSParseAction action;
struct {
@ -115,7 +121,7 @@ struct TSLanguage {
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSLexMode *lex_modes;
const TSLexerMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
@ -130,6 +136,8 @@ struct TSLanguage {
} external_scanner;
const TSStateId *primary_state_ids;
const char *name;
const TSSymbol *reserved_words;
uint16_t max_reserved_word_set_size;
};
static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {

File diff suppressed because it is too large Load diff

View file

@ -153,6 +153,9 @@ typedef struct {
int32_t deserialize;
} external_scanner;
int32_t primary_state_ids;
int32_t name;
int32_t reserved_words;
uint16_t max_reserved_word_set_size;
} LanguageInWasmMemory;
// LexerInWasmMemory - The memory layout of a `TSLexer` when compiled to wasm32.
@ -414,6 +417,17 @@ static void *copy_strings(
return result;
}
// Duplicate the NUL-terminated string located `address` bytes into `data`.
//
// The copy is allocated with `ts_malloc` and includes the terminator.
// The caller receives the new buffer and is responsible for freeing it.
static void *copy_string(
  const uint8_t *data,
  int32_t address
) {
  const char *source = (const char *)(data + address);
  // Count the terminator once so allocation and copy use the same size.
  size_t size_with_nul = strlen(source) + 1;
  char *duplicate = ts_malloc(size_with_nul);
  memcpy(duplicate, source, size_with_nul);
  return duplicate;
}
// Compare a wasm name against a C string.
//
// Only the first `name->size` bytes take part in the comparison (stopping
// earlier if `string` ends), so a `string` that merely begins with the name's
// bytes also compares equal.
static bool name_eq(const wasm_name_t *name, const char *string) {
  int difference = strncmp(string, name->data, name->size);
  return difference == 0;
}
@ -1202,24 +1216,24 @@ const TSLanguage *ts_wasm_store_load_language(
memcpy(&wasm_language, &memory[language_address], sizeof(LanguageInWasmMemory));
int32_t addresses[] = {
wasm_language.alias_map,
wasm_language.alias_sequences,
wasm_language.field_map_entries,
wasm_language.field_map_slices,
wasm_language.field_names,
wasm_language.keyword_lex_fn,
wasm_language.lex_fn,
wasm_language.lex_modes,
wasm_language.parse_actions,
wasm_language.parse_table,
wasm_language.primary_state_ids,
wasm_language.primary_state_ids,
wasm_language.public_symbol_map,
wasm_language.small_parse_table,
wasm_language.small_parse_table_map,
wasm_language.symbol_metadata,
wasm_language.symbol_metadata,
wasm_language.parse_actions,
wasm_language.symbol_names,
wasm_language.field_names,
wasm_language.field_map_slices,
wasm_language.field_map_entries,
wasm_language.symbol_metadata,
wasm_language.public_symbol_map,
wasm_language.alias_map,
wasm_language.alias_sequences,
wasm_language.lex_modes,
wasm_language.lex_fn,
wasm_language.keyword_lex_fn,
wasm_language.primary_state_ids,
wasm_language.name,
wasm_language.reserved_words,
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.states : 0,
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.symbol_map : 0,
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.create : 0,
@ -1274,7 +1288,7 @@ const TSLanguage *ts_wasm_store_load_language(
),
.lex_modes = copy(
&memory[wasm_language.lex_modes],
wasm_language.state_count * sizeof(TSLexMode)
wasm_language.state_count * sizeof(TSLexerMode)
),
};
@ -1350,6 +1364,15 @@ const TSLanguage *ts_wasm_store_load_language(
);
}
if (language->version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) {
language->name = copy_string(memory, wasm_language.name);
language->reserved_words = copy(
&memory[wasm_language.reserved_words],
wasm_language.max_reserved_word_set_size * sizeof(TSSymbol)
);
language->max_reserved_word_set_size = wasm_language.max_reserved_word_set_size;
}
if (language->external_token_count > 0) {
language->external_scanner.symbol_map = copy(
&memory[wasm_language.external_scanner.symbol_map],
@ -1731,6 +1754,8 @@ void ts_wasm_language_release(const TSLanguage *self) {
ts_free((void *)self->field_map_slices);
ts_free((void *)self->field_names);
ts_free((void *)self->lex_modes);
ts_free((void *)self->name);
ts_free((void *)self->reserved_words);
ts_free((void *)self->parse_actions);
ts_free((void *)self->parse_table);
ts_free((void *)self->primary_state_ids);