feat: add 'reserved word' construct
Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
parent
2a63077cac
commit
201b41cf11
31 changed files with 2367 additions and 1628 deletions
|
|
@ -29,7 +29,7 @@ uint32_t ts_language_version(const TSLanguage *self) {
|
|||
}
|
||||
|
||||
const char *ts_language_name(const TSLanguage *self) {
|
||||
return self->version >= LANGUAGE_VERSION_WITH_METADATA ? self->name : NULL;
|
||||
return self->version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? self->name : NULL;
|
||||
}
|
||||
|
||||
uint32_t ts_language_field_count(const TSLanguage *self) {
|
||||
|
|
@ -56,6 +56,39 @@ void ts_language_table_entry(
|
|||
}
|
||||
}
|
||||
|
||||
TSLexerMode ts_language_lex_mode_for_state(
|
||||
const TSLanguage *self,
|
||||
TSStateId state
|
||||
) {
|
||||
if (self->version < 15) {
|
||||
TSLexMode mode = ((const TSLexMode *)self->lex_modes)[state];
|
||||
return (TSLexerMode) {
|
||||
.lex_state = mode.lex_state,
|
||||
.external_lex_state = mode.external_lex_state,
|
||||
.reserved_word_set_id = 0,
|
||||
};
|
||||
} else {
|
||||
return self->lex_modes[state];
|
||||
}
|
||||
}
|
||||
|
||||
bool ts_language_is_reserved_word(
|
||||
const TSLanguage *self,
|
||||
TSStateId state,
|
||||
TSSymbol symbol
|
||||
) {
|
||||
TSLexerMode lex_mode = ts_language_lex_mode_for_state(self, state);
|
||||
if (lex_mode.reserved_word_set_id > 0) {
|
||||
unsigned start = lex_mode.reserved_word_set_id * self->max_reserved_word_set_size;
|
||||
unsigned end = start + self->max_reserved_word_set_size;
|
||||
for (unsigned i = start; i < end; i++) {
|
||||
if (self->reserved_words[i] == symbol) return true;
|
||||
if (self->reserved_words[i] == 0) break;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
TSSymbolMetadata ts_language_symbol_metadata(
|
||||
const TSLanguage *self,
|
||||
TSSymbol symbol
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ extern "C" {
|
|||
|
||||
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
|
||||
|
||||
#define LANGUAGE_VERSION_WITH_METADATA 15
|
||||
#define LANGUAGE_VERSION_WITH_RESERVED_WORDS 15
|
||||
#define LANGUAGE_VERSION_WITH_PRIMARY_STATES 14
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -36,9 +36,9 @@ typedef struct {
|
|||
} LookaheadIterator;
|
||||
|
||||
void ts_language_table_entry(const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result);
|
||||
|
||||
TSLexerMode ts_language_lex_mode_for_state(const TSLanguage *self, TSStateId state);
|
||||
bool ts_language_is_reserved_word(const TSLanguage *self, TSStateId state, TSSymbol symbol);
|
||||
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *self, TSSymbol symbol);
|
||||
|
||||
TSSymbol ts_language_public_symbol(const TSLanguage *self, TSSymbol symbol);
|
||||
|
||||
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@
|
|||
static const unsigned MAX_VERSION_COUNT = 6;
|
||||
static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4;
|
||||
static const unsigned MAX_SUMMARY_DEPTH = 16;
|
||||
static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE;
|
||||
static const unsigned MAX_COST_DIFFERENCE = 18 * ERROR_COST_PER_SKIPPED_TREE;
|
||||
static const unsigned OP_COUNT_PER_PARSER_TIMEOUT_CHECK = 100;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -342,7 +342,7 @@ static bool ts_parser__better_version_exists(
|
|||
return false;
|
||||
}
|
||||
|
||||
static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) {
|
||||
static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexerMode lex_mode) {
|
||||
if (ts_language_is_wasm(self->language)) {
|
||||
return ts_wasm_store_call_lex_main(self->wasm_store, lex_mode.lex_state);
|
||||
} else {
|
||||
|
|
@ -473,10 +473,10 @@ static bool ts_parser__can_reuse_first_leaf(
|
|||
Subtree tree,
|
||||
TableEntry *table_entry
|
||||
) {
|
||||
TSLexMode current_lex_mode = self->language->lex_modes[state];
|
||||
TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree);
|
||||
TSStateId leaf_state = ts_subtree_leaf_parse_state(tree);
|
||||
TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state];
|
||||
TSLexerMode current_lex_mode = ts_language_lex_mode_for_state(self->language, state);
|
||||
TSLexerMode leaf_lex_mode = ts_language_lex_mode_for_state(self->language, leaf_state);
|
||||
|
||||
// At the end of a non-terminal extra node, the lexer normally returns
|
||||
// NULL, which indicates that the parser should look for a reduce action
|
||||
|
|
@ -487,7 +487,7 @@ static bool ts_parser__can_reuse_first_leaf(
|
|||
// If the token was created in a state with the same set of lookaheads, it is reusable.
|
||||
if (
|
||||
table_entry->action_count > 0 &&
|
||||
memcmp(&leaf_lex_mode, ¤t_lex_mode, sizeof(TSLexMode)) == 0 &&
|
||||
memcmp(&leaf_lex_mode, ¤t_lex_mode, sizeof(TSLexerMode)) == 0 &&
|
||||
(
|
||||
leaf_symbol != self->language->keyword_capture_token ||
|
||||
(!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state)
|
||||
|
|
@ -507,7 +507,7 @@ static Subtree ts_parser__lex(
|
|||
StackVersion version,
|
||||
TSStateId parse_state
|
||||
) {
|
||||
TSLexMode lex_mode = self->language->lex_modes[parse_state];
|
||||
TSLexerMode lex_mode = ts_language_lex_mode_for_state(self->language, parse_state);
|
||||
if (lex_mode.lex_state == (uint16_t)-1) {
|
||||
LOG("no_lookahead_after_non_terminal_extra");
|
||||
return NULL_SUBTREE;
|
||||
|
|
@ -601,7 +601,7 @@ static Subtree ts_parser__lex(
|
|||
|
||||
if (!error_mode) {
|
||||
error_mode = true;
|
||||
lex_mode = self->language->lex_modes[ERROR_STATE];
|
||||
lex_mode = ts_language_lex_mode_for_state(self->language, ERROR_STATE);
|
||||
ts_lexer_reset(&self->lexer, start_position);
|
||||
continue;
|
||||
}
|
||||
|
|
@ -658,7 +658,10 @@ static Subtree ts_parser__lex(
|
|||
if (
|
||||
is_keyword &&
|
||||
self->lexer.token_end_position.bytes == end_byte &&
|
||||
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol)
|
||||
(
|
||||
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) ||
|
||||
ts_language_is_reserved_word(self->language, parse_state, self->lexer.data.result_symbol)
|
||||
)
|
||||
) {
|
||||
symbol = self->lexer.data.result_symbol;
|
||||
}
|
||||
|
|
@ -1684,15 +1687,20 @@ static bool ts_parser__advance(
|
|||
return true;
|
||||
}
|
||||
|
||||
// If there were no parse actions for the current lookahead token, then
|
||||
// it is not valid in this state. If the current lookahead token is a
|
||||
// keyword, then switch to treating it as the normal word token if that
|
||||
// token is valid in this state.
|
||||
// If the current lookahead token is a keyword that is not valid, but the
|
||||
// default word token *is* valid, then treat the lookahead token as the word
|
||||
// token instead.
|
||||
if (
|
||||
ts_subtree_is_keyword(lookahead) &&
|
||||
ts_subtree_symbol(lookahead) != self->language->keyword_capture_token
|
||||
ts_subtree_symbol(lookahead) != self->language->keyword_capture_token &&
|
||||
!ts_language_is_reserved_word(self->language, state, ts_subtree_symbol(lookahead))
|
||||
) {
|
||||
ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry);
|
||||
ts_language_table_entry(
|
||||
self->language,
|
||||
state,
|
||||
self->language->keyword_capture_token,
|
||||
&table_entry
|
||||
);
|
||||
if (table_entry.action_count > 0) {
|
||||
LOG(
|
||||
"switch from_keyword:%s, to_word_token:%s",
|
||||
|
|
@ -1707,19 +1715,10 @@ static bool ts_parser__advance(
|
|||
}
|
||||
}
|
||||
|
||||
// If the current lookahead token is not valid and the parser is
|
||||
// already in the error state, restart the error recovery process.
|
||||
// TODO - can this be unified with the other `RECOVER` case above?
|
||||
if (state == ERROR_STATE) {
|
||||
ts_parser__recover(self, version, lookahead);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the current lookahead token is not valid and the previous
|
||||
// subtree on the stack was reused from an old tree, it isn't actually
|
||||
// valid to reuse it. Remove it from the stack, and in its place,
|
||||
// push each of its children. Then try again to process the current
|
||||
// lookahead.
|
||||
// If the current lookahead token is not valid and the previous subtree on
|
||||
// the stack was reused from an old tree, then it wasn't actually valid to
|
||||
// reuse that previous subtree. Remove it from the stack, and in its place,
|
||||
// push each of its children. Then try again to process the current lookahead.
|
||||
if (ts_parser__breakdown_top_of_stack(self, version)) {
|
||||
state = ts_stack_state(self->stack, version);
|
||||
ts_subtree_release(&self->tree_pool, lookahead);
|
||||
|
|
@ -1727,11 +1726,11 @@ static bool ts_parser__advance(
|
|||
continue;
|
||||
}
|
||||
|
||||
// At this point, the current lookahead token is definitely not valid
|
||||
// for this parse stack version. Mark this version as paused and continue
|
||||
// processing any other stack versions that might exist. If some other
|
||||
// version advances successfully, then this version can simply be removed.
|
||||
// But if all versions end up paused, then error recovery is needed.
|
||||
// Otherwise, there is definitely an error in this version of the parse stack.
|
||||
// Mark this version as paused and continue processing any other stack
|
||||
// versions that exist. If some other version advances successfully, then
|
||||
// this version can simply be removed. But if all versions end up paused,
|
||||
// then error recovery is needed.
|
||||
LOG("detect_error");
|
||||
ts_stack_pause(self->stack, version, lookahead);
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -79,6 +79,12 @@ typedef struct {
|
|||
uint16_t external_lex_state;
|
||||
} TSLexMode;
|
||||
|
||||
// Lexing configuration for a parse state. It has the same two leading fields
// as `TSLexMode`, followed by `reserved_word_set_id`.
// `ts_language_is_reserved_word` treats a `reserved_word_set_id` of 0 as
// "no reserved-word set"; any other value selects a slice of the language's
// `reserved_words` array.
typedef struct {
|
||||
uint16_t lex_state;
|
||||
uint16_t external_lex_state;
|
||||
uint16_t reserved_word_set_id;
|
||||
} TSLexerMode;
|
||||
|
||||
typedef union {
|
||||
TSParseAction action;
|
||||
struct {
|
||||
|
|
@ -115,7 +121,7 @@ struct TSLanguage {
|
|||
const TSSymbol *public_symbol_map;
|
||||
const uint16_t *alias_map;
|
||||
const TSSymbol *alias_sequences;
|
||||
const TSLexMode *lex_modes;
|
||||
const TSLexerMode *lex_modes;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
|
||||
TSSymbol keyword_capture_token;
|
||||
|
|
@ -130,6 +136,8 @@ struct TSLanguage {
|
|||
} external_scanner;
|
||||
const TSStateId *primary_state_ids;
|
||||
const char *name;
|
||||
const TSSymbol *reserved_words;
|
||||
uint16_t max_reserved_word_set_size;
|
||||
};
|
||||
|
||||
static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -153,6 +153,9 @@ typedef struct {
|
|||
int32_t deserialize;
|
||||
} external_scanner;
|
||||
int32_t primary_state_ids;
|
||||
int32_t name;
|
||||
int32_t reserved_words;
|
||||
uint16_t max_reserved_word_set_size;
|
||||
} LanguageInWasmMemory;
|
||||
|
||||
// LexerInWasmMemory - The memory layout of a `TSLexer` when compiled to wasm32.
|
||||
|
|
@ -414,6 +417,17 @@ static void *copy_strings(
|
|||
return result;
|
||||
}
|
||||
|
||||
// Duplicate the NUL-terminated string that starts at byte offset `address`
// inside `data`. The copy, terminator included, is placed in a buffer
// obtained from `ts_malloc` and returned to the caller.
static void *copy_string(
  const uint8_t *data,
  int32_t address
) {
  const char *source = (const char *)&data[address];
  size_t size_with_terminator = strlen(source) + 1;
  char *duplicate = ts_malloc(size_with_terminator);
  memcpy(duplicate, source, size_with_terminator);
  return duplicate;
}
|
||||
|
||||
static bool name_eq(const wasm_name_t *name, const char *string) {
|
||||
return strncmp(string, name->data, name->size) == 0;
|
||||
}
|
||||
|
|
@ -1202,24 +1216,24 @@ const TSLanguage *ts_wasm_store_load_language(
|
|||
memcpy(&wasm_language, &memory[language_address], sizeof(LanguageInWasmMemory));
|
||||
|
||||
int32_t addresses[] = {
|
||||
wasm_language.alias_map,
|
||||
wasm_language.alias_sequences,
|
||||
wasm_language.field_map_entries,
|
||||
wasm_language.field_map_slices,
|
||||
wasm_language.field_names,
|
||||
wasm_language.keyword_lex_fn,
|
||||
wasm_language.lex_fn,
|
||||
wasm_language.lex_modes,
|
||||
wasm_language.parse_actions,
|
||||
wasm_language.parse_table,
|
||||
wasm_language.primary_state_ids,
|
||||
wasm_language.primary_state_ids,
|
||||
wasm_language.public_symbol_map,
|
||||
wasm_language.small_parse_table,
|
||||
wasm_language.small_parse_table_map,
|
||||
wasm_language.symbol_metadata,
|
||||
wasm_language.symbol_metadata,
|
||||
wasm_language.parse_actions,
|
||||
wasm_language.symbol_names,
|
||||
wasm_language.field_names,
|
||||
wasm_language.field_map_slices,
|
||||
wasm_language.field_map_entries,
|
||||
wasm_language.symbol_metadata,
|
||||
wasm_language.public_symbol_map,
|
||||
wasm_language.alias_map,
|
||||
wasm_language.alias_sequences,
|
||||
wasm_language.lex_modes,
|
||||
wasm_language.lex_fn,
|
||||
wasm_language.keyword_lex_fn,
|
||||
wasm_language.primary_state_ids,
|
||||
wasm_language.name,
|
||||
wasm_language.reserved_words,
|
||||
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.states : 0,
|
||||
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.symbol_map : 0,
|
||||
wasm_language.external_token_count > 0 ? wasm_language.external_scanner.create : 0,
|
||||
|
|
@ -1274,7 +1288,7 @@ const TSLanguage *ts_wasm_store_load_language(
|
|||
),
|
||||
.lex_modes = copy(
|
||||
&memory[wasm_language.lex_modes],
|
||||
wasm_language.state_count * sizeof(TSLexMode)
|
||||
wasm_language.state_count * sizeof(TSLexerMode)
|
||||
),
|
||||
};
|
||||
|
||||
|
|
@ -1350,6 +1364,15 @@ const TSLanguage *ts_wasm_store_load_language(
|
|||
);
|
||||
}
|
||||
|
||||
if (language->version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) {
|
||||
language->name = copy_string(memory, wasm_language.name);
|
||||
language->reserved_words = copy(
|
||||
&memory[wasm_language.reserved_words],
|
||||
wasm_language.max_reserved_word_set_size * sizeof(TSSymbol)
|
||||
);
|
||||
language->max_reserved_word_set_size = wasm_language.max_reserved_word_set_size;
|
||||
}
|
||||
|
||||
if (language->external_token_count > 0) {
|
||||
language->external_scanner.symbol_map = copy(
|
||||
&memory[wasm_language.external_scanner.symbol_map],
|
||||
|
|
@ -1731,6 +1754,8 @@ void ts_wasm_language_release(const TSLanguage *self) {
|
|||
ts_free((void *)self->field_map_slices);
|
||||
ts_free((void *)self->field_names);
|
||||
ts_free((void *)self->lex_modes);
|
||||
ts_free((void *)self->name);
|
||||
ts_free((void *)self->reserved_words);
|
||||
ts_free((void *)self->parse_actions);
|
||||
ts_free((void *)self->parse_table);
|
||||
ts_free((void *)self->primary_state_ids);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue