From c0cc35ff079a7772b54fbb612d5b2a0e39e5994d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 7 Mar 2018 11:56:59 -0800 Subject: [PATCH] Create separate lexer function for keywords --- include/tree_sitter/parser.h | 2 + include/tree_sitter/runtime.h | 2 +- .../build_tables/lex_table_builder.cc | 164 +++++++++++------- src/compiler/build_tables/lex_table_builder.h | 9 +- .../build_tables/parse_table_builder.cc | 22 ++- .../build_tables/parse_table_builder.h | 11 +- src/compiler/compile.cc | 31 ++-- src/compiler/generate_code/c_code.cc | 54 ++++-- src/compiler/generate_code/c_code.h | 12 +- src/compiler/rules/symbol.h | 2 +- src/runtime/parser.c | 36 ++-- 11 files changed, 231 insertions(+), 114 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 845e3879..05c9c31b 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -81,6 +81,8 @@ typedef struct TSLanguage { const TSSymbol *alias_sequences; uint16_t max_alias_sequence_length; bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; struct { const bool *states; const TSSymbol *symbol_map; diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 9efb6328..3bd24bac 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -9,7 +9,7 @@ extern "C" { #include #include -#define TREE_SITTER_LANGUAGE_VERSION 6 +#define TREE_SITTER_LANGUAGE_VERSION 7 typedef unsigned short TSSymbol; typedef struct TSLanguage TSLanguage; diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index a446531d..37705849 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -34,13 +34,14 @@ using rules::Symbol; using rules::Metadata; using rules::Seq; -template +template class StartOrEndCharacterAggregator { public: void apply(const Rule &rule) { 
rule.match( [this](const Seq &sequence) { - apply(is_start ? *sequence.left : *sequence.right); + apply(*sequence.left); + if (include_all) apply(*sequence.right); }, [this](const rules::Choice &rule) { @@ -59,15 +60,17 @@ class StartOrEndCharacterAggregator { CharacterSet result; }; -using StartingCharacterAggregator = StartOrEndCharacterAggregator; -using EndingCharacterAggregator = StartOrEndCharacterAggregator; +using StartingCharacterAggregator = StartOrEndCharacterAggregator; +using AllCharacterAggregator = StartOrEndCharacterAggregator; class LexTableBuilderImpl : public LexTableBuilder { - LexTable lex_table; + LexTable main_lex_table; + LexTable keyword_lex_table; const LexicalGrammar grammar; vector separator_rules; LexConflictManager conflict_manager; - unordered_map lex_state_ids; + unordered_map main_lex_state_ids; + unordered_map keyword_lex_state_ids; CharacterSet separator_start_characters; vector starting_characters_by_token; vector following_characters_by_token; @@ -75,6 +78,8 @@ class LexTableBuilderImpl : public LexTableBuilder { const vector &coincident_tokens_by_token; vector conflict_status_by_token; bool conflict_detection_mode; + LookaheadSet keyword_symbols; + Symbol keyword_capture_token; public: LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, @@ -86,7 +91,8 @@ class LexTableBuilderImpl : public LexTableBuilder { following_characters_by_token(lexical_grammar.variables.size()), shadowed_tokens_by_token(lexical_grammar.variables.size()), coincident_tokens_by_token(coincident_tokens), - conflict_detection_mode(false) { + conflict_detection_mode(false), + keyword_capture_token(rules::NONE()) { // Compute the possible separator rules and the set of separator characters that can occur // immediately after any token. @@ -113,18 +119,32 @@ class LexTableBuilderImpl : public LexTableBuilder { }); } - // TODO - Refactor this. In general, a keyword token cannot be followed immediately by - // another alphanumeric character. 
But this requirement is currently not expressed anywhere in - the grammar. So without this hack, we would be overly conservative about merging parse - states because we would often consider `identifier` tokens to *conflict* with keyword - tokens. - if (is_keyword(grammar.variables[i])) { - following_character_aggregator.result - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_') - .exclude('$'); + if (grammar.variables[i].is_string) { + AllCharacterAggregator aggregator; + aggregator.apply(grammar.variables[i].rule); + bool all_alpha = true, all_lower = true; + for (auto character : aggregator.result.included_chars) { + if (!iswalpha(character)) all_alpha = false; + if (!iswlower(character)) all_lower = false; + } + + if (all_lower) { + keyword_symbols.insert(Symbol::terminal(i)); + } + + // TODO - Refactor this. In general, a keyword token cannot be followed immediately + // by another alphanumeric character. But this requirement is currently not expressed + // anywhere in the grammar. So without this hack, we would be overly conservative about + // merging parse states because we would often consider `identifier` tokens to *conflict* + // with keyword tokens. 
+ if (all_alpha) { + following_character_aggregator.result + .exclude('a', 'z') + .exclude('A', 'Z') + .exclude('0', '9') + .exclude('_') + .exclude('$'); + } } following_characters_by_token[i] = following_character_aggregator.result; @@ -139,18 +159,35 @@ class LexTableBuilderImpl : public LexTableBuilder { starting_characters_by_token[i].intersects(separator_start_characters) || starting_characters_by_token[j].intersects(separator_start_characters)) { clear(); - add_lex_state(item_set_for_terminals(LookaheadSet({ + add_lex_state(main_lex_table, item_set_for_terminals(LookaheadSet({ Symbol::terminal(i), Symbol::terminal(j) - }))); + }), true)); if (conflict_status_by_token[i]) shadowed_tokens_by_token[j].insert(Symbol::terminal(i)); if (conflict_status_by_token[j]) shadowed_tokens_by_token[i].insert(Symbol::terminal(j)); } } } + + for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { + Symbol symbol = Symbol::terminal(i); + + bool matches_all_keywords = true; + keyword_symbols.for_each([&](Symbol keyword_symbol) { + if (!shadowed_tokens_by_token[keyword_symbol.index].count(symbol)) { + matches_all_keywords = false; + } + }); + + if (matches_all_keywords && ( + keyword_capture_token == rules::NONE() || + shadowed_tokens_by_token[symbol.index].size() < + shadowed_tokens_by_token[keyword_capture_token.index].size() + )) keyword_capture_token = symbol; + } } - LexTable build(ParseTable *parse_table) { + BuildResult build(ParseTable *parse_table) { clear(); conflict_detection_mode = false; vector>> starting_token_sets; @@ -158,7 +195,11 @@ class LexTableBuilderImpl : public LexTableBuilder { for (ParseState &parse_state : parse_table->states) { LookaheadSet token_set; for (auto &entry : parse_state.terminal_entries) { - token_set.insert(entry.first); + if (keyword_capture_token.is_terminal() && keyword_symbols.contains(entry.first)) { + token_set.insert(keyword_capture_token); + } else { + token_set.insert(entry.first); + } } bool did_merge = false; @@ 
-174,14 +215,17 @@ class LexTableBuilderImpl : public LexTableBuilder { } for (auto &pair : starting_token_sets) { - LexStateId state_id = add_lex_state(item_set_for_terminals(pair.first)); + LexStateId state_id = add_lex_state(main_lex_table, item_set_for_terminals(pair.first, true)); for (ParseState *parse_state : pair.second) { parse_state->lex_state_id = state_id; } } + + add_lex_state(keyword_lex_table, item_set_for_terminals(keyword_symbols, false)); + mark_fragile_tokens(parse_table); - remove_duplicate_lex_states(parse_table); - return lex_table; + remove_duplicate_lex_states(main_lex_table, parse_table); + return {main_lex_table, keyword_lex_table, keyword_capture_token}; } const set &get_incompatible_tokens(Symbol::Index index) const { @@ -189,36 +233,24 @@ class LexTableBuilderImpl : public LexTableBuilder { } private: - bool is_keyword(const LexicalVariable &variable) { - EndingCharacterAggregator aggregator; - aggregator.apply(variable.rule); - return - !aggregator.result.includes_all && - aggregator.result.included_chars.size() == 1 && - iswalpha(*aggregator.result.included_chars.begin()); - } - - LexStateId add_lex_state(const LexItemSet &item_set) { + LexStateId add_lex_state(LexTable &lex_table, const LexItemSet &item_set) { + auto &lex_state_ids = &lex_table == &main_lex_table ? 
+ main_lex_state_ids : + keyword_lex_state_ids; const auto &pair = lex_state_ids.find(item_set); if (pair == lex_state_ids.end()) { LexStateId state_id = lex_table.states.size(); lex_table.states.push_back(LexState()); lex_state_ids[item_set] = state_id; - add_accept_token_actions(item_set, state_id); - add_advance_actions(item_set, state_id); + add_accept_token_actions(lex_table, item_set, state_id); + add_advance_actions(lex_table, item_set, state_id); return state_id; } else { return pair->second; } } - void clear() { - lex_table.states.clear(); - lex_state_ids.clear(); - conflict_status_by_token = vector(grammar.variables.size(), false); - } - - void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) { + void add_advance_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) { for (const auto &pair : item_set.transitions()) { const CharacterSet &characters = pair.first; const LexItemSet::Transition &transition = pair.second; @@ -253,12 +285,12 @@ class LexTableBuilderImpl : public LexTableBuilder { if (!prefer_advancing) continue; } - action.state_index = add_lex_state(transition.destination); + action.state_index = add_lex_state(lex_table, transition.destination); lex_table.states[state_id].advance_actions[characters] = action; } } - void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) { + void add_accept_token_actions(LexTable &lex_table, const LexItemSet &item_set, LexStateId state_id) { for (const LexItem &item : item_set.entries) { LexItem::CompletionStatus completion_status = item.completion_status(); if (completion_status.is_done) { @@ -340,7 +372,7 @@ class LexTableBuilderImpl : public LexTableBuilder { return is_compatible; } - void remove_duplicate_lex_states(ParseTable *parse_table) { + void remove_duplicate_lex_states(LexTable &lex_table, ParseTable *parse_table) { for (LexState &state : lex_table.states) { state.accept_action.is_string = false; state.accept_action.precedence = 0; @@ 
-407,22 +439,26 @@ class LexTableBuilderImpl : public LexTableBuilder { } } - LexItemSet item_set_for_terminals(const LookaheadSet &terminals) { + LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) { LexItemSet result; terminals.for_each([&](Symbol symbol) { if (symbol.is_terminal()) { for (const auto &rule : rules_for_symbol(symbol)) { - for (const auto &separator_rule : separator_rules) { - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Rule::seq({ - separator_rule, - Metadata::main_token(rule) - }) - ) - )); - } + if (with_separators) { + for (const auto &separator_rule : separator_rules) { + result.entries.insert(LexItem( + symbol, + Metadata::separator( + Rule::seq({ + separator_rule, + Metadata::main_token(rule) + }) + ) + )); + } + } else { + result.entries.insert(LexItem(symbol, Metadata::main_token(rule))); + } } } }); @@ -444,6 +480,12 @@ class LexTableBuilderImpl : public LexTableBuilder { } ); } + + void clear() { + main_lex_table.states.clear(); + main_lex_state_ids.clear(); + conflict_status_by_token = vector(grammar.variables.size(), false); + } }; unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_grammar, @@ -458,7 +500,7 @@ unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_ )); } -LexTable LexTableBuilder::build(ParseTable *parse_table) { +LexTableBuilder::BuildResult LexTableBuilder::build(ParseTable *parse_table) { return static_cast(this)->build(parse_table); } diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 1b6fa0ab..3a6bb371 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -23,7 +23,14 @@ class LexTableBuilder { const LexicalGrammar &, const std::unordered_map &, const std::vector &); - LexTable build(ParseTable *); + + struct BuildResult { + LexTable main_table; + LexTable keyword_table; + rules::Symbol keyword_capture_token; + }; + + 
BuildResult build(ParseTable *); const std::set &get_incompatible_tokens(rules::Symbol::Index) const; protected: diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index fd202bda..9e581512 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -73,7 +73,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } } - tuple build() { + BuildResult build() { // Ensure that the empty rename sequence has index 0. parse_table.alias_sequences.push_back({}); @@ -92,7 +92,13 @@ class ParseTableBuilderImpl : public ParseTableBuilder { }}); CompileError error = process_part_state_queue(); - if (error) return make_tuple(parse_table, LexTable(), error); + if (error) return { + parse_table, + LexTable(), + LexTable(), + rules::NONE(), + error, + }; lex_table_builder = LexTableBuilder::create( grammar, @@ -105,8 +111,14 @@ class ParseTableBuilderImpl : public ParseTableBuilder { remove_precedence_values(); remove_duplicate_parse_states(); - auto lex_table = lex_table_builder->build(&parse_table); - return make_tuple(parse_table, lex_table, CompileError::none()); + auto lex_table_result = lex_table_builder->build(&parse_table); + return { + parse_table, + lex_table_result.main_table, + lex_table_result.keyword_table, + lex_table_result.keyword_capture_token, + CompileError::none() + }; } private: @@ -770,7 +782,7 @@ unique_ptr ParseTableBuilder::create( return unique_ptr(new ParseTableBuilderImpl(syntax_grammar, lexical_grammar)); } -tuple ParseTableBuilder::build() { +ParseTableBuilder::BuildResult ParseTableBuilder::build() { return static_cast(this)->build(); } diff --git a/src/compiler/build_tables/parse_table_builder.h b/src/compiler/build_tables/parse_table_builder.h index 1cbecb49..cb642d6c 100644 --- a/src/compiler/build_tables/parse_table_builder.h +++ b/src/compiler/build_tables/parse_table_builder.h @@ -17,7 +17,16 @@ namespace build_tables 
{ class ParseTableBuilder { public: static std::unique_ptr create(const SyntaxGrammar &, const LexicalGrammar &); - std::tuple build(); + + struct BuildResult { + ParseTable parse_table; + LexTable main_lex_table; + LexTable keyword_lex_table; + rules::Symbol keyword_capture_token; + CompileError error; + }; + + BuildResult build(); protected: ParseTableBuilder() = default; diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc index ad3a64cb..56af3aed 100644 --- a/src/compiler/compile.cc +++ b/src/compiler/compile.cc @@ -9,6 +9,7 @@ namespace tree_sitter { +using std::move; using std::pair; using std::string; using std::vector; @@ -23,26 +24,32 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input) { } auto prepare_grammar_result = prepare_grammar::prepare_grammar(parse_result.grammar); - const SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result); - const LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result); + SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result); + LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result); CompileError error = get<2>(prepare_grammar_result); if (error.type) { - return { nullptr, strdup(error.message.c_str()), error.type }; + return {nullptr, strdup(error.message.c_str()), error.type}; } auto builder = build_tables::ParseTableBuilder::create(syntax_grammar, lexical_grammar); - auto table_build_result = builder->build(); - const ParseTable &parse_table = get<0>(table_build_result); - const LexTable &lex_table = get<1>(table_build_result); - error = get<2>(table_build_result); - if (error.type) { - return { nullptr, strdup(error.message.c_str()), error.type }; + auto build_tables_result = builder->build(); + error = build_tables_result.error; + if (error.type != 0) { + return {nullptr, strdup(error.message.c_str()), error.type}; } - string code = generate_code::c_code(parse_result.name, parse_table, lex_table, - syntax_grammar, lexical_grammar); + string code = 
generate_code::c_code( + parse_result.name, + move(build_tables_result.parse_table), + move(build_tables_result.main_lex_table), + move(build_tables_result.keyword_lex_table), + build_tables_result.keyword_capture_token, + move(syntax_grammar), + move(lexical_grammar) + ); - return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; + return { + strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; } } // namespace tree_sitter diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 266dac50..1c6db406 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -18,6 +18,7 @@ namespace generate_code { using std::function; using std::map; +using std::move; using std::pair; using std::set; using std::string; @@ -70,7 +71,9 @@ class CCodeGenerator { const string name; const ParseTable parse_table; - const LexTable lex_table; + const LexTable main_lex_table; + const LexTable keyword_lex_table; + Symbol keyword_capture_token; const SyntaxGrammar syntax_grammar; const LexicalGrammar lexical_grammar; map sanitized_names; @@ -80,15 +83,17 @@ class CCodeGenerator { set unique_aliases; public: - CCodeGenerator(string name, const ParseTable &parse_table, - const LexTable &lex_table, const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar) + CCodeGenerator(string name, ParseTable &&parse_table, LexTable &&main_lex_table, + LexTable &&keyword_lex_table, Symbol keyword_capture_token, + SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar) : indent_level(0), name(name), - parse_table(parse_table), - lex_table(lex_table), - syntax_grammar(syntax_grammar), - lexical_grammar(lexical_grammar), + parse_table(move(parse_table)), + main_lex_table(move(main_lex_table)), + keyword_lex_table(move(keyword_lex_table)), + keyword_capture_token(keyword_capture_token), + syntax_grammar(move(syntax_grammar)), + lexical_grammar(move(lexical_grammar)), next_parse_action_list_index(0) 
{} string code() { @@ -105,7 +110,12 @@ class CCodeGenerator { add_alias_sequences(); } - add_lex_function(); + add_lex_function("ts_lex", main_lex_table); + + if (keyword_capture_token != rules::NONE()) { + add_lex_function("ts_lex_keywords", keyword_lex_table); + } + add_lex_modes_list(); if (!syntax_grammar.external_tokens.empty()) { @@ -273,8 +283,8 @@ class CCodeGenerator { line(); } - void add_lex_function() { - line("static bool ts_lex(TSLexer *lexer, TSStateId state) {"); + void add_lex_function(string name, const LexTable &lex_table) { + line("static bool " + name + "(TSLexer *lexer, TSStateId state) {"); indent([&]() { line("START_LEXER();"); _switch("state", [&]() { @@ -457,6 +467,12 @@ class CCodeGenerator { line(".max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH,"); line(".lex_fn = ts_lex,"); + + if (keyword_capture_token != rules::NONE()) { + line(".keyword_lex_fn = ts_lex_keywords,"); + line(".keyword_capture_token = " + symbol_id(keyword_capture_token) + ","); + } + line(".external_token_count = EXTERNAL_TOKEN_COUNT,"); if (!syntax_grammar.external_tokens.empty()) { @@ -832,15 +848,17 @@ class CCodeGenerator { } }; -string c_code(string name, const ParseTable &parse_table, - const LexTable &lex_table, const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar) { +string c_code(string name, ParseTable &&parse_table, LexTable &&lex_table, + LexTable &&keyword_lex_table, Symbol keyword_capture_token, + SyntaxGrammar &&syntax_grammar, LexicalGrammar &&lexical_grammar) { return CCodeGenerator( name, - parse_table, - lex_table, - syntax_grammar, - lexical_grammar + move(parse_table), + move(lex_table), + move(keyword_lex_table), + keyword_capture_token, + move(syntax_grammar), + move(lexical_grammar) ).code(); } diff --git a/src/compiler/generate_code/c_code.h b/src/compiler/generate_code/c_code.h index 1e77ed0e..dc0a8ddf 100644 --- a/src/compiler/generate_code/c_code.h +++ b/src/compiler/generate_code/c_code.h @@ -2,6 +2,7 @@ 
#define COMPILER_GENERATE_CODE_C_CODE_H_ #include +#include "compiler/rule.h" namespace tree_sitter { @@ -12,8 +13,15 @@ struct ParseTable; namespace generate_code { -std::string c_code(std::string, const ParseTable &, const LexTable &, - const SyntaxGrammar &, const LexicalGrammar &); +std::string c_code( + std::string, + ParseTable &&, + LexTable &&, + LexTable &&, + rules::Symbol, + SyntaxGrammar &&, + LexicalGrammar && +); } // namespace generate_code } // namespace tree_sitter diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 9df0dc84..adfe5f2c 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -73,4 +73,4 @@ inline Symbol NONE() { } // namespace rules } // namespace tree_sitter -#endif // COMPILER_RULES_SYMBOL_H_ \ No newline at end of file +#endif // COMPILER_RULES_SYMBOL_H_ diff --git a/src/runtime/parser.c b/src/runtime/parser.c index c7b83528..1066d460 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -358,9 +358,6 @@ static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_sta ); ts_lexer_start(&self->lexer); if (self->language->lex_fn(&self->lexer.data, lex_mode.lex_state)) { - if (length_is_undefined(self->lexer.token_end_position)) { - self->lexer.token_end_position = self->lexer.current_position; - } break; } @@ -398,23 +395,39 @@ static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_sta error_end_position = self->lexer.current_position; } + if (self->lexer.current_position.bytes > last_byte_scanned) { + last_byte_scanned = self->lexer.current_position.bytes; + } + Tree *result; if (skipped_error) { Length padding = length_sub(error_start_position, start_position); Length size = length_sub(error_end_position, error_start_position); result = ts_tree_make_error(&self->tree_pool, size, padding, first_error_character, self->language); } else { - TSSymbol symbol = self->lexer.data.result_symbol; - if (found_external_token) { - symbol = 
self->language->external_scanner.symbol_map[symbol]; - } - if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { self->lexer.token_start_position = self->lexer.token_end_position; } + TSSymbol symbol = self->lexer.data.result_symbol; Length padding = length_sub(self->lexer.token_start_position, start_position); Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); + + if (found_external_token) { + symbol = self->language->external_scanner.symbol_map[symbol]; + } else if (symbol == self->language->keyword_capture_token && symbol != 0) { + uint32_t end_byte = self->lexer.token_end_position.bytes; + ts_lexer_reset(&self->lexer, self->lexer.token_start_position); + ts_lexer_start(&self->lexer); + if ( + self->language->keyword_lex_fn(&self->lexer.data, 0) && + self->lexer.token_end_position.bytes == end_byte && + ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) + ) { + symbol = self->lexer.data.result_symbol; + } + } + result = ts_tree_make_leaf(&self->tree_pool, symbol, padding, size, self->language); if (found_external_token) { @@ -427,9 +440,6 @@ static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_sta } } - if (self->lexer.current_position.bytes > last_byte_scanned) { - last_byte_scanned = self->lexer.current_position.bytes; - } result->bytes_scanned = last_byte_scanned - start_position.bytes + 1; result->parse_state = parse_state; result->first_leaf.lex_mode = lex_mode; @@ -466,7 +476,9 @@ static bool parser__can_reuse_first_leaf(Parser *self, TSStateId state, Tree *tr TSLexMode current_lex_mode = self->language->lex_modes[state]; return (tree->first_leaf.lex_mode.lex_state == current_lex_mode.lex_state && - tree->first_leaf.lex_mode.external_lex_state == current_lex_mode.external_lex_state) || + tree->first_leaf.lex_mode.external_lex_state == current_lex_mode.external_lex_state && + (tree->first_leaf.symbol != 
self->language->keyword_capture_token || + tree->parse_state == state)) || (current_lex_mode.external_lex_state == 0 && tree->size.bytes > 0 && table_entry->is_reusable &&