From 38c144b4a338ec1bf6dedc20ec17535b0f21f35f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 21 Jun 2016 07:28:04 -0700 Subject: [PATCH] Refine logic for deciding when tokens need to be re-lexed * While generating the lex table, note which tokens can match the same string. A token needs to be relexed when it has possible homonyms in the current state. * Also note which tokens can match substrings of other tokens. A token needs to be relexed when there are viable tokens that could match longer strings in the current state and the next token has been edited. * Remove the logic for marking tokens as fragile on creation. * Move the reusability/non-reusability of symbols off of individual actions and onto the entire entry for the state & symbol. --- include/tree_sitter/parser.h | 45 ++++--- .../build_tables/lex_conflict_manager_spec.cc | 24 ++-- spec/helpers/stream_methods.cc | 6 +- src/compiler/build_tables/build_lex_table.cc | 32 ++++- .../build_tables/build_parse_table.cc | 43 +++---- .../build_tables/lex_conflict_manager.cc | 16 ++- .../build_tables/lex_conflict_manager.h | 9 +- src/compiler/generate_code/c_code.cc | 112 +++++++----------- src/compiler/lex_table.cc | 9 +- src/compiler/lex_table.h | 1 - src/compiler/parse_table.cc | 51 ++++---- src/compiler/parse_table.h | 36 ++---- src/runtime/language.c | 46 +++---- src/runtime/language.h | 43 ++++++- src/runtime/lexer.c | 3 - src/runtime/lexer.h | 1 - src/runtime/parser.c | 88 +++++++++----- src/runtime/tree.c | 23 ++-- src/runtime/tree.h | 6 +- 19 files changed, 337 insertions(+), 257 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 6178ca21..04b2be8e 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -51,7 +51,6 @@ typedef struct TSLexer { int32_t lookahead; TSStateId starting_state; TSSymbol result_symbol; - bool result_is_fragile; bool result_follows_error; int32_t first_unexpected_character; @@ -79,12 +78,15 @@ typedef 
struct { TSParseActionType type : 3; bool extra : 1; bool fragile : 1; - bool can_hide_split : 1; } TSParseAction; typedef union { TSParseAction action; - unsigned int count; + struct { + unsigned short count; + bool reusable : 1; + bool depends_on_lookahead : 1; + }; } TSParseActionEntry; struct TSLanguage { @@ -125,13 +127,6 @@ struct TSLanguage { GO_TO_STATE(state_value); \ } -#define ACCEPT_FRAGILE_TOKEN(symbol_value) \ - { \ - lexer->result_is_fragile = true; \ - lexer->result_symbol = symbol_value; \ - return true; \ - } - #define ACCEPT_TOKEN(symbol_value) \ { \ lexer->result_symbol = symbol_value; \ @@ -151,23 +146,16 @@ struct TSLanguage { * Parse Table Macros */ -enum { - FRAGILE = 1, - CAN_HIDE_SPLIT = 2, -}; - #define ERROR() \ { \ { .type = TSParseActionTypeError } \ } -#define SHIFT(to_state_value, flags) \ - { \ - { \ - .type = TSParseActionTypeShift, \ - .can_hide_split = (flags & CAN_HIDE_SPLIT) != 0, \ - .data = {.to_state = to_state_value } \ - } \ +#define SHIFT(to_state_value) \ + { \ + { \ + .type = TSParseActionTypeShift, .data = {.to_state = to_state_value } \ + } \ } #define RECOVER(to_state_value) \ @@ -191,11 +179,18 @@ enum { } \ } -#define REDUCE(symbol_val, child_count_val, flags) \ +#define REDUCE(symbol_val, child_count_val) \ { \ { \ - .type = TSParseActionTypeReduce, .fragile = (flags & FRAGILE) != 0, \ - .can_hide_split = (flags & CAN_HIDE_SPLIT) != 0, \ + .type = TSParseActionTypeReduce, .fragile = false, \ + .data = {.symbol = symbol_val, .child_count = child_count_val } \ + } \ + } + +#define REDUCE_FRAGILE(symbol_val, child_count_val) \ + { \ + { \ + .type = TSParseActionTypeReduce, .fragile = true, \ .data = {.symbol = symbol_val, .child_count = child_count_val } \ } \ } diff --git a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc index 98f6b3bf..b62b9137 100644 --- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc +++ 
b/spec/compiler/build_tables/lex_conflict_manager_spec.cc @@ -1,7 +1,9 @@ #include "spec_helper.h" +#include "helpers/rule_helpers.h" #include "compiler/rules/built_in_symbols.h" #include "compiler/parse_table.h" #include "compiler/build_tables/lex_conflict_manager.h" +#include "compiler/build_tables/lex_item.h" using namespace rules; using namespace build_tables; @@ -14,14 +16,16 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { Symbol sym1(0, true); Symbol sym2(1, true); Symbol sym3(2, true); + Symbol sym4(3, true); + LexItemSet item_set({ LexItem(sym4, blank() )}); it("favors advance actions over empty accept token actions", [&]() { - update = conflict_manager.resolve(AdvanceAction(2, {0, 0}, true), AcceptTokenAction()); + update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction()); AssertThat(update, IsTrue()); }); describe("accept-token/accept-token conflicts", [&]() { - describe("when one tokens' precedence values differ", [&]() { + describe("when the tokens' precedence values differ", [&]() { it("favors the token with higher precedence", [&]() { update = conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false)); AssertThat(update, IsFalse()); @@ -30,9 +34,9 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { AssertThat(update, IsTrue()); }); - it("adds the discarded token to the 'fragile tokens' set", [&]() { - update = conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false)); - AssertThat(conflict_manager.fragile_tokens, Contains(sym2)); + it("adds the preferred token as a possible homonym for the discarded one", [&]() { + conflict_manager.resolve(AcceptTokenAction(sym2, 1, false), AcceptTokenAction(sym1, 2, false)); + AssertThat(conflict_manager.possible_homonyms[sym2], Contains(sym1)); }); }); @@ -60,16 +64,22 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { 
describe("advance/accept-token conflicts", [&]() { describe("when the token to accept has higher precedence", [&]() { it("prefers the accept-token action", [&]() { - update = conflict_manager.resolve(AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true)); + update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true)); AssertThat(update, IsFalse()); + AssertThat(conflict_manager.possible_extensions, IsEmpty()); }); }); describe("when the token to accept does not have a higher precedence", [&]() { it("favors the advance action", [&]() { - update = conflict_manager.resolve(AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true)); + update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true)); AssertThat(update, IsTrue()); }); + + it("adds the in-progress tokens as possible extensions of the discarded token", [&]() { + conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true)); + AssertThat(conflict_manager.possible_extensions[sym3], Contains(sym4)); + }); }); }); }); diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index a2a364a3..514d6181 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -78,11 +78,11 @@ ostream &operator<<(ostream &stream, const ParseAction &action) { ostream &operator<<(ostream &stream, const ParseState &state) { stream << string("# {"); - for (auto &action : pair.second) { + stream << entry.first << string(" => {"); + for (auto &action : entry.second.actions) { stream << string(" ") << action; } stream << string("}"); diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index ca174e77..be5a3b4d 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -92,7 +92,8 @@ class LexTableBuilder { AdvanceAction action(-1, 
transition.precedence, transition.in_main_token); auto current_action = lex_table.state(state_id).accept_action; - if (conflict_manager.resolve(action, current_action)) { + if (conflict_manager.resolve(transition.destination, action, + current_action)) { action.state_index = add_lex_state(transition.destination); lex_table.state(state_id).advance_actions[characters] = action; } @@ -114,10 +115,31 @@ class LexTableBuilder { } void mark_fragile_tokens() { - for (LexState &state : lex_table.states) - if (state.accept_action.is_present()) - if (conflict_manager.fragile_tokens.count(state.accept_action.symbol)) - state.accept_action.is_fragile = true; + for (ParseState &state : parse_table->states) { + for (auto &entry : state.entries) { + if (!entry.first.is_token) + continue; + + auto homonyms = conflict_manager.possible_homonyms.find(entry.first); + if (homonyms != conflict_manager.possible_homonyms.end()) + for (const Symbol &homonym : homonyms->second) + if (state.entries.count(homonym)) { + entry.second.reusable = false; + break; + } + + if (!entry.second.reusable) + continue; + + auto extensions = conflict_manager.possible_extensions.find(entry.first); + if (extensions != conflict_manager.possible_extensions.end()) + for (const Symbol &extension : extensions->second) + if (state.entries.count(extension)) { + entry.second.depends_on_lookahead = true; + break; + } + } + } } void remove_duplicate_lex_states() { diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 98072941..48ac26b9 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -110,7 +110,7 @@ class ParseTableBuilder { } for (const Symbol &symbol : grammar.extra_tokens) { - parse_table.error_state.actions[symbol].push_back( + parse_table.error_state.entries[symbol].actions.push_back( ParseAction::ShiftExtra()); } @@ -119,7 +119,7 @@ class ParseTableBuilder { 
add_out_of_context_parse_state(symbol); } - parse_table.error_state.actions[rules::END_OF_INPUT()].push_back( + parse_table.error_state.entries[rules::END_OF_INPUT()].actions.push_back( ParseAction::Shift(0, PrecedenceRange())); } @@ -127,7 +127,7 @@ class ParseTableBuilder { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { ParseStateId state = add_parse_state(item_set); - parse_table.error_state.actions[symbol].push_back( + parse_table.error_state.entries[symbol].actions.push_back( ParseAction::Shift(state, PrecedenceRange())); } } @@ -198,15 +198,15 @@ class ParseTableBuilder { const ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { - const auto &actions_for_symbol = state.actions.find(extra_symbol); - if (actions_for_symbol == state.actions.end()) + const auto &entry_for_symbol = state.entries.find(extra_symbol); + if (entry_for_symbol == state.entries.end()) continue; - for (const ParseAction &action : actions_for_symbol->second) + for (const ParseAction &action : entry_for_symbol->second.actions) if (action.type == ParseActionTypeShift && !action.extra) { size_t dest_state_id = action.state_index; ParseAction reduce_extra = ParseAction::ReduceExtra(extra_symbol); - for (const auto &pair : state.actions) + for (const auto &pair : state.entries) add_action(dest_state_id, pair.first, reduce_extra, null_item_set); } } @@ -216,11 +216,14 @@ class ParseTableBuilder { for (ParseState &state : parse_table.states) { set symbols_with_multiple_actions; - for (auto &entry : state.actions) { - if (entry.second.size() > 1) - symbols_with_multiple_actions.insert(entry.first); + for (auto &entry : state.entries) { + const Symbol &symbol = entry.first; + auto &actions = entry.second.actions; - for (ParseAction &action : entry.second) { + if (actions.size() > 1) + symbols_with_multiple_actions.insert(symbol); + + for (ParseAction &action : actions) { if (action.type == 
ParseActionTypeReduce && !action.extra) { if (has_fragile_production(action.production)) action.fragile = true; @@ -231,11 +234,11 @@ class ParseTableBuilder { } } - for (auto i = entry.second.begin(); i != entry.second.end();) { + for (auto i = actions.begin(); i != actions.end();) { bool erased = false; - for (auto j = entry.second.begin(); j != i; j++) { + for (auto j = actions.begin(); j != i; j++) { if (*j == *i) { - entry.second.erase(i); + actions.erase(i); erased = true; break; } @@ -246,12 +249,12 @@ class ParseTableBuilder { } if (!symbols_with_multiple_actions.empty()) { - for (auto &entry : state.actions) { + for (auto &entry : state.entries) { if (!entry.first.is_token) { set first_set = get_first_set(entry.first); for (const Symbol &symbol : symbols_with_multiple_actions) { if (first_set.count(symbol)) { - entry.second[0].can_hide_split = true; + entry.second.reusable = false; break; } } @@ -276,14 +279,14 @@ class ParseTableBuilder { ParseAction *add_action(ParseStateId state_id, Symbol lookahead, const ParseAction &new_action, const ParseItemSet &item_set) { - const auto ¤t_actions = parse_table.states[state_id].actions; - const auto ¤t_entry = current_actions.find(lookahead); - if (current_entry == current_actions.end()) + const ParseState &state = parse_table.states[state_id]; + const auto ¤t_entry = state.entries.find(lookahead); + if (current_entry == state.entries.end()) return &parse_table.set_action(state_id, lookahead, new_action); if (allow_any_conflict) return &parse_table.add_action(state_id, lookahead, new_action); - const ParseAction old_action = current_entry->second[0]; + const ParseAction old_action = current_entry->second.actions[0]; auto resolution = conflict_manager.resolve(new_action, old_action); switch (resolution.second) { diff --git a/src/compiler/build_tables/lex_conflict_manager.cc b/src/compiler/build_tables/lex_conflict_manager.cc index 45537081..b89228d4 100644 --- a/src/compiler/build_tables/lex_conflict_manager.cc +++ 
b/src/compiler/build_tables/lex_conflict_manager.cc @@ -2,15 +2,23 @@ #include #include "compiler/parse_table.h" #include "compiler/rules/built_in_symbols.h" +#include "compiler/build_tables/lex_item.h" namespace tree_sitter { namespace build_tables { -bool LexConflictManager::resolve(const AdvanceAction &new_action, +bool LexConflictManager::resolve(const LexItemSet &item_set, + const AdvanceAction &new_action, const AcceptTokenAction &old_action) { if (!old_action.is_present()) return true; - return new_action.precedence_range.max >= old_action.precedence; + if (new_action.precedence_range.max >= old_action.precedence) { + for (const LexItem &item : item_set.entries) + possible_extensions[old_action.symbol].insert(item.lhs); + return true; + } else { + return false; + } } bool LexConflictManager::resolve(const AcceptTokenAction &new_action, @@ -36,9 +44,9 @@ bool LexConflictManager::resolve(const AcceptTokenAction &new_action, result = false; if (result) - fragile_tokens.insert(old_action.symbol); + possible_homonyms[old_action.symbol].insert(new_action.symbol); else - fragile_tokens.insert(new_action.symbol); + possible_homonyms[new_action.symbol].insert(old_action.symbol); return result; } diff --git a/src/compiler/build_tables/lex_conflict_manager.h b/src/compiler/build_tables/lex_conflict_manager.h index 906326bf..8fb0f075 100644 --- a/src/compiler/build_tables/lex_conflict_manager.h +++ b/src/compiler/build_tables/lex_conflict_manager.h @@ -1,6 +1,7 @@ #ifndef COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_ #define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_ +#include #include #include "compiler/lexical_grammar.h" #include "compiler/rules/symbol.h" @@ -12,12 +13,16 @@ struct AcceptTokenAction; namespace build_tables { +class LexItemSet; + class LexConflictManager { public: - bool resolve(const AdvanceAction &, const AcceptTokenAction &); + bool resolve(const LexItemSet &, const AdvanceAction &, + const AcceptTokenAction &); bool resolve(const 
AcceptTokenAction &, const AcceptTokenAction &); - std::set fragile_tokens; + std::map> possible_homonyms; + std::map> possible_extensions; }; } // namespace build_tables diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 63c8c1ad..92d7a57f 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -72,7 +72,7 @@ class CCodeGenerator { const SyntaxGrammar syntax_grammar; const LexicalGrammar lexical_grammar; map sanitized_names; - vector>> parse_actions; + vector> parse_table_entries; vector>> in_progress_symbols; size_t next_parse_action_list_index; size_t next_in_progress_symbol_list_index; @@ -155,35 +155,28 @@ class CCodeGenerator { for (const auto &entry : parse_table.symbols) { const rules::Symbol &symbol = entry.first; line("[" + symbol_id(symbol) + "] = {"); + indent([&]() { + switch (symbol_type(symbol)) { + case VariableTypeNamed: + line(".visible = true,"); + line(".named = true,"); + break; + case VariableTypeAnonymous: + line(".visible = true,"); + line(".named = false,"); + break; + case VariableTypeHidden: + case VariableTypeAuxiliary: + line(".visible = false,"); + line(".named = false,"); + break; + } - switch (symbol_type(symbol)) { - case VariableTypeNamed: - add(".visible = true, .named = true"); - break; - case VariableTypeAnonymous: - add(".visible = true, .named = false"); - break; - case VariableTypeHidden: - case VariableTypeAuxiliary: - add(".visible = false, .named = false"); - break; - } + line(".structural = " + _boolean(entry.second.structural) + ","); + line(".extra = " + _boolean(entry.second.extra) + ","); + }); - add(", "); - - if (entry.second.structural) - add(".structural = true"); - else - add(".structural = false"); - - add(", "); - - if (syntax_grammar.extra_tokens.count(symbol)) - add(".extra = true"); - else - add(".extra = false"); - - add("},"); + line("},"); } }); line("};"); @@ -221,11 +214,10 @@ class CCodeGenerator { void 
add_recovery_parse_states_list() { line("static TSParseAction ts_recovery_actions[SYMBOL_COUNT] = {"); indent([&]() { - for (const auto &entry : parse_table.error_state.actions) { - const rules::Symbol &symbol = entry.first; - if (!entry.second.empty()) { - line("[" + symbol_id(symbol) + "] = "); - ParseAction action = entry.second[0]; + for (const auto &entry : parse_table.error_state.entries) { + if (!entry.second.actions.empty()) { + line("[" + symbol_id(entry.first) + "] = "); + ParseAction action = entry.second.actions[0]; if (action.extra) { add("RECOVER_EXTRA(),"); } else { @@ -239,7 +231,8 @@ class CCodeGenerator { } void add_parse_table() { - add_parse_action_list_id({ ParseAction::Error() }); + add_parse_action_list_id( + ParseTableEntry{ { ParseAction::Error() }, true, false }); size_t state_id = 0; line("#pragma GCC diagnostic push"); @@ -251,9 +244,9 @@ class CCodeGenerator { for (const auto &state : parse_table.states) { line("[" + to_string(state_id++) + "] = {"); indent([&]() { - for (const auto &pair : state.actions) { - line("[" + symbol_id(pair.first) + "] = "); - add(to_string(add_parse_action_list_id(pair.second))); + for (const auto &entry : state.entries) { + line("[" + symbol_id(entry.first) + "] = "); + add(to_string(add_parse_action_list_id(entry.second))); add(","); } }); @@ -338,22 +331,21 @@ class CCodeGenerator { } void add_accept_token_action(const AcceptTokenAction &action) { - if (action.is_fragile) - line("ACCEPT_FRAGILE_TOKEN(" + symbol_id(action.symbol) + ");"); - else - line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");"); + line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");"); } void add_parse_action_list() { line("static TSParseActionEntry ts_parse_actions[] = {"); indent([&]() { - for (const auto &pair : parse_actions) { + for (const auto &pair : parse_table_entries) { size_t index = pair.first; line("[" + to_string(index) + "] = {.count = " + - to_string(pair.second.size()) + "},"); + 
to_string(pair.second.actions.size()) + ", .reusable = " + + _boolean(pair.second.reusable) + ", .depends_on_lookahead = " + + _boolean(pair.second.depends_on_lookahead) + "},"); - for (const ParseAction &action : pair.second) { + for (const ParseAction &action : pair.second.actions) { add(" "); switch (action.type) { case ParseActionTypeError: @@ -366,19 +358,18 @@ class CCodeGenerator { if (action.extra) { add("SHIFT_EXTRA()"); } else { - add("SHIFT(" + to_string(action.state_index) + ", "); - add_action_flags(action); - add(")"); + add("SHIFT(" + to_string(action.state_index) + ")"); } break; case ParseActionTypeReduce: if (action.extra) { add("REDUCE_EXTRA(" + symbol_id(action.symbol) + ")"); + } else if (action.fragile) { + add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " + + to_string(action.consumed_symbol_count) + ")"); } else { add("REDUCE(" + symbol_id(action.symbol) + ", " + - to_string(action.consumed_symbol_count) + ", "); - add_action_flags(action); - add(")"); + to_string(action.consumed_symbol_count) + ")"); } break; default: {} @@ -391,16 +382,16 @@ class CCodeGenerator { line("};"); } - size_t add_parse_action_list_id(const vector &actions) { - for (const auto &pair : parse_actions) { - if (pair.second == actions) { + size_t add_parse_action_list_id(const ParseTableEntry &entry) { + for (const auto &pair : parse_table_entries) { + if (pair.second == entry) { return pair.first; } } size_t result = next_parse_action_list_index; - parse_actions.push_back({ next_parse_action_list_index, actions }); - next_parse_action_list_index += 1 + actions.size(); + parse_table_entries.push_back({ next_parse_action_list_index, entry }); + next_parse_action_list_index += 1 + entry.actions.size(); return result; } @@ -417,17 +408,6 @@ class CCodeGenerator { return result; } - void add_action_flags(const ParseAction &action) { - if (action.fragile && action.can_hide_split) - add("FRAGILE|CAN_HIDE_SPLIT"); - else if (action.fragile) - add("FRAGILE"); - else if 
(action.can_hide_split) - add("CAN_HIDE_SPLIT"); - else - add("0"); - } - // Helper functions string symbol_id(const rules::Symbol &symbol) { diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc index 191d40dc..946fd712 100644 --- a/src/compiler/lex_table.cc +++ b/src/compiler/lex_table.cc @@ -27,14 +27,11 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const { } AcceptTokenAction::AcceptTokenAction() - : symbol(rules::NONE()), precedence(0), is_string(false), is_fragile(false) {} + : symbol(rules::NONE()), precedence(0), is_string(false) {} AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence, bool is_string) - : symbol(symbol), - precedence(precedence), - is_string(is_string), - is_fragile(false) {} + : symbol(symbol), precedence(precedence), is_string(is_string) {} bool AcceptTokenAction::is_present() const { return symbol != rules::NONE(); @@ -42,7 +39,7 @@ bool AcceptTokenAction::is_present() const { bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const { return (symbol == other.symbol) && (precedence == other.precedence) && - (is_string == other.is_string) && (is_fragile == other.is_fragile); + (is_string == other.is_string); } LexState::LexState() : is_token_start(false) {} diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index 7696ab37..f5f8b4ce 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -39,7 +39,6 @@ struct AcceptTokenAction { rules::Symbol symbol; int precedence; bool is_string; - bool is_fragile; }; } // namespace tree_sitter diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index de217cb7..b996e574 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -20,7 +20,6 @@ ParseAction::ParseAction(ParseActionType type, ParseStateId state_index, : type(type), extra(false), fragile(false), - can_hide_split(false), symbol(symbol), state_index(state_index), consumed_symbol_count(consumed_symbol_count), @@ -32,7 
+31,6 @@ ParseAction::ParseAction() : type(ParseActionTypeError), extra(false), fragile(false), - can_hide_split(false), symbol(Symbol(-1)), state_index(-1), consumed_symbol_count(0), @@ -81,9 +79,8 @@ ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, bool ParseAction::operator==(const ParseAction &other) const { return (type == other.type && extra == other.extra && - fragile == other.fragile && can_hide_split == other.can_hide_split && - symbol == other.symbol && state_index == other.state_index && - production == other.production && + fragile == other.fragile && symbol == other.symbol && + state_index == other.state_index && production == other.production && consumed_symbol_count == other.consumed_symbol_count); } @@ -100,10 +97,6 @@ bool ParseAction::operator<(const ParseAction &other) const { return true; if (other.fragile && !fragile) return false; - if (can_hide_split && !other.can_hide_split) - return true; - if (other.can_hide_split && !can_hide_split) - return false; if (symbol < other.symbol) return true; if (other.symbol < symbol) @@ -119,24 +112,38 @@ bool ParseAction::operator<(const ParseAction &other) const { return consumed_symbol_count < other.consumed_symbol_count; } +ParseTableEntry::ParseTableEntry() + : reusable(true), depends_on_lookahead(false) {} + +ParseTableEntry::ParseTableEntry(const vector &actions, + bool reusable, bool depends_on_lookahead) + : actions(actions), + reusable(reusable), + depends_on_lookahead(depends_on_lookahead) {} + +bool ParseTableEntry::operator==(const ParseTableEntry &other) const { + return actions == other.actions && reusable == other.reusable && + depends_on_lookahead == other.depends_on_lookahead; +} + ParseState::ParseState() : lex_state_id(-1) {} set ParseState::expected_inputs() const { set result; - for (auto &pair : actions) - result.insert(pair.first); + for (auto &entry : entries) + result.insert(entry.first); return result; } void ParseState::each_advance_action(function fn) 
{ - for (auto &entry : actions) - for (ParseAction &action : entry.second) + for (auto &entry : entries) + for (ParseAction &action : entry.second.actions) if (action.type == ParseActionTypeShift) fn(&action); } bool ParseState::operator==(const ParseState &other) const { - return actions == other.actions; + return entries == other.entries; } set ParseTable::all_symbols() const { @@ -154,26 +161,28 @@ ParseStateId ParseTable::add_state() { ParseAction &ParseTable::set_action(ParseStateId id, Symbol symbol, ParseAction action) { if (action.extra) - symbols[symbol]; + symbols[symbol].extra = true; else symbols[symbol].structural = true; - states[id].actions[symbol] = vector({ action }); - return *states[id].actions[symbol].begin(); + + states[id].entries[symbol].actions = { action }; + return *states[id].entries[symbol].actions.begin(); } ParseAction &ParseTable::add_action(ParseStateId id, Symbol symbol, ParseAction action) { if (action.extra) - symbols[symbol]; + symbols[symbol].extra = true; else symbols[symbol].structural = true; - for (ParseAction &existing_action : states[id].actions[symbol]) + ParseState &state = states[id]; + for (ParseAction &existing_action : state.entries[symbol].actions) if (existing_action == action) return existing_action; - states[id].actions[symbol].push_back(action); - return *states[id].actions[symbol].rbegin(); + state.entries[symbol].actions.push_back(action); + return *state.entries[symbol].actions.rbegin(); } } // namespace tree_sitter diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 8510c5fa..6b1a5d9b 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -15,12 +15,12 @@ namespace tree_sitter { typedef uint64_t ParseStateId; -typedef enum { +enum ParseActionType { ParseActionTypeError, ParseActionTypeShift, ParseActionTypeReduce, ParseActionTypeAccept, -} ParseActionType; +}; class ParseAction { ParseAction(ParseActionType type, ParseStateId state_index, @@ -43,7 +43,6 @@ class 
ParseAction { ParseActionType type; bool extra; bool fragile; - bool can_hide_split; rules::Symbol symbol; ParseStateId state_index; size_t consumed_symbol_count; @@ -52,30 +51,16 @@ class ParseAction { const Production *production; }; -} // namespace tree_sitter +struct ParseTableEntry { + std::vector actions; + bool reusable; + bool depends_on_lookahead; -namespace std { - -template <> -struct hash { - size_t operator()(const tree_sitter::ParseAction &action) const { - return (hash()(action.type) ^ - hash()(action.symbol) ^ - hash()(action.state_index) ^ - hash()(action.consumed_symbol_count) ^ - hash()(action.extra) ^ hash()(action.fragile) ^ - hash()(action.can_hide_split) ^ - hash()(action.associativity) ^ - hash()(action.precedence_range.min) ^ - hash()(action.precedence_range.max) ^ - hash()(&action.production)); - } + ParseTableEntry(); + ParseTableEntry(const std::vector &, bool, bool); + bool operator==(const ParseTableEntry &other) const; }; -} // namespace std - -namespace tree_sitter { - class ParseState { public: ParseState(); @@ -83,11 +68,12 @@ class ParseState { bool operator==(const ParseState &) const; void each_advance_action(std::function); - std::map> actions; + std::map entries; LexStateId lex_state_id; }; struct ParseTableSymbolMetadata { + bool extra; bool structural; }; diff --git a/src/runtime/language.c b/src/runtime/language.c index 9d6f2f83..3015a058 100644 --- a/src/runtime/language.c +++ b/src/runtime/language.c @@ -6,38 +6,30 @@ static const TSParseAction ERROR_SHIFT_EXTRA = { .type = TSParseActionTypeShift, .extra = true, }; -const TSParseAction *ts_language_actions(const TSLanguage *self, TSStateId state, - TSSymbol symbol, size_t *count) { +void ts_language_table_entry(const TSLanguage *self, TSStateId state, + TSSymbol symbol, TableEntry *result) { if (state == ts_parse_state_error) { - *count = 1; - if (symbol == ts_builtin_sym_error) - return &ERROR_SHIFT_EXTRA; - else if (self->recovery_actions[symbol].type == 
TSParseActionTypeError) - return &ERROR_SHIFT_EXTRA; + result->action_count = 1; + result->is_reusable = false; + result->depends_on_lookahead = false; + if (symbol == ts_builtin_sym_error || + self->recovery_actions[symbol].type == TSParseActionTypeError) + result->actions = &ERROR_SHIFT_EXTRA; else - return &self->recovery_actions[symbol]; + result->actions = &self->recovery_actions[symbol]; + return; } - size_t action_index = 0; - if (symbol != ts_builtin_sym_error) - action_index = self->parse_table[state * self->symbol_count + symbol]; + size_t action_index = + (symbol != ts_builtin_sym_error) + ? self->parse_table[state * self->symbol_count + symbol] + : 0; - *count = self->parse_actions[action_index].count; - const TSParseActionEntry *entry = self->parse_actions + action_index + 1; - return (const TSParseAction *)entry; -} - -TSParseAction ts_language_last_action(const TSLanguage *self, TSStateId state, - TSSymbol sym) { - size_t count; - const TSParseAction *actions = ts_language_actions(self, state, sym, &count); - return actions[count - 1]; -} - -bool ts_language_has_action(const TSLanguage *self, TSStateId state, - TSSymbol symbol) { - TSParseAction action = ts_language_last_action(self, state, symbol); - return action.type != TSParseActionTypeError; + const TSParseActionEntry *entry = &self->parse_actions[action_index]; + result->action_count = entry->count; + result->is_reusable = entry->reusable; + result->depends_on_lookahead = entry->depends_on_lookahead; + result->actions = (const TSParseAction *)(entry + 1); } size_t ts_language_symbol_count(const TSLanguage *language) { diff --git a/src/runtime/language.h b/src/runtime/language.h index d5f95297..6719c3da 100644 --- a/src/runtime/language.h +++ b/src/runtime/language.h @@ -8,13 +8,48 @@ extern "C" { #include "tree_sitter/parser.h" #include "runtime/tree.h" +typedef struct { + const TSParseAction *actions; + size_t action_count; + bool is_reusable; + bool depends_on_lookahead; +} TableEntry; + 
+void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, + TableEntry *); + bool ts_language_symbol_is_in_progress(const TSLanguage *, TSStateId, TSSymbol); -const TSParseAction *ts_language_actions(const TSLanguage *, TSStateId, - TSSymbol, size_t *); -TSParseAction ts_language_last_action(const TSLanguage *, TSStateId, TSSymbol); +static inline const TSParseAction *ts_language_actions(const TSLanguage *self, + TSStateId state, + TSSymbol symbol, + size_t *count) { + TableEntry entry; + ts_language_table_entry(self, state, symbol, &entry); + *count = entry.action_count; + return entry.actions; +} -bool ts_language_has_action(const TSLanguage *, TSStateId, TSSymbol); +static inline TSParseAction ts_language_last_action(const TSLanguage *self, + TSStateId state, + TSSymbol symbol) { + TableEntry entry; + ts_language_table_entry(self, state, symbol, &entry); + return entry.actions[entry.action_count - 1]; +} + +static inline bool ts_language_has_action(const TSLanguage *self, + TSStateId state, TSSymbol symbol) { + TSParseAction action = ts_language_last_action(self, state, symbol); + return action.type != TSParseActionTypeError; +} + +static inline bool ts_language_is_reusable(const TSLanguage *self, + TSStateId state, TSSymbol symbol) { + TableEntry entry; + ts_language_table_entry(self, state, symbol, &entry); + return entry.is_reusable; +} TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 5afb82d8..dd58cfc9 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -137,7 +137,6 @@ void ts_lexer_start(TSLexer *self, TSStateId lex_state) { self->starting_state = lex_state; self->token_start_position = self->current_position; self->result_follows_error = false; - self->result_is_fragile = false; self->result_symbol = 0; self->first_unexpected_character = 0; @@ -156,13 +155,11 @@ void ts_lexer_finish(TSLexer *self, TSLexerResult *result) { result->size = 
ts_length_sub(self->error_end_position, self->token_start_position); result->first_unexpected_character = self->first_unexpected_character; - result->is_fragile = true; ts_lexer_reset(self, self->error_end_position); } else { result->symbol = self->result_symbol; result->size = ts_length_sub(self->current_position, self->token_start_position); - result->is_fragile = self->result_is_fragile; self->token_end_position = self->current_position; } } diff --git a/src/runtime/lexer.h b/src/runtime/lexer.h index 97ee3787..75a03762 100644 --- a/src/runtime/lexer.h +++ b/src/runtime/lexer.h @@ -11,7 +11,6 @@ typedef struct { TSSymbol symbol; TSLength padding; TSLength size; - bool is_fragile; int32_t first_unexpected_character; } TSLexerResult; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 77b6305b..6f97076c 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -211,28 +211,64 @@ static bool ts_parser__can_reuse(TSParser *self, StackVersion version, if (tree->symbol == ts_builtin_sym_error) return false; - if (ts_tree_is_fragile(tree) && - tree->parse_state != ts_stack_top_state(self->stack, version)) - return false; + TSStateId state = ts_stack_top_state(self->stack, version); + if (tree->parse_state != state) { + if (ts_tree_is_fragile(tree)) { + LOG_ACTION("cant_reuse_fragile tree:%s", SYM_NAME(tree->symbol)); + return false; + } - TSStateId top_state = ts_stack_top_state(self->stack, version); + TableEntry entry; + ts_language_table_entry(self->language, state, tree->symbol, &entry); - if (tree->lex_state != TS_TREE_STATE_INDEPENDENT && - tree->lex_state != ts_language_lex_state(self->language, top_state)) - return false; + if (!entry.is_reusable) { + LOG_ACTION("cant_reuse tree:%s", SYM_NAME(tree->symbol)); + return false; + } - const TSParseAction action = - ts_language_last_action(self->language, top_state, tree->symbol); - if (action.type == TSParseActionTypeError || action.can_hide_split) - return false; + TSParseAction action = 
entry.actions[entry.action_count - 1]; + if (action.type == TSParseActionTypeError) { + LOG_ACTION("cant_reuse_unexpected tree:%s", SYM_NAME(tree->symbol)); + return false; + } - if (tree->extra && !action.extra) - return false; + if (tree->extra != action.extra) { + LOG_ACTION("cant_reuse_extra tree:%s", SYM_NAME(tree->symbol)); + return false; + } + + TSStateId lex_state = ts_language_lex_state(self->language, state); + if (tree->first_leaf.lex_state != lex_state) { + if (tree->child_count > 0) { + TableEntry leaf_entry; + ts_language_table_entry(self->language, state, tree->first_leaf.symbol, + &leaf_entry); + + if (!leaf_entry.is_reusable) { + LOG_ACTION("cant_reuse_first_leaf tree:%s, leaf:%s", + SYM_NAME(tree->symbol), SYM_NAME(tree->first_leaf.symbol)); + return false; + } + + if (tree->child_count == 1 && leaf_entry.depends_on_lookahead) { + LOG_ACTION("cant_reuse_lookahead_dependent tree:%s, leaf:%s", SYM_NAME(tree->symbol), SYM_NAME(tree->first_leaf.symbol)); + return false; + } + } else if (entry.depends_on_lookahead) { + LOG_ACTION("cant_reuse_lookahead_dependent tree:%s", SYM_NAME(tree->symbol)); + return false; + } + } + } return true; } -static TSTree *ts_parser__lex(TSParser *self, TSStateId state, bool error_mode) { +static TSTree *ts_parser__lex(TSParser *self, TSStateId parse_state, + bool error_mode) { + TSStateId state = error_mode ? 
0 : self->language->lex_states[parse_state]; + LOG("lex state:%d", state); + TSLength position = self->lexer.current_position; ts_lexer_start(&self->lexer, state); @@ -247,6 +283,7 @@ static TSTree *ts_parser__lex(TSParser *self, TSStateId state, bool error_mode) TSTree *result; if (lex_result.symbol == ts_builtin_sym_error) { + LOG("accept_error_token"); result = ts_tree_make_error(lex_result.size, lex_result.padding, lex_result.first_unexpected_character); } else { @@ -254,14 +291,12 @@ static TSTree *ts_parser__lex(TSParser *self, TSStateId state, bool error_mode) result = ts_tree_make_leaf( lex_result.symbol, lex_result.padding, lex_result.size, ts_language_symbol_metadata(self->language, lex_result.symbol)); + if (!result) + return NULL; + result->parse_state = parse_state; + result->first_leaf.lex_state = state; } - if (!result) - return NULL; - - if (lex_result.is_fragile) - result->lex_state = state; - return result; } @@ -284,21 +319,20 @@ static TSTree *ts_parser__get_lookahead(TSParser *self, StackVersion version, if (reusable_node->tree->child_count == 0) ts_parser__breakdown_top_of_stack(self, version); - LOG("breakdown_changed sym:%s", SYM_NAME(reusable_node->tree->symbol)); + LOG_ACTION("breakdown_changed sym:%s", SYM_NAME(reusable_node->tree->symbol)); ts_parser__breakdown_reusable_node(reusable_node); continue; } if (!ts_parser__can_reuse(self, version, reusable_node->tree)) { - LOG("breakdown_unreusable sym:%s", SYM_NAME(reusable_node->tree->symbol)); ts_parser__breakdown_reusable_node(reusable_node); continue; } TSTree *result = reusable_node->tree; TSLength size = ts_tree_total_size(result); - LOG("reuse sym:%s size:%lu extra:%d", SYM_NAME(result->symbol), size.chars, - result->extra); + LOG_ACTION("reuse sym:%s size:%lu extra:%d", SYM_NAME(result->symbol), + size.chars, result->extra); ts_parser__pop_reusable_node(reusable_node); ts_tree_retain(result); return result; @@ -307,9 +341,7 @@ static TSTree *ts_parser__get_lookahead(TSParser *self, 
StackVersion version, ts_lexer_reset(&self->lexer, position); TSStateId parse_state = ts_stack_top_state(self->stack, version); bool error_mode = parse_state == ts_parse_state_error; - TSStateId lex_state = error_mode ? 0 : self->language->lex_states[parse_state]; - LOG("lex state:%d", lex_state); - return ts_parser__lex(self, lex_state, error_mode); + return ts_parser__lex(self, parse_state, error_mode); } static bool ts_parser__select_tree(TSParser *self, TSTree *left, TSTree *right) { @@ -487,7 +519,7 @@ static Reduction ts_parser__reduce(TSParser *self, StackVersion version, return (Reduction){ ReduceSucceeded, pop.slices.contents[0] }; error: - return (Reduction){ ReduceFailed }; + return (Reduction){ ReduceFailed, {} }; } static inline const TSParseAction *ts_parser__reductions_after_sequence( diff --git a/src/runtime/tree.c b/src/runtime/tree.c index fb268f1d..de5eba0b 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -28,15 +28,13 @@ TSTree *ts_tree_make_leaf(TSSymbol sym, TSLength padding, TSLength size, .padding = padding, .visible = metadata.visible, .named = metadata.named, - .lex_state = TS_TREE_STATE_INDEPENDENT, .parse_state = TS_TREE_STATE_INDEPENDENT, + .first_leaf = + { + .symbol = sym, .lex_state = TS_TREE_STATE_INDEPENDENT, + }, }; - if (sym == ts_builtin_sym_error) { - result->fragile_left = true; - result->fragile_right = true; - } - return result; } @@ -81,6 +79,8 @@ TSTree *ts_tree_make_error(TSLength size, TSLength padding, char lookahead_char) if (!result) return NULL; + result->fragile_left = true; + result->fragile_right = true; result->lookahead_char = lookahead_char; return result; } @@ -174,7 +174,7 @@ void ts_tree_set_children(TSTree *self, size_t child_count, TSTree **children) { } if (child_count > 0) { - self->lex_state = children[0]->lex_state; + self->first_leaf = children[0]->first_leaf; if (children[0]->fragile_left) self->fragile_left = true; if (children[child_count - 1]->fragile_right) @@ -206,9 +206,16 @@ TSTree 
*ts_tree_make_error_node(TreeArray *children) { } } - return ts_tree_make_node( + TSTree *result = ts_tree_make_node( ts_builtin_sym_error, children->size, children->contents, (TSSymbolMetadata){.extra = false, .visible = true, .named = true }); + + if (!result) + return NULL; + + result->fragile_left = true; + result->fragile_right = true; + return result; } void ts_tree_retain(TSTree *self) { diff --git a/src/runtime/tree.h b/src/runtime/tree.h index 6eec0479..819bd786 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -32,10 +32,14 @@ typedef struct TSTree { TSLength size; TSSymbol symbol; - TSStateId lex_state; TSStateId parse_state; size_t error_size; + struct { + TSSymbol symbol; + TSStateId lex_state; + } first_leaf; + unsigned short ref_count; bool visible : 1; bool named : 1;