#include "compiler/build_tables/lex_table_builder.h"
// NOTE(review): the names of the standard headers below, and every template
// argument list in this file, were missing from the text this file was
// recovered from. They have been reconstructed from how each name is used
// here; confirm them against lex_table_builder.h and the other project headers.
#include <cstddef>
#include <cstdint>
#include <cwctype>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "compiler/build_tables/lex_conflict_manager.h"
#include "compiler/build_tables/lex_item.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rule.h"

namespace tree_sitter {
namespace build_tables {

using std::iswalpha;
using std::map;
using std::pair;
using std::set;
using std::string;
using std::vector;
using std::unordered_map;
using std::unique_ptr;
using rules::Rule;
using rules::Blank;
using rules::Choice;
using rules::CharacterSet;
using rules::Repeat;
using rules::Symbol;
using rules::Metadata;
using rules::Seq;

// Accumulates, in `result`, the character sets reachable at the start of the
// rules passed to `apply`.
//
// Traversal rules (as implemented below):
//   - Seq:          only the left operand is visited.
//   - Choice:       every element is visited.
//   - Repeat:       the repeated rule is visited.
//   - Metadata:     the wrapped rule is visited.
//   - CharacterSet: added to `result`.
//   - anything else (including Blank) contributes nothing.
class StartingCharacterAggregator {
 public:
  void apply(const Rule &rule) {
    rule.match(
      [this](const Seq &sequence) {
        apply(*sequence.left);
      },

      [this](const rules::Choice &rule) {
        for (const auto &element : rule.elements) {
          apply(element);
        }
      },

      [this](const rules::Repeat &rule) {
        apply(*rule.rule);
      },

      [this](const rules::Metadata &rule) {
        apply(*rule.rule);
      },

      [this](const rules::CharacterSet &rule) {
        result.add_set(rule);
      },

      [this](const rules::Blank) {},

      [](auto) {}
    );
  }

  CharacterSet result;
};

// Concrete implementation behind the LexTableBuilder interface.
//
// On construction it computes, for every pair of lexical tokens, whether the
// two tokens are considered incompatible (see `detect_conflict`). `build`
// then produces the lex table for a parse table.
class LexTableBuilderImpl : public LexTableBuilder {
  LexTable lex_table;
  const LexicalGrammar grammar;

  // One `Repeat{separator}` per grammar separator, followed by a `Blank`.
  vector<Rule> separator_rules;
  LexConflictManager conflict_manager;

  // Maps an item set to the id of the lex state already created for it.
  unordered_map<LexItemSet, LexStateId> lex_state_ids;

  // Cache used by `detect_conflict`: characters that can start any token
  // that may follow the token with the given index.
  map<Symbol::Index, CharacterSet> following_characters_by_token_index;

  // Result of the pairwise conflict analysis, indexed by token index.
  vector<set<Symbol>> incompatible_tokens_by_token_index;
  CharacterSet separator_start_characters;

  // Scratch state consulted by `add_advance_actions` while a lex state graph
  // is being built.
  CharacterSet current_conflict_detection_following_characters;
  // Value-initialized because `add_advance_actions` reads this member even
  // when `detect_conflict` has never run (a grammar with fewer than two
  // tokens); previously that was a read of an uninitialized value.
  Symbol::Index current_conflict_detection_token_index{};
  bool current_conflict_value;

 public:
  // Collects the separator rules and their starting characters, then runs
  // `detect_conflict` for every ordered pair of distinct tokens (i, j) and
  // records `j` as incompatible with `i` when a conflict is reported.
  //
  // Additionally, when a token is the internal counterpart of an external
  // token, every external token is recorded as incompatible with it.
  //
  // NOTE(review): the element type `set<Symbol::Index>` of the third
  // parameter is reconstructed; it must match the declaration of
  // LexTableBuilder::create in the header.
  LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar,
                      const LexicalGrammar &lexical_grammar,
                      const vector<set<Symbol::Index>> &following_tokens_by_token_index)
    : grammar(lexical_grammar),
      incompatible_tokens_by_token_index(lexical_grammar.variables.size()) {
    StartingCharacterAggregator separator_character_aggregator;
    for (const auto &rule : grammar.separators) {
      separator_rules.push_back(Repeat{rule});
      separator_character_aggregator.apply(rule);
    }
    separator_rules.push_back(Blank{});
    separator_start_characters = separator_character_aggregator.result;
    clear();

    for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) {
      Symbol token = Symbol::terminal(i);
      auto &incompatible_indices = incompatible_tokens_by_token_index[i];

      for (unsigned j = 0; j < n; j++) {
        if (i == j) continue;
        if (detect_conflict(i, j, following_tokens_by_token_index)) {
          incompatible_indices.insert(Symbol::terminal(j));
        }
      }

      for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
        if (external_token.corresponding_internal_token == token) {
          for (unsigned j = 0; j < syntax_grammar.external_tokens.size(); j++) {
            incompatible_indices.insert(Symbol::external(j));
          }
        }
      }
    }
  }

  // Assigns a lex state to every parse state, marks fragile tokens in the
  // parse table, merges identical lex states, and returns a copy of the
  // resulting lex table.
  LexTable build(ParseTable *parse_table) {
    for (ParseState &parse_state : parse_table->states) {
      parse_state.lex_state_id = add_lex_state(
        item_set_for_terminals(parse_state.terminal_entries)
      );
    }
    mark_fragile_tokens(parse_table);
    remove_duplicate_lex_states(parse_table);
    return lex_table;
  }

  // Returns the symbols recorded as incompatible with the token at `index`.
  // `index` is not bounds-checked.
  const set<Symbol> &get_incompatible_tokens(Symbol::Index index) const {
    return incompatible_tokens_by_token_index[index];
  }

  // Reports whether lexing `left` and `right` from the same state raises the
  // conflict flag for `right`.
  //
  // Returns false immediately when the two tokens' starting characters do not
  // overlap each other and neither overlaps the separators' starting
  // characters. Otherwise it builds a throwaway lex state graph for the two
  // tokens; `add_advance_actions` sets `current_conflict_value` as a side
  // effect. The scratch tables are cleared before returning.
  bool detect_conflict(Symbol::Index left, Symbol::Index right,
                       const vector<set<Symbol::Index>> &following_tokens_by_token_index) {
    StartingCharacterAggregator left_starting_characters;
    StartingCharacterAggregator right_starting_characters;
    left_starting_characters.apply(grammar.variables[left].rule);
    right_starting_characters.apply(grammar.variables[right].rule);
    if (!left_starting_characters.result.intersects(right_starting_characters.result) &&
        !left_starting_characters.result.intersects(separator_start_characters) &&
        !right_starting_characters.result.intersects(separator_start_characters)) {
      return false;
    }

    // The following-character set for `right` is computed once and cached.
    auto following_characters_entry = following_characters_by_token_index.find(right);
    if (following_characters_entry == following_characters_by_token_index.end()) {
      StartingCharacterAggregator aggregator;
      for (auto following_token_index : following_tokens_by_token_index[right]) {
        aggregator.apply(grammar.variables[following_token_index].rule);
      }
      following_characters_entry =
        following_characters_by_token_index.insert({right, aggregator.result}).first;

      // TODO - Refactor this. In general, a keyword token cannot be followed immediately by
      // another alphanumeric character. But this requirement is currently not expressed anywhere in
      // the grammar. So without this hack, we would be overly conservative about merging parse
      // states because we would often consider `identifier` tokens to *conflict* with keyword
      // tokens.
      if (is_keyword(grammar.variables[right])) {
        following_characters_entry->second
          .exclude('a', 'z')
          .exclude('A', 'Z')
          .exclude('0', '9')
          .exclude('_')
          .exclude('$');
      }
    }

    current_conflict_detection_token_index = right;
    current_conflict_detection_following_characters = following_characters_entry->second;
    add_lex_state(item_set_for_terminals({{Symbol::terminal(left), {}}, {Symbol::terminal(right), {}}}));
    bool result = current_conflict_value;
    clear();
    return result;
  }

  // A variable counts as a keyword when it is a string token and the
  // character reported by `get_last_character` is alphabetic.
  bool is_keyword(const LexicalVariable &variable) {
    return variable.is_string && iswalpha(get_last_character(variable.rule));
  }

  // Follows the right operand of sequences (and unwraps metadata) until it
  // reaches a character set, then returns the first element of that set's
  // `included_chars`. Returns 0 for any other kind of rule.
  //
  // NOTE(review): `included_chars.begin()` is dereferenced without an
  // emptiness check; this presumably relies on string tokens only containing
  // non-empty character sets — confirm.
  static uint32_t get_last_character(const Rule &rule) {
    return rule.match(
      [](const Seq &sequence) {
        return get_last_character(*sequence.right);
      },
      [](const rules::CharacterSet &rule) {
        return *rule.included_chars.begin();
      },
      [](const rules::Metadata &rule) {
        return get_last_character(*rule.rule);
      },
      [](auto) {
        return 0;
      }
    );
  }

  // Returns the id of the lex state for `item_set`, creating the state (and,
  // recursively, every state reachable from it) if it does not exist yet.
  LexStateId add_lex_state(const LexItemSet &item_set) {
    auto existing = lex_state_ids.find(item_set);
    if (existing == lex_state_ids.end()) {
      LexStateId state_id = lex_table.states.size();
      lex_table.states.push_back(LexState());
      // Register the id before recursing so that an item set reachable from
      // itself resolves to this state instead of being created again.
      lex_state_ids[item_set] = state_id;
      add_accept_token_actions(item_set, state_id);
      add_advance_actions(item_set, state_id);
      return state_id;
    } else {
      return existing->second;
    }
  }

  // Discards all lex states built so far and resets the conflict flag and
  // the following-character scratch set.
  void clear() {
    lex_table.states.clear();
    lex_state_ids.clear();
    current_conflict_detection_following_characters = CharacterSet();
    current_conflict_value = false;
  }

 private:
  // Adds the advance actions of state `state_id` for every transition of
  // `item_set`, creating destination states as needed. While doing so it
  // raises `current_conflict_value` when the token currently under conflict
  // detection is affected by an accept/advance decision in this state.
  void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const auto &transition_entry : item_set.transitions()) {
      const CharacterSet &characters = transition_entry.first;
      const LexItemSet::Transition &transition = transition_entry.second;

      // The destination state index is filled in below, once it is known.
      AdvanceAction action(-1, transition.precedence, transition.in_main_token);
      // This reference is only used before `add_lex_state` is called below,
      // which may grow `lex_table.states`.
      AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action;
      if (accept_action.is_present()) {
        bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action);
        bool can_advance_for_accepted_token = false;
        for (const LexItem &item : transition.destination.entries) {
          if (item.lhs == accept_action.symbol) {
            can_advance_for_accepted_token = true;
          } else if (item.lhs.index == current_conflict_detection_token_index &&
                     !prefer_advancing && !transition.in_main_token) {
            // The token under test could continue here, but accepting the
            // other token wins.
            current_conflict_value = true;
          }
        }

        // The token under test is the accepted one, it cannot itself advance
        // on these characters, and the characters overlap either a separator
        // start or (for string tokens) the characters that may follow it.
        if (accept_action.symbol.index == current_conflict_detection_token_index &&
            !can_advance_for_accepted_token &&
            (characters.intersects(separator_start_characters) ||
             (characters.intersects(current_conflict_detection_following_characters) &&
              grammar.variables[accept_action.symbol.index].is_string))) {
          current_conflict_value = true;
        }

        if (!prefer_advancing) continue;
      }

      action.state_index = add_lex_state(transition.destination);
      lex_table.states[state_id].advance_actions[characters] = action;
    }
  }

  // Sets the accept action of state `state_id` from the completed items of
  // `item_set`; when several items are complete, `conflict_manager` decides
  // which action is kept.
  void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) {
    for (const LexItem &item : item_set.entries) {
      LexItem::CompletionStatus completion_status = item.completion_status();
      if (completion_status.is_done) {
        AcceptTokenAction action(item.lhs, completion_status.precedence.max,
                                 item.lhs.is_built_in() ||
                                 grammar.variables[item.lhs.index].is_string);

        AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
        if (!existing_action.is_present() ||
            conflict_manager.resolve(action, existing_action)) {
          lex_table.states[state_id].accept_action = action;
        }
      }
    }
  }

  // For every terminal entry of every parse state:
  //   - clears `reusable` if one of the token's possible homonyms is also a
  //     terminal entry of the same state;
  //   - otherwise (only if the entry is still reusable) sets
  //     `depends_on_lookahead` if one of the token's possible extensions is
  //     also a terminal entry of the same state.
  void mark_fragile_tokens(ParseTable *parse_table) {
    for (ParseState &state : parse_table->states) {
      for (auto &entry : state.terminal_entries) {
        Symbol symbol = entry.first;
        if (symbol.is_terminal()) {
          auto homonyms = conflict_manager.possible_homonyms.find(symbol.index);
          if (homonyms != conflict_manager.possible_homonyms.end()) {
            for (Symbol::Index homonym : homonyms->second) {
              if (state.terminal_entries.count(Symbol::terminal(homonym))) {
                entry.second.reusable = false;
                break;
              }
            }
          }

          if (!entry.second.reusable) {
            continue;
          }

          auto extensions = conflict_manager.possible_extensions.find(symbol.index);
          if (extensions != conflict_manager.possible_extensions.end()) {
            for (Symbol::Index extension : extensions->second) {
              if (state.terminal_entries.count(Symbol::terminal(extension))) {
                entry.second.depends_on_lookahead = true;
                break;
              }
            }
          }
        }
      }
    }
  }

  // Repeatedly merges lex states that compare equal, renumbering advance
  // actions after each pass, until a pass finds no duplicates. Finally
  // rewrites each parse state's `lex_state_id` through the accumulated
  // replacement map.
  void remove_duplicate_lex_states(ParseTable *parse_table) {
    // Reset these two fields on every state before comparing states.
    for (LexState &state : lex_table.states) {
      state.accept_action.is_string = false;
      state.accept_action.precedence = 0;
    }

    // Original state id -> final state id, accumulated over all passes.
    map<LexStateId, LexStateId> replacements;

    while (true) {
      // Duplicate state id -> the earlier, identical state that replaces it.
      map<LexStateId, LexStateId> duplicates;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        for (LexStateId j = 0; j < i; j++) {
          if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) {
            duplicates.insert({ i, j });
            break;
          }
        }
      }

      if (duplicates.empty()) break;

      // State id at the start of this pass -> state id after this pass.
      map<LexStateId, LexStateId> new_replacements;
      for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) {
        LexStateId new_state_index = i;
        auto duplicate = duplicates.find(i);
        if (duplicate != duplicates.end()) {
          new_state_index = duplicate->second;
        }

        // Shift the index down by the number of removed states before it.
        size_t prior_removed = 0;
        for (const auto &duplicate : duplicates) {
          if (duplicate.first >= new_state_index) break;
          prior_removed++;
        }

        new_state_index -= prior_removed;
        new_replacements.insert({i, new_state_index});
        replacements.insert({ i, new_state_index });
        for (auto &replacement : replacements) {
          if (replacement.second == i) {
            replacement.second = new_state_index;
          }
        }
      }

      for (auto &state : lex_table.states) {
        for (auto &entry : state.advance_actions) {
          auto new_replacement = new_replacements.find(entry.second.state_index);
          if (new_replacement != new_replacements.end()) {
            entry.second.state_index = new_replacement->second;
          }
        }
      }

      // Erase from the highest index down so the remaining indices stay valid.
      for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) {
        lex_table.states.erase(lex_table.states.begin() + i->first);
      }
    }

    for (ParseState &parse_state : parse_table->states) {
      auto replacement = replacements.find(parse_state.lex_state_id);
      if (replacement != replacements.end()) {
        parse_state.lex_state_id = replacement->second;
      }
    }
  }

  // Builds the initial item set for the terminal symbols among the keys of
  // `terminals`: one item per (rule alternative, separator rule) pair, each
  // of the form separator(seq(separator_rule, main_token(rule))).
  //
  // NOTE(review): the mapped type `ParseTableEntry` is reconstructed from the
  // call sites (`parse_state.terminal_entries`); confirm against parse_table.h.
  LexItemSet item_set_for_terminals(const map<Symbol, ParseTableEntry> &terminals) {
    LexItemSet result;
    for (const auto &terminal_entry : terminals) {
      Symbol symbol = terminal_entry.first;
      if (symbol.is_terminal()) {
        for (const auto &rule : rules_for_symbol(symbol)) {
          for (const auto &separator_rule : separator_rules) {
            result.entries.insert(LexItem(
              symbol,
              Metadata::separator(
                Rule::seq({
                  separator_rule,
                  Metadata::main_token(rule)
                })
              )
            ));
          }
        }
      }
    }
    return result;
  }

  // Returns the alternatives of the token's rule: the elements of a
  // top-level Choice, or the rule itself otherwise. END_OF_INPUT is
  // represented by a character set containing only the character 0.
  vector<Rule> rules_for_symbol(const rules::Symbol &symbol) {
    if (symbol == rules::END_OF_INPUT()) {
      return { CharacterSet().include(0) };
    }

    return grammar.variables[symbol.index].rule.match(
      [](const Choice &choice) {
        return choice.elements;
      },

      [](auto rule) {
        return vector<Rule>{ rule };
      }
    );
  }
};

// Factory: constructs the concrete builder behind the abstract interface.
unique_ptr<LexTableBuilder> LexTableBuilder::create(const SyntaxGrammar &syntax_grammar,
                                                    const LexicalGrammar &lexical_grammar,
                                                    const vector<set<Symbol::Index>> &following_tokens) {
  return unique_ptr<LexTableBuilder>(new LexTableBuilderImpl(
    syntax_grammar,
    lexical_grammar,
    following_tokens
  ));
}

// The two forwarding functions below downcast `this` to the implementation
// class.
// NOTE(review): the cast target types are reconstructed; this assumes
// LexTableBuilderImpl is the only class deriving from LexTableBuilder.
LexTable LexTableBuilder::build(ParseTable *parse_table) {
  return static_cast<LexTableBuilderImpl *>(this)->build(parse_table);
}

const set<Symbol> &LexTableBuilder::get_incompatible_tokens(Symbol::Index token) const {
  return static_cast<const LexTableBuilderImpl *>(this)->get_incompatible_tokens(token);
}

}  // namespace build_tables
}  // namespace tree_sitter