diff --git a/.gitmodules b/.gitmodules index 72fbdb67..bdbfaaf8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,4 +12,4 @@ url = https://github.com/udp/json-parser.git [submodule "externals/crypto-algorithms"] path = externals/crypto-algorithms - url = https://github.com/B-Con/crypto-algorithms.git + url = https://github.com/maxbrunsfeld/crypto-algorithms.git diff --git a/doc/grammar-schema.json b/doc/grammar-schema.json index 916d5eff..282a51d0 100644 --- a/doc/grammar-schema.json +++ b/doc/grammar-schema.json @@ -182,41 +182,7 @@ "properties": { "type": { "type": "string", - "pattern": "^PREC$" - }, - "value": { - "type": "integer" - }, - "content": { - "$ref": "#/definitions/rule" - } - }, - "required": ["type", "content", "value"] - }, - - "prec-left-rule": { - "type": "object", - "properties": { - "type": { - "type": "string", - "pattern": "^PREC_LEFT$" - }, - "value": { - "type": "integer" - }, - "content": { - "$ref": "#/definitions/rule" - } - }, - "required": ["type", "content", "value"] - }, - - "prec-right-rule": { - "type": "object", - "properties": { - "type": { - "type": "string", - "pattern": "^PREC_RIGHT$" + "pattern": "^(PREC|PREC_LEFT|PREC_RIGHT|PREC_DYNAMIC)$" }, "value": { "type": "integer" @@ -239,9 +205,7 @@ { "$ref": "#/definitions/repeat1-rule" }, { "$ref": "#/definitions/repeat-rule" }, { "$ref": "#/definitions/token-rule" }, - { "$ref": "#/definitions/prec-rule" }, - { "$ref": "#/definitions/prec-left-rule" }, - { "$ref": "#/definitions/prec-right-rule" } + { "$ref": "#/definitions/prec-rule" } ] } } diff --git a/externals/crypto-algorithms b/externals/crypto-algorithms index cfbde484..c7e5c23a 160000 --- a/externals/crypto-algorithms +++ b/externals/crypto-algorithms @@ -1 +1 @@ -Subproject commit cfbde48414baacf51fc7c74f275190881f037d32 +Subproject commit c7e5c23ab04ecfb5465cbefbe17ba23d4cb3bc9d diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 6e68a9f4..295b3c96 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -42,6 +42,7 @@ typedef struct { union { TSStateId to_state; struct { + short dynamic_precedence; TSSymbol symbol; unsigned short child_count; }; @@ -145,21 +146,30 @@ typedef struct TSLanguage { { .type = TSParseActionTypeShift, .extra = true } \ } -#define REDUCE(symbol_val, child_count_val) \ +#define REDUCE(symbol_val, child_count_val, dynamic_precedence_val) \ { \ { \ .type = TSParseActionTypeReduce, \ - .params = {.symbol = symbol_val, .child_count = child_count_val } \ + .params = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + .dynamic_precedence = dynamic_precedence_val, \ + } \ } \ } -#define REDUCE_FRAGILE(symbol_val, child_count_val) \ - { \ - { \ - .type = TSParseActionTypeReduce, .fragile = true, \ - .params = {.symbol = symbol_val, .child_count = child_count_val } \ - } \ - } +#define REDUCE_FRAGILE(symbol_val, child_count_val, dynamic_precedence_val) \ +{ \ + { \ + .type = TSParseActionTypeReduce, \ + .fragile = true, \ + .params = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + .dynamic_precedence = dynamic_precedence_val, \ + } \ + } \ +} #define ACCEPT_INPUT() \ { \ diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 503387f6..852f7b7d 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -9,7 +9,7 @@ extern "C" { #include #include -#define TREE_SITTER_LANGUAGE_VERSION 2 +#define TREE_SITTER_LANGUAGE_VERSION 3 typedef unsigned short TSSymbol; typedef struct TSLanguage TSLanguage; diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index c5519555..5dd12fa8 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -16,11 +17,13 @@ namespace tree_sitter { namespace build_tables { +using std::deque; using std::find; using std::pair; using std::vector; using std::set; using std::map; +using std::move; using std::string; using std::to_string; using std::unordered_map; @@ -28,75 +31,77 @@ using rules::Associativity; using rules::Symbol; using rules::END_OF_INPUT; +using SymbolSequence = vector; + +struct ParseStateQueueEntry { + SymbolSequence preceding_symbols; + ParseItemSet item_set; + ParseStateId state_id; +}; + class ParseTableBuilder { const SyntaxGrammar grammar; const LexicalGrammar lexical_grammar; unordered_map recovery_states; unordered_map parse_state_ids; - vector> item_sets_to_process; + deque parse_state_queue; ParseTable parse_table; set conflicts; ParseItemSetBuilder item_set_builder; set fragile_productions; vector> incompatible_tokens_by_index; - bool allow_any_conflict; + vector> following_terminals_by_terminal_index; + bool processing_recovery_states; public: - ParseTableBuilder(const SyntaxGrammar &grammar, - const LexicalGrammar &lex_grammar) - : grammar(grammar), - lexical_grammar(lex_grammar), - item_set_builder(grammar, lex_grammar), - allow_any_conflict(false) {} + ParseTableBuilder(const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) + : grammar(grammar), + lexical_grammar(lex_grammar), + item_set_builder(grammar, lex_grammar), + incompatible_tokens_by_index(lexical_grammar.variables.size()), + following_terminals_by_terminal_index(lexical_grammar.variables.size()), + processing_recovery_states(false) {} pair build() { Symbol start_symbol = grammar.variables.empty() ? Symbol::terminal(0) : Symbol::non_terminal(0); + Production start_production{{{start_symbol, 0, rules::AssociativityNone}}, 0}; - Production start_production{ - ProductionStep{start_symbol, 0, rules::AssociativityNone}, - }; - - // Placeholder for error state - add_parse_state(ParseItemSet()); - - add_parse_state(ParseItemSet({ + ParseStateId error_state_id = add_parse_state({}, ParseItemSet()); + add_parse_state({}, ParseItemSet({ { ParseItem(rules::START(), start_production, 0), - LookaheadSet({ END_OF_INPUT() }), + LookaheadSet({END_OF_INPUT()}), }, })); CompileError error = process_part_state_queue(); - if (error.type != TSCompileErrorTypeNone) { - return { parse_table, error }; - } + if (error.type != TSCompileErrorTypeNone) return {parse_table, error}; compute_unmergable_token_pairs(); - build_error_parse_state(); - - allow_any_conflict = true; + processing_recovery_states = true; + build_error_parse_state(error_state_id); process_part_state_queue(); - allow_any_conflict = false; mark_fragile_actions(); remove_duplicate_parse_states(); - - return { parse_table, CompileError::none() }; + return {parse_table, CompileError::none()}; } private: CompileError process_part_state_queue() { - while (!item_sets_to_process.empty()) { - auto pair = item_sets_to_process.back(); - ParseItemSet &item_set = pair.first; - ParseStateId state_id = pair.second; - item_sets_to_process.pop_back(); + while (!parse_state_queue.empty()) { + auto entry = parse_state_queue.front(); + parse_state_queue.pop_front(); - item_set_builder.apply_transitive_closure(&item_set); - string conflict = add_actions(item_set, state_id); + item_set_builder.apply_transitive_closure(&entry.item_set); + string conflict = add_actions( + move(entry.preceding_symbols), + move(entry.item_set), + entry.state_id + ); if (!conflict.empty()) { return CompileError(TSCompileErrorTypeParseConflict, conflict); @@ -106,7 +111,7 @@ class ParseTableBuilder { return CompileError::none(); } - void build_error_parse_state() { + void build_error_parse_state(ParseStateId state_id) { ParseState error_state; for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { @@ -141,14 +146,14 @@ class ParseTableBuilder { } error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); - parse_table.states[0] = error_state; + parse_table.states[state_id] = error_state; } void add_out_of_context_parse_state(ParseState *error_state, const rules::Symbol &symbol) { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { - ParseStateId state = add_parse_state(item_set); + ParseStateId state = add_parse_state({}, item_set); if (symbol.is_non_terminal()) { error_state->nonterminal_entries[symbol.index] = state; } else { @@ -157,21 +162,24 @@ class ParseTableBuilder { } } - ParseStateId add_parse_state(const ParseItemSet &item_set) { + ParseStateId add_parse_state(SymbolSequence &&preceding_symbols, const ParseItemSet &item_set) { auto pair = parse_state_ids.find(item_set); if (pair == parse_state_ids.end()) { ParseStateId state_id = parse_table.states.size(); parse_table.states.push_back(ParseState()); parse_state_ids[item_set] = state_id; - parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature(); - item_sets_to_process.push_back({ std::move(item_set), state_id }); + parse_state_queue.push_back({ + move(preceding_symbols), + move(item_set), + state_id + }); return state_id; } else { return pair->second; } } - string add_actions(const ParseItemSet &item_set, ParseStateId state_id) { + string add_actions(SymbolSequence &&sequence, ParseItemSet &&item_set, ParseStateId state_id) { map terminal_successors; map nonterminal_successors; set lookaheads_with_conflicts; @@ -197,7 +205,7 @@ class ParseTableBuilder { parse_table.add_terminal_action(state_id, lookahead, action); } else { ParseAction &existing_action = entry.actions[0]; - if (existing_action.type == ParseActionTypeAccept || allow_any_conflict) { + if (existing_action.type == ParseActionTypeAccept || processing_recovery_states) { entry.actions.push_back(action); } else { int existing_precedence = existing_action.precedence(); @@ -236,29 +244,31 @@ class ParseTableBuilder { for (auto &pair : terminal_successors) { Symbol lookahead = pair.first; ParseItemSet &next_item_set = pair.second; - ParseStateId next_state_id = add_parse_state(next_item_set); + ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); ParseState &state = parse_table.states[state_id]; bool had_existing_action = !state.terminal_entries[lookahead].actions.empty(); parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id)); - if (!allow_any_conflict) { - if (had_existing_action) + if (!processing_recovery_states) { + if (had_existing_action) { lookaheads_with_conflicts.insert(lookahead); + } recovery_states[lookahead].add(next_item_set); } } // Add a Shift action for each non-terminal transition. for (auto &pair : nonterminal_successors) { - Symbol::Index lookahead = pair.first; + Symbol lookahead = Symbol::non_terminal(pair.first); ParseItemSet &next_item_set = pair.second; - ParseStateId next_state = add_parse_state(next_item_set); - parse_table.set_nonterminal_action(state_id, lookahead, next_state); - if (!allow_any_conflict) - recovery_states[Symbol::non_terminal(lookahead)].add(next_item_set); + ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); + parse_table.set_nonterminal_action(state_id, lookahead.index, next_state_id); + if (!processing_recovery_states) { + recovery_states[lookahead].add(next_item_set); + } } for (Symbol lookahead : lookaheads_with_conflicts) { - string conflict = handle_conflict(item_set, state_id, lookahead); + string conflict = handle_conflict(item_set, sequence, state_id, lookahead); if (!conflict.empty()) return conflict; } @@ -266,7 +276,7 @@ class ParseTableBuilder { ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { if (!state.terminal_entries.count(extra_symbol) || - state.has_shift_action() || allow_any_conflict) { + state.has_shift_action() || processing_recovery_states) { parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } @@ -281,9 +291,10 @@ class ParseTableBuilder { for (ParseAction &action : actions) { if (action.type == ParseActionTypeReduce) { - if (has_fragile_production(action.production)) + if (has_fragile_production(action.production)) { action.fragile = true; - action.production = NULL; + } + action.production = nullptr; } } @@ -305,8 +316,6 @@ class ParseTableBuilder { } void compute_unmergable_token_pairs() { - incompatible_tokens_by_index.resize(lexical_grammar.variables.size()); - auto lex_table_builder = LexTableBuilder::create(lexical_grammar); for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { Symbol token = Symbol::terminal(i); @@ -314,7 +323,7 @@ class ParseTableBuilder { for (unsigned j = 0; j < n; j++) { if (i == j) continue; - if (lex_table_builder->detect_conflict(i, j)) { + if (lex_table_builder->detect_conflict(i, j, following_terminals_by_terminal_index)) { incompatible_indices.insert(Symbol::terminal(j)); } } @@ -330,11 +339,12 @@ class ParseTableBuilder { } void remove_duplicate_parse_states() { - map> state_indices_by_signature; + unordered_map> state_indices_by_signature; - for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - ParseState &state = parse_table.states[i]; - state_indices_by_signature[state.shift_actions_signature].insert(i); + for (auto &pair : parse_state_ids) { + const ParseItemSet &item_set = pair.first; + ParseStateId state_id = pair.second; + state_indices_by_signature[item_set.unfinished_item_signature()].insert(state_id); } set deleted_states; @@ -343,14 +353,18 @@ class ParseTableBuilder { map state_replacements; for (auto &pair : state_indices_by_signature) { - auto &state_group = pair.second; + auto &state_indices = pair.second; - for (ParseStateId i : state_group) { - for (ParseStateId j : state_group) { - if (j == i) break; - if (!state_replacements.count(j) && merge_parse_state(j, i)) { - state_replacements.insert({ i, j }); - deleted_states.insert(i); + for (auto i = state_indices.begin(), end = state_indices.end(); i != end;) { + for (ParseStateId j : state_indices) { + if (j == *i) { + ++i; + break; + } + if (!state_replacements.count(j) && merge_parse_state(j, *i)) { + state_replacements.insert({*i, j}); + deleted_states.insert(*i); + i = state_indices.erase(i); break; } } @@ -360,11 +374,8 @@ class ParseTableBuilder { if (state_replacements.empty()) break; for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - ParseState &state = parse_table.states[i]; - - if (state_replacements.count(i)) { - state_indices_by_signature[state.shift_actions_signature].erase(i); - } else { + if (!state_replacements.count(i)) { + ParseState &state = parse_table.states[i]; state.each_referenced_state([&state_replacements](ParseStateId *state_index) { auto replacement = state_replacements.find(*state_index); if (replacement != state_replacements.end()) { @@ -404,7 +415,7 @@ class ParseTableBuilder { static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { for (const auto &pair : state.terminal_entries) - if (pair.second == entry) + if (pair.second.actions == entry.actions) return true; return false; } @@ -417,13 +428,12 @@ class ParseTableBuilder { for (auto &entry : state.terminal_entries) { Symbol lookahead = entry.first; - const auto &other_entry = other.terminal_entries.find(lookahead); + const auto &other_entry = other.terminal_entries.find(lookahead); if (other_entry == other.terminal_entries.end()) { + if (lookahead.is_external()) return false; if (entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(other, entry.second)) return false; - - if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) { if (other.terminal_entries.count(incompatible_token)) return false; @@ -440,10 +450,9 @@ class ParseTableBuilder { Symbol lookahead = entry.first; if (!state.terminal_entries.count(lookahead)) { + if (lookahead.is_external()) return false; if (entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(state, entry.second)) return false; - - if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) { if (state.terminal_entries.count(incompatible_token)) return false; @@ -461,8 +470,8 @@ class ParseTableBuilder { return true; } - string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, - Symbol lookahead) { + string handle_conflict(const ParseItemSet &item_set, const SymbolSequence &preceding_symbols, + ParseStateId state_id, Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; int reduction_precedence = entry.actions.front().precedence(); set shift_items; @@ -556,24 +565,13 @@ class ParseTableBuilder { if (expected_conflict == actual_conflict) return ""; - ParseItem earliest_starting_item; - for (const ParseAction &action : entry.actions) - if (action.type == ParseActionTypeReduce) - if (action.consumed_symbol_count > earliest_starting_item.step_index) - earliest_starting_item = ParseItem(action.symbol, *action.production, action.consumed_symbol_count); - - for (const ParseItem &shift_item : shift_items) - if (shift_item.step_index > earliest_starting_item.step_index) - earliest_starting_item = shift_item; - string description = "Unresolved conflict for symbol sequence:\n\n"; - for (size_t i = 0; i < earliest_starting_item.step_index; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (auto &symbol : preceding_symbols) { + description += " " + symbol_name(symbol); } description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n\n"; - description += "Possible interpretations:\n\n"; size_t interpretation_count = 1; @@ -581,12 +579,12 @@ class ParseTableBuilder { if (action.type == ParseActionTypeReduce) { description += " " + to_string(interpretation_count++) + ":"; - for (size_t i = 0; i < earliest_starting_item.step_index - action.consumed_symbol_count; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (size_t i = 0; i < preceding_symbols.size() - action.consumed_symbol_count; i++) { + description += " " + symbol_name(preceding_symbols[i]); } description += " (" + symbol_name(action.symbol); - for (const ProductionStep &step : *action.production) { + for (const ProductionStep &step : action.production->steps) { description += " " + symbol_name(step.symbol); } description += ")"; @@ -598,8 +596,8 @@ class ParseTableBuilder { for (const ParseItem &shift_item : shift_items) { description += " " + to_string(interpretation_count++) + ":"; - for (size_t i = 0; i < earliest_starting_item.step_index - shift_item.step_index; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (size_t i = 0; i < preceding_symbols.size() - shift_item.step_index; i++) { + description += " " + symbol_name(preceding_symbols[i]); } description += " (" + symbol_name(shift_item.lhs()); @@ -690,6 +688,30 @@ class ParseTableBuilder { bool has_fragile_production(const Production *production) { return fragile_productions.find(production) != fragile_productions.end(); } + + SymbolSequence append_symbol(const SymbolSequence &sequence, const Symbol &symbol) { + if (!sequence.empty()) { + const LookaheadSet &left_tokens = item_set_builder.get_last_set(sequence.back()); + const LookaheadSet &right_tokens = item_set_builder.get_first_set(symbol); + + if (!left_tokens.empty() && !right_tokens.empty()) { + for (const Symbol &left_symbol : *left_tokens.entries) { + if (left_symbol.is_terminal() && !left_symbol.is_built_in()) { + for (const Symbol &right_symbol : *right_tokens.entries) { + if (right_symbol.is_terminal() && !right_symbol.is_built_in()) { + following_terminals_by_terminal_index[left_symbol.index].insert(right_symbol.index); + } + } + } + } + } + } + + SymbolSequence result(sequence.size() + 1); + result.assign(sequence.begin(), sequence.end()); + result.push_back(symbol); + return result; + } }; pair build_parse_table( diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 4101f854..78413baa 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "compiler/build_tables/lex_conflict_manager.h" #include "compiler/build_tables/lex_item.h" @@ -15,6 +16,7 @@ namespace tree_sitter { namespace build_tables { +using std::iswalpha; using std::map; using std::pair; using std::set; @@ -70,14 +72,16 @@ class LexTableBuilderImpl : public LexTableBuilder { LexTable lex_table; const LexicalGrammar grammar; vector separator_rules; - CharacterSet separator_start_characters; - CharacterSet token_start_characters; LexConflictManager conflict_manager; unordered_map lex_state_ids; - public: - vector shadowed_token_indices; + map following_characters_by_token_index; + CharacterSet separator_start_characters; + CharacterSet current_conflict_detection_following_characters; + Symbol::Index current_conflict_detection_token_index; + bool current_conflict_value; + public: LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) { StartingCharacterAggregator separator_character_aggregator; for (const auto &rule : grammar.separators) { @@ -86,20 +90,6 @@ class LexTableBuilderImpl : public LexTableBuilder { } separator_rules.push_back(Blank{}); separator_start_characters = separator_character_aggregator.result; - - StartingCharacterAggregator token_start_character_aggregator; - for (const auto &variable : grammar.variables) { - token_start_character_aggregator.apply(variable.rule); - } - token_start_characters = token_start_character_aggregator.result; - token_start_characters - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_') - .exclude('$'); - - shadowed_token_indices.resize(grammar.variables.size()); } LexTable build(ParseTable *parse_table) { @@ -113,7 +103,10 @@ class LexTableBuilderImpl : public LexTableBuilder { return lex_table; } - bool detect_conflict(Symbol::Index left, Symbol::Index right) { + bool detect_conflict(Symbol::Index left, Symbol::Index right, + const vector> &following_terminals_by_terminal_index) { + clear(); + StartingCharacterAggregator left_starting_characters; StartingCharacterAggregator right_starting_characters; left_starting_characters.apply(grammar.variables[left].rule); @@ -124,12 +117,47 @@ class LexTableBuilderImpl : public LexTableBuilder { return false; } - clear(); - map terminals; - terminals[Symbol::terminal(left)]; - terminals[Symbol::terminal(right)]; - add_lex_state(item_set_for_terminals(terminals)); - return shadowed_token_indices[right]; + auto following_characters_entry = following_characters_by_token_index.find(right); + if (following_characters_entry == following_characters_by_token_index.end()) { + StartingCharacterAggregator aggregator; + for (auto following_token_index : following_terminals_by_terminal_index[right]) { + aggregator.apply(grammar.variables[following_token_index].rule); + } + following_characters_entry = + following_characters_by_token_index.insert({right, aggregator.result}).first; + + // TODO - Refactor this. In general, a keyword token cannot be followed immediately by + // another alphanumeric character. But this requirement is currently not expressed anywhere in + // the grammar. So without this hack, we would be overly conservative about merging parse + // states because we would often consider `identifier` tokens to *conflict* with keyword + // tokens. + if (is_keyword(grammar.variables[right])) { + following_characters_entry->second + .exclude('a', 'z') + .exclude('A', 'Z') + .exclude('0', '9') + .exclude('_') + .exclude('$'); + } + } + + current_conflict_detection_token_index = right; + current_conflict_detection_following_characters = following_characters_entry->second; + add_lex_state(item_set_for_terminals({{Symbol::terminal(left), {}}, {Symbol::terminal(right), {}}})); + return current_conflict_value; + } + + bool is_keyword(const LexicalVariable &variable) { + return variable.is_string && iswalpha(get_last_character(variable.rule)); + } + + static uint32_t get_last_character(const Rule &rule) { + return rule.match( + [](const Seq &sequence) { return get_last_character(*sequence.right); }, + [](const rules::CharacterSet &rule) { return *rule.included_chars.begin(); }, + [](const rules::Metadata &rule) { return get_last_character(*rule.rule); }, + [](auto) { return 0; } + ); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -149,7 +177,8 @@ class LexTableBuilderImpl : public LexTableBuilder { void clear() { lex_table.states.clear(); lex_state_ids.clear(); - shadowed_token_indices.assign(grammar.variables.size(), false); + current_conflict_detection_following_characters = CharacterSet(); + current_conflict_value = false; } private: @@ -166,17 +195,18 @@ class LexTableBuilderImpl : public LexTableBuilder { for (const LexItem &item : transition.destination.entries) { if (item.lhs == accept_action.symbol) { can_advance_for_accepted_token = true; - } else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) { - shadowed_token_indices[item.lhs.index] = true; + } else if (item.lhs.index == current_conflict_detection_token_index && + !prefer_advancing && !transition.in_main_token) { + current_conflict_value = true; } } - if (!can_advance_for_accepted_token) { - if (characters.intersects(separator_start_characters) || - (grammar.variables[accept_action.symbol.index].is_string && - characters.intersects(token_start_characters))) { - shadowed_token_indices[accept_action.symbol.index] = true; - } + if (accept_action.symbol.index == current_conflict_detection_token_index && + !can_advance_for_accepted_token && + (characters.intersects(separator_start_characters) || + (characters.intersects(current_conflict_detection_following_characters) && + grammar.variables[accept_action.symbol.index].is_string))) { + current_conflict_value = true; } if (!prefer_advancing) continue; @@ -346,8 +376,9 @@ LexTable LexTableBuilder::build(ParseTable *parse_table) { return static_cast(this)->build(parse_table); } -bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right) { - return static_cast(this)->detect_conflict(left, right); +bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right, + const vector> &following_terminals) { + return static_cast(this)->detect_conflict(left, right, following_terminals); } } // namespace build_tables diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 91f24f70..3b896bb7 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -2,6 +2,8 @@ #define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ #include +#include +#include #include "compiler/lex_table.h" namespace tree_sitter { @@ -15,7 +17,11 @@ class LexTableBuilder { public: static std::unique_ptr create(const LexicalGrammar &); LexTable build(ParseTable *); - bool detect_conflict(rules::Symbol::Index, rules::Symbol::Index); + bool detect_conflict( + rules::Symbol::Index, + rules::Symbol::Index, + const std::vector> &following_terminals_by_terminal_index + ); protected: LexTableBuilder() = default; }; diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index baf10a00..99acb3fc 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -60,6 +60,10 @@ int ParseItem::precedence() const { } } +int ParseItem::dynamic_precedence() const { + return production->dynamic_precedence; +} + rules::Associativity ParseItem::associativity() const { if (is_done()) { if (production->empty()) { @@ -93,13 +97,12 @@ size_t ParseItemSet::unfinished_item_signature() const { ParseItem previous_item; for (auto &pair : entries) { const ParseItem &item = pair.first; - if (item.step_index < item.production->size()) { - if (item.variable_index != previous_item.variable_index && - item.step_index != previous_item.step_index) { - hash_combine(&result, item.variable_index); - hash_combine(&result, item.step_index); - previous_item = item; - } + if (item.step_index < item.production->size() && + (item.variable_index != previous_item.variable_index || + item.step_index != previous_item.step_index)) { + hash_combine(&result, item.variable_index); + hash_combine(&result, item.step_index); + previous_item = item; } } return result; diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index 020afc07..5133970d 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -26,6 +26,7 @@ struct ParseItem { rules::Symbol lhs() const; rules::Symbol next_symbol() const; int precedence() const; + int dynamic_precedence() const; rules::Associativity associativity() const; bool is_done() const; diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 36c3942f..77fde864 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -1,4 +1,5 @@ #include "compiler/build_tables/parse_item_set_builder.h" +#include #include #include #include @@ -26,18 +27,20 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::terminal(i); - first_sets.insert({symbol, LookaheadSet({ symbol })}); + first_sets.insert({symbol, LookaheadSet({symbol})}); + last_sets.insert({symbol, LookaheadSet({symbol})}); } for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) { Symbol symbol = Symbol::external(i); - first_sets.insert({symbol, LookaheadSet({ symbol })}); + first_sets.insert({symbol, LookaheadSet({symbol})}); + last_sets.insert({symbol, LookaheadSet({symbol})}); } for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::non_terminal(i); - LookaheadSet first_set; + LookaheadSet first_set; processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push_back(symbol); @@ -57,6 +60,26 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, } first_sets.insert({symbol, first_set}); + + LookaheadSet last_set; + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push_back(symbol); + while (!symbols_to_process.empty()) { + Symbol current_symbol = symbols_to_process.back(); + symbols_to_process.pop_back(); + + if (!current_symbol.is_non_terminal()) { + last_set.insert(current_symbol); + } else if (processed_non_terminals.insert(current_symbol.index).second) { + for (const Production &production : grammar.variables[current_symbol.index].productions) { + if (!production.empty()) { + symbols_to_process.push_back(production.back().symbol); + } + } + } + } + last_sets.insert({symbol, last_set}); } vector components_to_process; @@ -161,5 +184,9 @@ LookaheadSet ParseItemSetBuilder::get_first_set(const rules::Symbol &symbol) con return first_sets.find(symbol)->second; } +LookaheadSet ParseItemSetBuilder::get_last_set(const rules::Symbol &symbol) const { + return last_sets.find(symbol)->second; +} + } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index a319d698..b0334e68 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -20,6 +20,7 @@ class ParseItemSetBuilder { }; std::map first_sets; + std::map last_sets; std::map> component_cache; std::vector> item_set_buffer; @@ -27,6 +28,7 @@ class ParseItemSetBuilder { ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &); void apply_transitive_closure(ParseItemSet *); LookaheadSet get_first_set(const rules::Symbol &) const; + LookaheadSet get_last_set(const rules::Symbol &) const; }; } // namespace build_tables diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 2d6c22c9..1d5ff690 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -401,52 +401,106 @@ class CCodeGenerator { add_accept_token_action(lex_state.accept_action); } + set ruled_out_characters; for (const auto &pair : lex_state.advance_actions) { - if (!pair.first.is_empty()) { - _if([&]() { add_character_set_condition(pair.first); }, - [&]() { add_advance_action(pair.second); }); + if (pair.first.is_empty()) continue; + + size_t current_length = buffer.size(); + + line("if ("); + if (add_character_set_condition(pair.first, ruled_out_characters)) { + add(")"); + indent([&]() { add_advance_action(pair.second); }); + ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end()); + } else { + buffer.resize(current_length); + add_advance_action(pair.second); } } line("END_STATE();"); } - void add_character_set_condition(const rules::CharacterSet &rule) { + bool add_character_set_condition(const rules::CharacterSet &rule, const set &ruled_out_characters) { if (rule.includes_all) { - add("!("); - add_character_range_conditions(rule.excluded_ranges()); - add(")"); + return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true); } else { - add_character_range_conditions(rule.included_ranges()); + return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false); } } - void add_character_range_conditions(const vector &ranges) { - if (ranges.size() == 1) { - add_character_range_condition(*ranges.begin()); - } else { - bool first = true; - for (const auto &range : ranges) { - if (!first) { - add(" ||"); - line(" "); + bool add_character_range_conditions(const vector &ranges, + const set &ruled_out_characters, + bool is_negated) { + bool first = true; + for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) { + auto range = *iter; + + bool range_is_ruled_out = true; + for (uint32_t c = range.min; c <= range.max; c++) { + if (!ruled_out_characters.count(c)) { + range_is_ruled_out = false; + break; + } + } + + if (range_is_ruled_out) { + ++iter; + continue; + } + + auto next_iter = iter + 1; + while (next_iter != end) { + bool can_join_ranges = true; + for (uint32_t character = range.max + 1; character < next_iter->min; character++) { + if (!ruled_out_characters.count(character)) { + can_join_ranges = false; + break; + } } - add("("); - add_character_range_condition(range); - add(")"); - - first = false; + if (can_join_ranges) { + range.max = next_iter->max; + ++next_iter; + } else { + break; + } } + + if (!first) { + add(is_negated ? " &&" : " ||"); + line(" "); + } + + add_character_range_condition(range, is_negated); + first = false; + iter = next_iter; } + + return !first; } - void add_character_range_condition(const rules::CharacterRange &range) { - if (range.min == range.max) { - add("lookahead == " + escape_char(range.min)); + void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) { + auto min = escape_char(range.min); + auto max = escape_char(range.max); + if (is_negated) { + if (range.max == range.min) { + add("lookahead != " + min); + } else if (range.max == range.min + 1) { + add("lookahead != " + min + " &&"); + line(" lookahead != " + max); + } else { + add("(lookahead < " + min + " || lookahead > " + max + ")"); + } } else { - add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") + - escape_char(range.max)); + if (range.max == range.min) { + add("lookahead == " + min); + } else if (range.max == range.min + 1) { + add("lookahead == " + min + " ||"); + line(" lookahead == " + max); + } else { + add("(" + min + " <= lookahead && lookahead <= " + max + ")"); + } } } @@ -490,12 +544,17 @@ class CCodeGenerator { break; case ParseActionTypeReduce: if (action.fragile) { - add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " + - to_string(action.consumed_symbol_count) + ")"); + add("REDUCE_FRAGILE"); } else { - add("REDUCE(" + symbol_id(action.symbol) + ", " + - to_string(action.consumed_symbol_count) + ")"); + add("REDUCE"); } + + add("("); + add(symbol_id(action.symbol)); + add(", "); + add(to_string(action.consumed_symbol_count)); + add(", " + to_string(action.dynamic_precedence)); + add(")"); break; case ParseActionTypeRecover: add("RECOVER(" + to_string(action.state_index) + ")"); @@ -594,13 +653,6 @@ class CCodeGenerator { indent(body); } - void _if(function condition, function body) { - line("if ("); - indent(condition); - add(")"); - indent(body); - } - string sanitize_name_for_string(string name) { util::str_replace(&name, "\\", "\\\\"); util::str_replace(&name, "\n", "\\n"); diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index aa1e2fb8..7589904c 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -184,6 +184,20 @@ ParseRuleResult parse_rule(json_value *rule_json) { return Rule(Metadata::prec_right(precedence_json.u.integer, result.rule)); } + if (type == "PREC_DYNAMIC") { + json_value precedence_json = rule_json->operator[]("value"); + if (precedence_json.type != json_integer) { + return "Precedence value must be an integer"; + } + + json_value content_json = rule_json->operator[]("content"); + auto result = parse_rule(&content_json); + if (!result.error_message.empty()) { + return "Invalid precedence content: " + result.error_message; + } + return Rule(Metadata::prec_dynamic(precedence_json.u.integer, result.rule)); + } + return "Unknown rule type: " + type; } diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index ffa57760..82c4429c 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -13,25 +13,14 @@ using std::vector; using std::function; using rules::Symbol; -ParseAction::ParseAction(ParseActionType type, ParseStateId state_index, - Symbol symbol, size_t consumed_symbol_count, - const Production *production) - : type(type), - extra(false), - fragile(false), - state_index(state_index), - symbol(symbol), - consumed_symbol_count(consumed_symbol_count), - production(production) {} - ParseAction::ParseAction() - : type(ParseActionTypeError), + : production(nullptr), + consumed_symbol_count(0), + symbol(rules::NONE()), + type(ParseActionTypeError), extra(false), fragile(false), - state_index(-1), - symbol(rules::NONE()), - consumed_symbol_count(0), - production(nullptr) {} + state_index(-1) {} ParseAction ParseAction::Error() { return ParseAction(); @@ -44,12 +33,17 @@ ParseAction ParseAction::Accept() { } ParseAction ParseAction::Shift(ParseStateId state_index) { - return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr); + ParseAction result; + result.type = ParseActionTypeShift; + result.state_index = state_index; + return result; } ParseAction ParseAction::Recover(ParseStateId state_index) { - return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0, - nullptr); + ParseAction result; + result.type = ParseActionTypeRecover; + result.state_index = state_index; + return result; } ParseAction ParseAction::ShiftExtra() { @@ -61,8 +55,13 @@ ParseAction ParseAction::ShiftExtra() { ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, const Production &production) { - return ParseAction(ParseActionTypeReduce, 0, symbol, consumed_symbol_count, - &production); + ParseAction result; + result.type = ParseActionTypeReduce; + result.symbol = symbol; + result.consumed_symbol_count = consumed_symbol_count; + result.production = &production; + result.dynamic_precedence = production.dynamic_precedence; + return result; } int ParseAction::precedence() const { diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index c00969d2..8a127bed 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -24,9 +24,6 @@ enum ParseActionType { struct ParseAction { ParseAction(); - ParseAction(ParseActionType type, ParseStateId state_index, - rules::Symbol symbol, size_t consumed_symbol_count, - const Production *); static ParseAction Accept(); static ParseAction Error(); static ParseAction Shift(ParseStateId state_index); @@ -39,13 +36,14 @@ struct ParseAction { rules::Associativity associativity() const; int precedence() const; + const Production *production; + size_t consumed_symbol_count; + rules::Symbol symbol; + int dynamic_precedence; ParseActionType type; bool extra; bool fragile; ParseStateId state_index; - rules::Symbol symbol; - size_t consumed_symbol_count; - const Production *production; }; struct ParseTableEntry { @@ -71,7 +69,6 @@ struct ParseState { std::map terminal_entries; std::map nonterminal_entries; LexStateId lex_state_id; - size_t shift_actions_signature; }; struct ParseTableSymbolMetadata { diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index 71b19f21..86c76a86 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -1,6 +1,7 @@ #include "compiler/prepare_grammar/flatten_grammar.h" #include #include +#include #include #include "compiler/prepare_grammar/extract_choices.h" #include "compiler/prepare_grammar/initial_syntax_grammar.h" @@ -26,7 +27,7 @@ class FlattenRule { void apply(const Rule &rule) { rule.match( [&](const rules::Symbol &symbol) { - production.push_back(ProductionStep{ + production.steps.push_back(ProductionStep{ symbol, precedence_stack.back(), associativity_stack.back() @@ -42,6 +43,10 @@ class FlattenRule { associativity_stack.push_back(metadata.params.associativity); } + if (abs(metadata.params.dynamic_precedence) > abs(production.dynamic_precedence)) { + production.dynamic_precedence = metadata.params.dynamic_precedence; + } + apply(*metadata.rule); if (metadata.params.has_precedence) { diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc index 5b0c3464..c199368d 100644 --- a/src/compiler/rules/character_set.cc +++ b/src/compiler/rules/character_set.cc @@ -38,20 +38,11 @@ static set add_chars(set *left, const set &right) return result; } -static vector consolidate_ranges(const set &chars) { +static vector consolidate_ranges(const set &characters) { vector result; - for (uint32_t c : chars) { - auto size = result.size(); - if (size >= 2 && result[size - 2].max == (c - 2)) { - result.pop_back(); + for (uint32_t c : characters) { + if (!result.empty() && result.back().max == c - 1) { result.back().max = c; - } else if (size >= 1) { - CharacterRange &last = result.back(); - if (last.min < last.max && last.max == (c - 1)) { - last.max = c; - } else { - result.push_back(CharacterRange(c)); - } } else { result.push_back(CharacterRange(c)); } @@ -70,15 +61,17 @@ bool CharacterSet::operator==(const CharacterSet &other) const { } bool CharacterSet::operator<(const CharacterSet &other) const { - if (!includes_all && other.includes_all) - return true; - if (includes_all && !other.includes_all) - return false; - if (included_chars < other.included_chars) - return true; - if (other.included_chars < included_chars) - return false; - return excluded_chars < other.excluded_chars; + if (!includes_all && other.includes_all) return true; + if (includes_all && !other.includes_all) return false; + if (includes_all) { + if (excluded_chars.size() > other.excluded_chars.size()) return true; + if (excluded_chars.size() < other.excluded_chars.size()) return false; + return excluded_chars < other.excluded_chars; + } else { + if (included_chars.size() < other.included_chars.size()) return true; + if (included_chars.size() > other.included_chars.size()) return false; + return included_chars < other.included_chars; + } } CharacterSet &CharacterSet::include_all() { @@ -131,8 +124,7 @@ void CharacterSet::add_set(const CharacterSet &other) { excluded_chars.insert(c); included_chars.clear(); } else { - for (uint32_t c : other.included_chars) - included_chars.insert(c); + included_chars.insert(other.included_chars.begin(), other.included_chars.end()); } } } diff --git a/src/compiler/rules/metadata.cc b/src/compiler/rules/metadata.cc index ff98a54b..08baa8da 100644 --- a/src/compiler/rules/metadata.cc +++ b/src/compiler/rules/metadata.cc @@ -51,6 +51,12 @@ Metadata Metadata::prec_right(int precedence, const Rule &rule) { return Metadata{rule, params}; } +Metadata Metadata::prec_dynamic(int dynamic_precedence, const Rule &rule) { + MetadataParams params; + params.dynamic_precedence = dynamic_precedence; + return Metadata{rule, params}; +} + Metadata Metadata::separator(const Rule &rule) { MetadataParams params; params.has_precedence = true; diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h index 0d55dfd2..af0b3e4e 100644 --- a/src/compiler/rules/metadata.h +++ b/src/compiler/rules/metadata.h @@ -14,6 +14,7 @@ enum Associativity { struct MetadataParams { int precedence; + int dynamic_precedence; Associativity associativity; bool has_precedence; bool has_associativity; @@ -23,8 +24,8 @@ struct MetadataParams { bool is_main_token; inline MetadataParams() : - precedence{0}, associativity{AssociativityNone}, has_precedence{false}, - has_associativity{false}, is_token{false}, is_string{false}, + precedence{0}, dynamic_precedence{0}, associativity{AssociativityNone}, + has_precedence{false}, has_associativity{false}, is_token{false}, is_string{false}, is_active{false}, is_main_token{false} {} inline bool operator==(const MetadataParams &other) const { @@ -33,6 +34,7 @@ struct MetadataParams { associativity == other.associativity && has_precedence == other.has_precedence && has_associativity == other.has_associativity && + dynamic_precedence == other.dynamic_precedence && is_token == other.is_token && is_string == other.is_string && is_active == other.is_active && @@ -54,6 +56,7 @@ struct Metadata { static Metadata prec(int precedence, const Rule &rule); static Metadata prec_left(int precedence, const Rule &rule); static Metadata prec_right(int precedence, const Rule &rule); + static Metadata prec_dynamic(int precedence, const Rule &rule); static Metadata separator(const Rule &rule); static Metadata main_token(const Rule &rule); @@ -63,4 +66,4 @@ struct Metadata { } // namespace rules } // namespace tree_sitter -#endif // COMPILER_RULES_METADATA_H_ \ No newline at end of file +#endif // COMPILER_RULES_METADATA_H_ diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 4c177240..4eeba90c 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -11,8 +11,9 @@ namespace tree_sitter { struct ProductionStep { inline bool operator==(const ProductionStep &other) const { - return symbol == other.symbol && precedence == other.precedence && - associativity == other.associativity; + return symbol == other.symbol && + precedence == other.precedence && + associativity == other.associativity; } rules::Symbol symbol; @@ -20,7 +21,21 @@ struct ProductionStep { rules::Associativity associativity; }; -typedef std::vector Production; +struct Production { + std::vector steps; + int dynamic_precedence = 0; + + inline bool operator==(const Production &other) const { + return steps == other.steps && dynamic_precedence == other.dynamic_precedence; + } + + inline ProductionStep &back() { return steps.back(); } + inline const ProductionStep &back() const { return steps.back(); } + inline bool empty() const { return steps.empty(); } + inline size_t size() const { return steps.size(); } + inline const ProductionStep &operator[](int i) const { return steps[i]; } + inline const ProductionStep &at(int i) const { return steps[i]; } +}; struct SyntaxVariable { std::string name; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 13ef9a76..2b01dddb 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -437,22 +437,36 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, } static bool parser__select_tree(Parser *self, Tree *left, Tree *right) { - if (!left) - return true; - if (!right) - return false; + if (!left) return true; + if (!right) return false; + if (right->error_cost < left->error_cost) { LOG("select_smaller_error symbol:%s, over_symbol:%s", SYM_NAME(right->symbol), SYM_NAME(left->symbol)); return true; } + if (left->error_cost < right->error_cost) { LOG("select_smaller_error symbol:%s, over_symbol:%s", SYM_NAME(left->symbol), SYM_NAME(right->symbol)); return false; } - if (left->error_cost > 0) return -1; + if (right->dynamic_precedence > left->dynamic_precedence) { + LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", + SYM_NAME(right->symbol), right->dynamic_precedence, SYM_NAME(left->symbol), + left->dynamic_precedence); + return true; + } + + if (left->dynamic_precedence > right->dynamic_precedence) { + LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", + SYM_NAME(left->symbol), left->dynamic_precedence, SYM_NAME(right->symbol), + right->dynamic_precedence); + return false; + } + + if (left->error_cost > 0) return true; int comparison = ts_tree_compare(left, right); switch (comparison) { @@ -544,7 +558,8 @@ static bool parser__switch_children(Parser *self, Tree *tree, static StackPopResult parser__reduce(Parser *self, StackVersion version, TSSymbol symbol, unsigned count, - bool fragile, bool allow_skipping) { + bool fragile, int dynamic_precedence, + bool allow_skipping) { uint32_t initial_version_count = ts_stack_version_count(self->stack); StackPopResult pop = ts_stack_pop_count(self->stack, version, count); @@ -587,6 +602,8 @@ static StackPopResult parser__reduce(Parser *self, StackVersion version, } } + parent->dynamic_precedence += dynamic_precedence; + TSStateId state = ts_stack_top_state(self->stack, slice.version); TSStateId next_state = ts_language_next_state(language, state, symbol); if (fragile || self->is_split || pop.slices.size > 1 || initial_version_count > 1) { @@ -929,6 +946,7 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ .symbol = action.params.symbol, .count = action.params.child_count, + .dynamic_precedence = action.params.dynamic_precedence }); default: break; @@ -939,8 +957,10 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) bool did_reduce = false; for (uint32_t i = 0; i < self->reduce_actions.size; i++) { ReduceAction action = self->reduce_actions.contents[i]; - StackPopResult reduction = - parser__reduce(self, version, action.symbol, action.count, true, false); + StackPopResult reduction = parser__reduce( + self, version, action.symbol, action.count, true, + action.dynamic_precedence, false + ); if (reduction.stopped_at_error) { ts_tree_array_delete(&reduction.slices.contents[0].trees); ts_stack_remove_version(self->stack, reduction.slices.contents[0].version); @@ -1180,12 +1200,13 @@ static void parser__advance(Parser *self, StackVersion version, unsigned child_count = action.params.child_count; TSSymbol symbol = action.params.symbol; + unsigned dynamic_precedence = action.params.dynamic_precedence; bool fragile = action.fragile; LOG("reduce sym:%s, child_count:%u", SYM_NAME(symbol), child_count); StackPopResult reduction = - parser__reduce(self, version, symbol, child_count, fragile, true); + parser__reduce(self, version, symbol, child_count, fragile, dynamic_precedence, true); StackSlice slice = *array_front(&reduction.slices); if (reduction.stopped_at_error) { reduction_stopped_at_error = true; diff --git a/src/runtime/reduce_action.h b/src/runtime/reduce_action.h index 8fc1f533..aad1f619 100644 --- a/src/runtime/reduce_action.h +++ b/src/runtime/reduce_action.h @@ -11,6 +11,7 @@ extern "C" { typedef struct { uint32_t count; TSSymbol symbol; + int dynamic_precedence; } ReduceAction; typedef Array(ReduceAction) ReduceActionSet; diff --git a/src/runtime/tree.c b/src/runtime/tree.c index b76680e6..af9a5477 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -150,6 +150,7 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { self->visible_child_count = 0; self->error_cost = 0; self->has_external_tokens = false; + self->dynamic_precedence = 0; for (uint32_t i = 0; i < child_count; i++) { Tree *child = children[i]; @@ -165,6 +166,7 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { } self->error_cost += child->error_cost; + self->dynamic_precedence += child->dynamic_precedence; if (child->visible) { self->visible_child_count++; diff --git a/src/runtime/tree.h b/src/runtime/tree.h index e69fcba7..3b217722 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -46,6 +46,7 @@ typedef struct Tree { } first_leaf; uint32_t ref_count; + int dynamic_precedence; bool visible : 1; bool named : 1; bool extra : 1; diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc index 7376bd2c..e9f70aee 100644 --- a/test/compiler/build_tables/lex_table_builder_test.cc +++ b/test/compiler/build_tables/lex_table_builder_test.cc @@ -16,7 +16,7 @@ describe("LexTableBuilder::detect_conflict", []() { auto builder = LexTableBuilder::create(LexicalGrammar{ { LexicalVariable{ - "token_1", + "token_0", VariableTypeNamed, Rule::seq({ CharacterSet({ 'a' }), @@ -26,7 +26,7 @@ describe("LexTableBuilder::detect_conflict", []() { false }, LexicalVariable{ - "token_2", + "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ 'b' }), @@ -39,22 +39,22 @@ describe("LexTableBuilder::detect_conflict", []() { separators }); - AssertThat(builder->detect_conflict(0, 1), IsFalse()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsFalse()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); }); - it("returns true when one token matches a string that the other matches, " - "plus some addition content that begins with a separator character", [&]() { + it("returns true when the left token can match a string that the right token matches, " + "plus a separator character", [&]() { LexicalGrammar grammar{ { LexicalVariable{ - "token_1", + "token_0", VariableTypeNamed, Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/ false }, LexicalVariable{ - "token_2", + "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc' true @@ -64,24 +64,32 @@ describe("LexTableBuilder::detect_conflict", []() { }; auto builder = LexTableBuilder::create(grammar); - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); grammar.variables[1].is_string = false; - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); }); - it("returns true when one token matches a string that the other matches, " - "plus some addition content that matches another one-character token", [&]() { + it("returns true when the left token matches a string that the right token matches, " + "plus the first character of some token that can follow the right token", [&]() { LexicalGrammar grammar{ { + LexicalVariable{ + "token_0", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ '>' }), + CharacterSet({ '=' }), + }), + true + }, LexicalVariable{ "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ '>' }), - CharacterSet({ '>' }), }), true }, @@ -89,7 +97,7 @@ describe("LexTableBuilder::detect_conflict", []() { "token_2", VariableTypeNamed, Rule::seq({ - CharacterSet({ '>' }), + CharacterSet({ '=' }), }), true }, @@ -97,9 +105,17 @@ describe("LexTableBuilder::detect_conflict", []() { separators }; + // If no tokens can follow token_1, then there's no conflict auto builder = LexTableBuilder::create(grammar); - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + vector> following_tokens_by_token_index(3); + AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsFalse()); + AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); + + // If token_2 can follow token_1, then token_0 conflicts with token_1 + builder = LexTableBuilder::create(grammar); + following_tokens_by_token_index[1].insert(2); + AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); }); }); diff --git a/test/compiler/build_tables/parse_item_set_builder_test.cc b/test/compiler/build_tables/parse_item_set_builder_test.cc index 8583c7b1..ab1efed2 100644 --- a/test/compiler/build_tables/parse_item_set_builder_test.cc +++ b/test/compiler/build_tables/parse_item_set_builder_test.cc @@ -25,25 +25,25 @@ describe("ParseItemSetBuilder", []() { it("adds items at the beginnings of referenced rules", [&]() { SyntaxGrammar grammar{{ SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::terminal(11), 0, AssociativityNone}, - }), + }, 0}, }}, SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(12), 0, AssociativityNone}, {Symbol::terminal(13), 0, AssociativityNone}, - }), - Production({ + }, 0}, + Production{{ {Symbol::non_terminal(2), 0, AssociativityNone}, - }) + }, 0} }}, SyntaxVariable{"rule2", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(14), 0, AssociativityNone}, {Symbol::terminal(15), 0, AssociativityNone}, - }) + }, 0} }}, }, {}, {}, {}}; @@ -84,17 +84,17 @@ describe("ParseItemSetBuilder", []() { it("handles rules with empty productions", [&]() { SyntaxGrammar grammar{{ SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::terminal(11), 0, AssociativityNone}, - }), + }, 0}, }}, SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(12), 0, AssociativityNone}, {Symbol::terminal(13), 0, AssociativityNone}, - }), - Production({}) + }, 0}, + Production{{}, 0} }}, }, {}, {}, {}}; diff --git a/test/compiler/prepare_grammar/flatten_grammar_test.cc b/test/compiler/prepare_grammar/flatten_grammar_test.cc index 50d48fb4..7b77caa8 100644 --- a/test/compiler/prepare_grammar/flatten_grammar_test.cc +++ b/test/compiler/prepare_grammar/flatten_grammar_test.cc @@ -34,22 +34,64 @@ describe("flatten_grammar", []() { AssertThat(result.name, Equals("test")); AssertThat(result.type, Equals(VariableTypeNamed)); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::non_terminal(2), 101, AssociativityLeft}, {Symbol::non_terminal(3), 102, AssociativityRight}, {Symbol::non_terminal(4), 101, AssociativityLeft}, {Symbol::non_terminal(6), 0, AssociativityNone}, {Symbol::non_terminal(7), 0, AssociativityNone}, - }), - Production({ + }, 0}, + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::non_terminal(2), 101, AssociativityLeft}, {Symbol::non_terminal(5), 101, AssociativityLeft}, {Symbol::non_terminal(6), 0, AssociativityNone}, {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 0} + }))); + }); + + it("stores the maximum dynamic precedence specified in each production", [&]() { + SyntaxVariable result = flatten_rule({ + "test", + VariableTypeNamed, + Rule::seq({ + Symbol::non_terminal(1), + Metadata::prec_dynamic(101, Rule::seq({ + Symbol::non_terminal(2), + Rule::choice({ + Metadata::prec_dynamic(102, Rule::seq({ + Symbol::non_terminal(3), + Symbol::non_terminal(4) + })), + Symbol::non_terminal(5), + }), + Symbol::non_terminal(6), + })), + Symbol::non_terminal(7), }) - }))) + }); + + AssertThat(result.name, Equals("test")); + AssertThat(result.type, Equals(VariableTypeNamed)); + AssertThat(result.productions, Equals(vector({ + Production{{ + {Symbol::non_terminal(1), 0, AssociativityNone}, + {Symbol::non_terminal(2), 0, AssociativityNone}, + {Symbol::non_terminal(3), 0, AssociativityNone}, + {Symbol::non_terminal(4), 0, AssociativityNone}, + {Symbol::non_terminal(6), 0, AssociativityNone}, + {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 102}, + Production{{ + {Symbol::non_terminal(1), 0, AssociativityNone}, + {Symbol::non_terminal(2), 0, AssociativityNone}, + {Symbol::non_terminal(5), 0, AssociativityNone}, + {Symbol::non_terminal(6), 0, AssociativityNone}, + {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 101} + }))); }); it("uses the last assigned precedence", [&]() { @@ -63,11 +105,11 @@ describe("flatten_grammar", []() { }); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 101, AssociativityLeft}, - {Symbol::non_terminal(2), 101, AssociativityLeft}, - }) - }))) + {Symbol::non_terminal(2), 101, AssociativityLeft}, + }, 0} + }))); result = flatten_rule({ "test2", @@ -78,10 +120,10 @@ describe("flatten_grammar", []() { }); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 101, AssociativityLeft}, - }) - }))) + }, 0} + }))); }); }); diff --git a/test/compiler/rules/character_set_test.cc b/test/compiler/rules/character_set_test.cc index f7c2e632..dfe67604 100644 --- a/test/compiler/rules/character_set_test.cc +++ b/test/compiler/rules/character_set_test.cc @@ -305,29 +305,17 @@ describe("CharacterSet", []() { }); describe("::included_ranges", [&]() { - it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() { + it("consolidates consecutive sequences of characters into ranges", [&]() { CharacterSet set1 = CharacterSet() .include('a', 'c') - .include('g') + .include('e', 'j') + .include('m') .include('z'); AssertThat(set1.included_ranges(), Equals(vector({ CharacterRange{'a', 'c'}, - CharacterRange('g'), - CharacterRange('z'), - }))); - }); - - it("doesn't consolidate sequences of 2 consecutive characters", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'b') - .include('g') - .include('z'); - - AssertThat(set1.included_ranges(), Equals(vector({ - CharacterRange('a'), - CharacterRange('b'), - CharacterRange('g'), + CharacterRange{'e', 'j'}, + CharacterRange('m'), CharacterRange('z'), }))); }); diff --git a/test/fixtures/test_grammars/dynamic_precedence/corpus.txt b/test/fixtures/test_grammars/dynamic_precedence/corpus.txt new file mode 100644 index 00000000..242efc14 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/corpus.txt @@ -0,0 +1,25 @@ +=============================== +Declarations +=============================== + +int * x + +--- + +(program (declaration + (type (identifier)) + (declarator (identifier)))) + +=============================== +Expressions +=============================== + +int * x * y + +--- + +(program (expression + (expression + (expression (identifier)) + (expression (identifier))) + (expression (identifier)))) diff --git a/test/fixtures/test_grammars/dynamic_precedence/grammar.json b/test/fixtures/test_grammars/dynamic_precedence/grammar.json new file mode 100644 index 00000000..381ed4c2 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/grammar.json @@ -0,0 +1,73 @@ +{ + "name": "dynamic_precedence", + + "conflicts": [ + ["expression", "type"] + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "program": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "declaration"}, + {"type": "SYMBOL", "name": "expression"}, + ] + }, + + "expression": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SYMBOL", + "name": "identifier" + } + ] + } + }, + + "declaration": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "type"}, + {"type": "SYMBOL", "name": "declarator"} + ] + }, + + "declarator": { + "type": "PREC_DYNAMIC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "identifier"} + ] + } + }, + + "type": { + "type": "SYMBOL", + "name": "identifier" + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} diff --git a/test/fixtures/test_grammars/dynamic_precedence/readme.md b/test/fixtures/test_grammars/dynamic_precedence/readme.md new file mode 100644 index 00000000..a48a7997 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/readme.md @@ -0,0 +1 @@ +This grammar contains a conflict that is resolved at runtime. The PREC_DYNAMIC rule is used to indicate that the `declarator` rule should be preferred to the `expression` rule at runtime. diff --git a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt index b1be0828..6ee80f23 100644 --- a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt +++ b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt @@ -1,11 +1,11 @@ Unresolved conflict for symbol sequence: - identifier • '{' … + identifier identifier • '{' … Possible interpretations: - 1: (expression identifier) • '{' … - 2: (function_call identifier • block) + 1: identifier (expression identifier) • '{' … + 2: identifier (function_call identifier • block) Possible resolutions: diff --git a/test/helpers/stream_methods.cc b/test/helpers/stream_methods.cc index 23a03d21..f88ccaee 100644 --- a/test/helpers/stream_methods.cc +++ b/test/helpers/stream_methods.cc @@ -136,9 +136,14 @@ ostream &operator<<(ostream &stream, const Variable &variable) { return stream << "(Variable " << variable.name << " " << variable.rule << ")"; } +ostream &operator<<(ostream &stream, const Production &production) { + return stream << "(Production " << production.steps << " " << + to_string(production.dynamic_precedence) << ")"; +} + ostream &operator<<(ostream &stream, const SyntaxVariable &variable) { return stream << "(Variable " << variable.name << " " << variable.productions << - " " << to_string(variable.type) << "}"; + " " << to_string(variable.type) << ")"; } ostream &operator<<(ostream &stream, const LexicalVariable &variable) { diff --git a/test/helpers/stream_methods.h b/test/helpers/stream_methods.h index 49853c19..07fcd6e0 100644 --- a/test/helpers/stream_methods.h +++ b/test/helpers/stream_methods.h @@ -110,6 +110,7 @@ ostream &operator<<(ostream &, const InputGrammar &); ostream &operator<<(ostream &, const CompileError &); ostream &operator<<(ostream &, const ExternalToken &); ostream &operator<<(ostream &, const ProductionStep &); +ostream &operator<<(ostream &, const Production &); ostream &operator<<(ostream &, const PrecedenceRange &); ostream &operator<<(ostream &, const Variable &); ostream &operator<<(ostream &, const LexicalVariable &); diff --git a/test/integration/test_grammars.cc b/test/integration/test_grammars.cc index e2334bc7..c08af27c 100644 --- a/test/integration/test_grammars.cc +++ b/test/integration/test_grammars.cc @@ -13,7 +13,7 @@ vector test_languages = list_directory(grammars_dir_path); for (auto &language_name : test_languages) { if (language_name == "readme.md") continue; - describe(("test language: " + language_name).c_str(), [&]() { + describe(("test grammar: " + language_name).c_str(), [&]() { string directory_path = grammars_dir_path + "/" + language_name; string grammar_path = directory_path + "/grammar.json"; string grammar_json = read_file(grammar_path);