From cb652239f6351235125cfcd1879df158f9bd113f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Jul 2017 12:48:50 -0700 Subject: [PATCH 01/12] Add missing semicolons in flatten_grammar test --- test/compiler/prepare_grammar/flatten_grammar_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/compiler/prepare_grammar/flatten_grammar_test.cc b/test/compiler/prepare_grammar/flatten_grammar_test.cc index 50d48fb4..e9c90716 100644 --- a/test/compiler/prepare_grammar/flatten_grammar_test.cc +++ b/test/compiler/prepare_grammar/flatten_grammar_test.cc @@ -49,7 +49,7 @@ describe("flatten_grammar", []() { {Symbol::non_terminal(6), 0, AssociativityNone}, {Symbol::non_terminal(7), 0, AssociativityNone}, }) - }))) + }))); }); it("uses the last assigned precedence", [&]() { @@ -67,7 +67,7 @@ describe("flatten_grammar", []() { {Symbol::non_terminal(1), 101, AssociativityLeft}, {Symbol::non_terminal(2), 101, AssociativityLeft}, }) - }))) + }))); result = flatten_rule({ "test2", @@ -81,7 +81,7 @@ describe("flatten_grammar", []() { Production({ {Symbol::non_terminal(1), 101, AssociativityLeft}, }) - }))) + }))); }); }); From d8e9d04fe7e3598c10e11c2b46667dba5bc30f0c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Jul 2017 15:20:11 -0700 Subject: [PATCH 02/12] Add PREC_DYNAMIC rule for resolving runtime ambiguities --- include/tree_sitter/parser.h | 28 ++++--- .../build_tables/build_parse_table.cc | 10 ++- src/compiler/build_tables/parse_item.cc | 4 + src/compiler/build_tables/parse_item.h | 1 + src/compiler/generate_code/c_code.cc | 13 +++- src/compiler/parse_grammar.cc | 14 ++++ src/compiler/parse_table.cc | 41 +++++------ src/compiler/parse_table.h | 10 +-- .../prepare_grammar/flatten_grammar.cc | 6 +- src/compiler/rules/metadata.cc | 6 ++ src/compiler/rules/metadata.h | 9 ++- src/compiler/syntax_grammar.h | 21 +++++- src/runtime/parser.c | 39 +++++++--- src/runtime/reduce_action.h | 1 + src/runtime/tree.c | 2 + src/runtime/tree.h | 1 + .../parse_item_set_builder_test.cc | 26 +++---- .../prepare_grammar/flatten_grammar_test.cc | 58 +++++++++++++-- .../dynamic_precedence/corpus.txt | 25 +++++++ .../dynamic_precedence/grammar.json | 73 +++++++++++++++++++ .../dynamic_precedence/readme.md | 1 + test/helpers/stream_methods.cc | 7 +- test/helpers/stream_methods.h | 1 + test/integration/test_grammars.cc | 2 +- 24 files changed, 316 insertions(+), 83 deletions(-) create mode 100644 test/fixtures/test_grammars/dynamic_precedence/corpus.txt create mode 100644 test/fixtures/test_grammars/dynamic_precedence/grammar.json create mode 100644 test/fixtures/test_grammars/dynamic_precedence/readme.md diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index 6e68a9f4..295b3c96 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -42,6 +42,7 @@ typedef struct { union { TSStateId to_state; struct { + short dynamic_precedence; TSSymbol symbol; unsigned short child_count; }; @@ -145,21 +146,30 @@ typedef struct TSLanguage { { .type = TSParseActionTypeShift, .extra = true } \ } -#define REDUCE(symbol_val, child_count_val) \ +#define REDUCE(symbol_val, child_count_val, dynamic_precedence_val) \ { \ { \ .type = TSParseActionTypeReduce, \ - .params = {.symbol = symbol_val, .child_count = child_count_val } \ + .params = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + .dynamic_precedence = dynamic_precedence_val, \ + } \ } \ } -#define REDUCE_FRAGILE(symbol_val, child_count_val) \ - { \ - { \ - .type = TSParseActionTypeReduce, .fragile = true, \ - .params = {.symbol = symbol_val, .child_count = child_count_val } \ - } \ - } +#define REDUCE_FRAGILE(symbol_val, child_count_val, dynamic_precedence_val) \ +{ \ + { \ + .type = TSParseActionTypeReduce, \ + .fragile = true, \ + .params = { \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + .dynamic_precedence = dynamic_precedence_val, \ + } \ + } \ +} #define ACCEPT_INPUT() \ { \ diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index c5519555..db540247 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -55,7 +55,8 @@ class ParseTableBuilder { Symbol::non_terminal(0); Production start_production{ - ProductionStep{start_symbol, 0, rules::AssociativityNone}, + {ProductionStep{start_symbol, 0, rules::AssociativityNone}}, + 0 }; // Placeholder for error state @@ -281,9 +282,10 @@ class ParseTableBuilder { for (ParseAction &action : actions) { if (action.type == ParseActionTypeReduce) { - if (has_fragile_production(action.production)) + if (has_fragile_production(action.production)) { action.fragile = true; - action.production = NULL; + } + action.production = nullptr; } } @@ -586,7 +588,7 @@ class ParseTableBuilder { } description += " (" + symbol_name(action.symbol); - for (const ProductionStep &step : *action.production) { + for (const ProductionStep &step : action.production->steps) { description += " " + symbol_name(step.symbol); } description += ")"; diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index baf10a00..b672bb1b 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -60,6 +60,10 @@ int ParseItem::precedence() const { } } +int ParseItem::dynamic_precedence() const { + return production->dynamic_precedence; +} + rules::Associativity ParseItem::associativity() const { if (is_done()) { if (production->empty()) { diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index 020afc07..5133970d 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -26,6 +26,7 @@ struct ParseItem { rules::Symbol lhs() const; rules::Symbol next_symbol() const; int precedence() const; + int dynamic_precedence() const; rules::Associativity associativity() const; bool is_done() const; diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 2d6c22c9..b2359233 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -490,12 +490,17 @@ class CCodeGenerator { break; case ParseActionTypeReduce: if (action.fragile) { - add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " + - to_string(action.consumed_symbol_count) + ")"); + add("REDUCE_FRAGILE"); } else { - add("REDUCE(" + symbol_id(action.symbol) + ", " + - to_string(action.consumed_symbol_count) + ")"); + add("REDUCE"); } + + add("("); + add(symbol_id(action.symbol)); + add(", "); + add(to_string(action.consumed_symbol_count)); + add(", " + to_string(action.dynamic_precedence)); + add(")"); break; case ParseActionTypeRecover: add("RECOVER(" + to_string(action.state_index) + ")"); diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc index aa1e2fb8..7589904c 100644 --- a/src/compiler/parse_grammar.cc +++ b/src/compiler/parse_grammar.cc @@ -184,6 +184,20 @@ ParseRuleResult parse_rule(json_value *rule_json) { return Rule(Metadata::prec_right(precedence_json.u.integer, result.rule)); } + if (type == "PREC_DYNAMIC") { + json_value precedence_json = rule_json->operator[]("value"); + if (precedence_json.type != json_integer) { + return "Precedence value must be an integer"; + } + + json_value content_json = rule_json->operator[]("content"); + auto result = parse_rule(&content_json); + if (!result.error_message.empty()) { + return "Invalid precedence content: " + result.error_message; + } + return Rule(Metadata::prec_dynamic(precedence_json.u.integer, result.rule)); + } + return "Unknown rule type: " + type; } diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index ffa57760..82c4429c 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -13,25 +13,14 @@ using std::vector; using std::function; using rules::Symbol; -ParseAction::ParseAction(ParseActionType type, ParseStateId state_index, - Symbol symbol, size_t consumed_symbol_count, - const Production *production) - : type(type), - extra(false), - fragile(false), - state_index(state_index), - symbol(symbol), - consumed_symbol_count(consumed_symbol_count), - production(production) {} - ParseAction::ParseAction() - : type(ParseActionTypeError), + : production(nullptr), + consumed_symbol_count(0), + symbol(rules::NONE()), + type(ParseActionTypeError), extra(false), fragile(false), - state_index(-1), - symbol(rules::NONE()), - consumed_symbol_count(0), - production(nullptr) {} + state_index(-1) {} ParseAction ParseAction::Error() { return ParseAction(); @@ -44,12 +33,17 @@ ParseAction ParseAction::Accept() { } ParseAction ParseAction::Shift(ParseStateId state_index) { - return ParseAction(ParseActionTypeShift, state_index, rules::NONE(), 0, nullptr); + ParseAction result; + result.type = ParseActionTypeShift; + result.state_index = state_index; + return result; } ParseAction ParseAction::Recover(ParseStateId state_index) { - return ParseAction(ParseActionTypeRecover, state_index, rules::NONE(), 0, - nullptr); + ParseAction result; + result.type = ParseActionTypeRecover; + result.state_index = state_index; + return result; } ParseAction ParseAction::ShiftExtra() { @@ -61,8 +55,13 @@ ParseAction ParseAction::ShiftExtra() { ParseAction ParseAction::Reduce(Symbol symbol, size_t consumed_symbol_count, const Production &production) { - return ParseAction(ParseActionTypeReduce, 0, symbol, consumed_symbol_count, - &production); + ParseAction result; + result.type = ParseActionTypeReduce; + result.symbol = symbol; + result.consumed_symbol_count = consumed_symbol_count; + result.production = &production; + result.dynamic_precedence = production.dynamic_precedence; + return result; } int ParseAction::precedence() const { diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index c00969d2..555dad31 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -24,9 +24,6 @@ enum ParseActionType { struct ParseAction { ParseAction(); - ParseAction(ParseActionType type, ParseStateId state_index, - rules::Symbol symbol, size_t consumed_symbol_count, - const Production *); static ParseAction Accept(); static ParseAction Error(); static ParseAction Shift(ParseStateId state_index); @@ -39,13 +36,14 @@ struct ParseAction { rules::Associativity associativity() const; int precedence() const; + const Production *production; + size_t consumed_symbol_count; + rules::Symbol symbol; + int dynamic_precedence; ParseActionType type; bool extra; bool fragile; ParseStateId state_index; - rules::Symbol symbol; - size_t consumed_symbol_count; - const Production *production; }; struct ParseTableEntry { diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index 71b19f21..ae802287 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -26,7 +26,7 @@ class FlattenRule { void apply(const Rule &rule) { rule.match( [&](const rules::Symbol &symbol) { - production.push_back(ProductionStep{ + production.steps.push_back(ProductionStep{ symbol, precedence_stack.back(), associativity_stack.back() @@ -42,6 +42,10 @@ class FlattenRule { associativity_stack.push_back(metadata.params.associativity); } + if (metadata.params.dynamic_precedence > production.dynamic_precedence) { + production.dynamic_precedence = metadata.params.dynamic_precedence; + } + apply(*metadata.rule); if (metadata.params.has_precedence) { diff --git a/src/compiler/rules/metadata.cc b/src/compiler/rules/metadata.cc index ff98a54b..08baa8da 100644 --- a/src/compiler/rules/metadata.cc +++ b/src/compiler/rules/metadata.cc @@ -51,6 +51,12 @@ Metadata Metadata::prec_right(int precedence, const Rule &rule) { return Metadata{rule, params}; } +Metadata Metadata::prec_dynamic(int dynamic_precedence, const Rule &rule) { + MetadataParams params; + params.dynamic_precedence = dynamic_precedence; + return Metadata{rule, params}; +} + Metadata Metadata::separator(const Rule &rule) { MetadataParams params; params.has_precedence = true; diff --git a/src/compiler/rules/metadata.h b/src/compiler/rules/metadata.h index 0d55dfd2..af0b3e4e 100644 --- a/src/compiler/rules/metadata.h +++ b/src/compiler/rules/metadata.h @@ -14,6 +14,7 @@ enum Associativity { struct MetadataParams { int precedence; + int dynamic_precedence; Associativity associativity; bool has_precedence; bool has_associativity; @@ -23,8 +24,8 @@ struct MetadataParams { bool is_main_token; inline MetadataParams() : - precedence{0}, associativity{AssociativityNone}, has_precedence{false}, - has_associativity{false}, is_token{false}, is_string{false}, + precedence{0}, dynamic_precedence{0}, associativity{AssociativityNone}, + has_precedence{false}, has_associativity{false}, is_token{false}, is_string{false}, is_active{false}, is_main_token{false} {} inline bool operator==(const MetadataParams &other) const { @@ -33,6 +34,7 @@ struct MetadataParams { associativity == other.associativity && has_precedence == other.has_precedence && has_associativity == other.has_associativity && + dynamic_precedence == other.dynamic_precedence && is_token == other.is_token && is_string == other.is_string && is_active == other.is_active && @@ -54,6 +56,7 @@ struct Metadata { static Metadata prec(int precedence, const Rule &rule); static Metadata prec_left(int precedence, const Rule &rule); static Metadata prec_right(int precedence, const Rule &rule); + static Metadata prec_dynamic(int precedence, const Rule &rule); static Metadata separator(const Rule &rule); static Metadata main_token(const Rule &rule); @@ -63,4 +66,4 @@ struct Metadata { } // namespace rules } // namespace tree_sitter -#endif // COMPILER_RULES_METADATA_H_ \ No newline at end of file +#endif // COMPILER_RULES_METADATA_H_ diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 4c177240..4eeba90c 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -11,8 +11,9 @@ namespace tree_sitter { struct ProductionStep { inline bool operator==(const ProductionStep &other) const { - return symbol == other.symbol && precedence == other.precedence && - associativity == other.associativity; + return symbol == other.symbol && + precedence == other.precedence && + associativity == other.associativity; } rules::Symbol symbol; @@ -20,7 +21,21 @@ struct ProductionStep { rules::Associativity associativity; }; -typedef std::vector Production; +struct Production { + std::vector steps; + int dynamic_precedence = 0; + + inline bool operator==(const Production &other) const { + return steps == other.steps && dynamic_precedence == other.dynamic_precedence; + } + + inline ProductionStep &back() { return steps.back(); } + inline const ProductionStep &back() const { return steps.back(); } + inline bool empty() const { return steps.empty(); } + inline size_t size() const { return steps.size(); } + inline const ProductionStep &operator[](int i) const { return steps[i]; } + inline const ProductionStep &at(int i) const { return steps[i]; } +}; struct SyntaxVariable { std::string name; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 13ef9a76..2b01dddb 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -437,22 +437,36 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, } static bool parser__select_tree(Parser *self, Tree *left, Tree *right) { - if (!left) - return true; - if (!right) - return false; + if (!left) return true; + if (!right) return false; + if (right->error_cost < left->error_cost) { LOG("select_smaller_error symbol:%s, over_symbol:%s", SYM_NAME(right->symbol), SYM_NAME(left->symbol)); return true; } + if (left->error_cost < right->error_cost) { LOG("select_smaller_error symbol:%s, over_symbol:%s", SYM_NAME(left->symbol), SYM_NAME(right->symbol)); return false; } - if (left->error_cost > 0) return -1; + if (right->dynamic_precedence > left->dynamic_precedence) { + LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", + SYM_NAME(right->symbol), right->dynamic_precedence, SYM_NAME(left->symbol), + left->dynamic_precedence); + return true; + } + + if (left->dynamic_precedence > right->dynamic_precedence) { + LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", + SYM_NAME(left->symbol), left->dynamic_precedence, SYM_NAME(right->symbol), + right->dynamic_precedence); + return false; + } + + if (left->error_cost > 0) return true; int comparison = ts_tree_compare(left, right); switch (comparison) { @@ -544,7 +558,8 @@ static bool parser__switch_children(Parser *self, Tree *tree, static StackPopResult parser__reduce(Parser *self, StackVersion version, TSSymbol symbol, unsigned count, - bool fragile, bool allow_skipping) { + bool fragile, int dynamic_precedence, + bool allow_skipping) { uint32_t initial_version_count = ts_stack_version_count(self->stack); StackPopResult pop = ts_stack_pop_count(self->stack, version, count); @@ -587,6 +602,8 @@ static StackPopResult parser__reduce(Parser *self, StackVersion version, } } + parent->dynamic_precedence += dynamic_precedence; + TSStateId state = ts_stack_top_state(self->stack, slice.version); TSStateId next_state = ts_language_next_state(language, state, symbol); if (fragile || self->is_split || pop.slices.size > 1 || initial_version_count > 1) { @@ -929,6 +946,7 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ .symbol = action.params.symbol, .count = action.params.child_count, + .dynamic_precedence = action.params.dynamic_precedence }); default: break; @@ -939,8 +957,10 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) bool did_reduce = false; for (uint32_t i = 0; i < self->reduce_actions.size; i++) { ReduceAction action = self->reduce_actions.contents[i]; - StackPopResult reduction = - parser__reduce(self, version, action.symbol, action.count, true, false); + StackPopResult reduction = parser__reduce( + self, version, action.symbol, action.count, true, + action.dynamic_precedence, false + ); if (reduction.stopped_at_error) { ts_tree_array_delete(&reduction.slices.contents[0].trees); ts_stack_remove_version(self->stack, reduction.slices.contents[0].version); @@ -1180,12 +1200,13 @@ static void parser__advance(Parser *self, StackVersion version, unsigned child_count = action.params.child_count; TSSymbol symbol = action.params.symbol; + unsigned dynamic_precedence = action.params.dynamic_precedence; bool fragile = action.fragile; LOG("reduce sym:%s, child_count:%u", SYM_NAME(symbol), child_count); StackPopResult reduction = - parser__reduce(self, version, symbol, child_count, fragile, true); + parser__reduce(self, version, symbol, child_count, fragile, dynamic_precedence, true); StackSlice slice = *array_front(&reduction.slices); if (reduction.stopped_at_error) { reduction_stopped_at_error = true; diff --git a/src/runtime/reduce_action.h b/src/runtime/reduce_action.h index 8fc1f533..aad1f619 100644 --- a/src/runtime/reduce_action.h +++ b/src/runtime/reduce_action.h @@ -11,6 +11,7 @@ extern "C" { typedef struct { uint32_t count; TSSymbol symbol; + int dynamic_precedence; } ReduceAction; typedef Array(ReduceAction) ReduceActionSet; diff --git a/src/runtime/tree.c b/src/runtime/tree.c index b76680e6..af9a5477 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -150,6 +150,7 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { self->visible_child_count = 0; self->error_cost = 0; self->has_external_tokens = false; + self->dynamic_precedence = 0; for (uint32_t i = 0; i < child_count; i++) { Tree *child = children[i]; @@ -165,6 +166,7 @@ void ts_tree_set_children(Tree *self, uint32_t child_count, Tree **children) { } self->error_cost += child->error_cost; + self->dynamic_precedence += child->dynamic_precedence; if (child->visible) { self->visible_child_count++; diff --git a/src/runtime/tree.h b/src/runtime/tree.h index e69fcba7..3b217722 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -46,6 +46,7 @@ typedef struct Tree { } first_leaf; uint32_t ref_count; + int dynamic_precedence; bool visible : 1; bool named : 1; bool extra : 1; diff --git a/test/compiler/build_tables/parse_item_set_builder_test.cc b/test/compiler/build_tables/parse_item_set_builder_test.cc index 8583c7b1..ab1efed2 100644 --- a/test/compiler/build_tables/parse_item_set_builder_test.cc +++ b/test/compiler/build_tables/parse_item_set_builder_test.cc @@ -25,25 +25,25 @@ describe("ParseItemSetBuilder", []() { it("adds items at the beginnings of referenced rules", [&]() { SyntaxGrammar grammar{{ SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::terminal(11), 0, AssociativityNone}, - }), + }, 0}, }}, SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(12), 0, AssociativityNone}, {Symbol::terminal(13), 0, AssociativityNone}, - }), - Production({ + }, 0}, + Production{{ {Symbol::non_terminal(2), 0, AssociativityNone}, - }) + }, 0} }}, SyntaxVariable{"rule2", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(14), 0, AssociativityNone}, {Symbol::terminal(15), 0, AssociativityNone}, - }) + }, 0} }}, }, {}, {}, {}}; @@ -84,17 +84,17 @@ describe("ParseItemSetBuilder", []() { it("handles rules with empty productions", [&]() { SyntaxGrammar grammar{{ SyntaxVariable{"rule0", VariableTypeNamed, { - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::terminal(11), 0, AssociativityNone}, - }), + }, 0}, }}, SyntaxVariable{"rule1", VariableTypeNamed, { - Production({ + Production{{ {Symbol::terminal(12), 0, AssociativityNone}, {Symbol::terminal(13), 0, AssociativityNone}, - }), - Production({}) + }, 0}, + Production{{}, 0} }}, }, {}, {}, {}}; diff --git a/test/compiler/prepare_grammar/flatten_grammar_test.cc b/test/compiler/prepare_grammar/flatten_grammar_test.cc index e9c90716..7b77caa8 100644 --- a/test/compiler/prepare_grammar/flatten_grammar_test.cc +++ b/test/compiler/prepare_grammar/flatten_grammar_test.cc @@ -34,21 +34,63 @@ describe("flatten_grammar", []() { AssertThat(result.name, Equals("test")); AssertThat(result.type, Equals(VariableTypeNamed)); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::non_terminal(2), 101, AssociativityLeft}, {Symbol::non_terminal(3), 102, AssociativityRight}, {Symbol::non_terminal(4), 101, AssociativityLeft}, {Symbol::non_terminal(6), 0, AssociativityNone}, {Symbol::non_terminal(7), 0, AssociativityNone}, - }), - Production({ + }, 0}, + Production{{ {Symbol::non_terminal(1), 0, AssociativityNone}, {Symbol::non_terminal(2), 101, AssociativityLeft}, {Symbol::non_terminal(5), 101, AssociativityLeft}, {Symbol::non_terminal(6), 0, AssociativityNone}, {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 0} + }))); + }); + + it("stores the maximum dynamic precedence specified in each production", [&]() { + SyntaxVariable result = flatten_rule({ + "test", + VariableTypeNamed, + Rule::seq({ + Symbol::non_terminal(1), + Metadata::prec_dynamic(101, Rule::seq({ + Symbol::non_terminal(2), + Rule::choice({ + Metadata::prec_dynamic(102, Rule::seq({ + Symbol::non_terminal(3), + Symbol::non_terminal(4) + })), + Symbol::non_terminal(5), + }), + Symbol::non_terminal(6), + })), + Symbol::non_terminal(7), }) + }); + + AssertThat(result.name, Equals("test")); + AssertThat(result.type, Equals(VariableTypeNamed)); + AssertThat(result.productions, Equals(vector({ + Production{{ + {Symbol::non_terminal(1), 0, AssociativityNone}, + {Symbol::non_terminal(2), 0, AssociativityNone}, + {Symbol::non_terminal(3), 0, AssociativityNone}, + {Symbol::non_terminal(4), 0, AssociativityNone}, + {Symbol::non_terminal(6), 0, AssociativityNone}, + {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 102}, + Production{{ + {Symbol::non_terminal(1), 0, AssociativityNone}, + {Symbol::non_terminal(2), 0, AssociativityNone}, + {Symbol::non_terminal(5), 0, AssociativityNone}, + {Symbol::non_terminal(6), 0, AssociativityNone}, + {Symbol::non_terminal(7), 0, AssociativityNone}, + }, 101} }))); }); @@ -63,10 +105,10 @@ describe("flatten_grammar", []() { }); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 101, AssociativityLeft}, - {Symbol::non_terminal(2), 101, AssociativityLeft}, - }) + {Symbol::non_terminal(2), 101, AssociativityLeft}, + }, 0} }))); result = flatten_rule({ @@ -78,9 +120,9 @@ describe("flatten_grammar", []() { }); AssertThat(result.productions, Equals(vector({ - Production({ + Production{{ {Symbol::non_terminal(1), 101, AssociativityLeft}, - }) + }, 0} }))); }); }); diff --git a/test/fixtures/test_grammars/dynamic_precedence/corpus.txt b/test/fixtures/test_grammars/dynamic_precedence/corpus.txt new file mode 100644 index 00000000..242efc14 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/corpus.txt @@ -0,0 +1,25 @@ +=============================== +Declarations +=============================== + +int * x + +--- + +(program (declaration + (type (identifier)) + (declarator (identifier)))) + +=============================== +Expressions +=============================== + +int * x * y + +--- + +(program (expression + (expression + (expression (identifier)) + (expression (identifier))) + (expression (identifier)))) diff --git a/test/fixtures/test_grammars/dynamic_precedence/grammar.json b/test/fixtures/test_grammars/dynamic_precedence/grammar.json new file mode 100644 index 00000000..381ed4c2 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/grammar.json @@ -0,0 +1,73 @@ +{ + "name": "dynamic_precedence", + + "conflicts": [ + ["expression", "type"] + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "program": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "declaration"}, + {"type": "SYMBOL", "name": "expression"}, + ] + }, + + "expression": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SYMBOL", + "name": "identifier" + } + ] + } + }, + + "declaration": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "type"}, + {"type": "SYMBOL", "name": "declarator"} + ] + }, + + "declarator": { + "type": "PREC_DYNAMIC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "identifier"} + ] + } + }, + + "type": { + "type": "SYMBOL", + "name": "identifier" + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} diff --git a/test/fixtures/test_grammars/dynamic_precedence/readme.md b/test/fixtures/test_grammars/dynamic_precedence/readme.md new file mode 100644 index 00000000..a48a7997 --- /dev/null +++ b/test/fixtures/test_grammars/dynamic_precedence/readme.md @@ -0,0 +1 @@ +This grammar contains a conflict that is resolved at runtime. The PREC_DYNAMIC rule is used to indicate that the `declarator` rule should be preferred to the `expression` rule at runtime. diff --git a/test/helpers/stream_methods.cc b/test/helpers/stream_methods.cc index 23a03d21..f88ccaee 100644 --- a/test/helpers/stream_methods.cc +++ b/test/helpers/stream_methods.cc @@ -136,9 +136,14 @@ ostream &operator<<(ostream &stream, const Variable &variable) { return stream << "(Variable " << variable.name << " " << variable.rule << ")"; } +ostream &operator<<(ostream &stream, const Production &production) { + return stream << "(Production " << production.steps << " " << + to_string(production.dynamic_precedence) << ")"; +} + ostream &operator<<(ostream &stream, const SyntaxVariable &variable) { return stream << "(Variable " << variable.name << " " << variable.productions << - " " << to_string(variable.type) << "}"; + " " << to_string(variable.type) << ")"; } ostream &operator<<(ostream &stream, const LexicalVariable &variable) { diff --git a/test/helpers/stream_methods.h b/test/helpers/stream_methods.h index 49853c19..07fcd6e0 100644 --- a/test/helpers/stream_methods.h +++ b/test/helpers/stream_methods.h @@ -110,6 +110,7 @@ ostream &operator<<(ostream &, const InputGrammar &); ostream &operator<<(ostream &, const CompileError &); ostream &operator<<(ostream &, const ExternalToken &); ostream &operator<<(ostream &, const ProductionStep &); +ostream &operator<<(ostream &, const Production &); ostream &operator<<(ostream &, const PrecedenceRange &); ostream &operator<<(ostream &, const Variable &); ostream &operator<<(ostream &, const LexicalVariable &); diff --git a/test/integration/test_grammars.cc b/test/integration/test_grammars.cc index e2334bc7..c08af27c 100644 --- a/test/integration/test_grammars.cc +++ b/test/integration/test_grammars.cc @@ -13,7 +13,7 @@ vector test_languages = list_directory(grammars_dir_path); for (auto &language_name : test_languages) { if (language_name == "readme.md") continue; - describe(("test language: " + language_name).c_str(), [&]() { + describe(("test grammar: " + language_name).c_str(), [&]() { string directory_path = grammars_dir_path + "/" + language_name; string grammar_path = directory_path + "/grammar.json"; string grammar_json = read_file(grammar_path); From 08bb365f6c31f92578bf3890dfccf09f3cf3a6c4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Jul 2017 15:51:03 -0700 Subject: [PATCH 03/12] Allow PREC_DYNAMIC in JSON schema --- doc/grammar-schema.json | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/doc/grammar-schema.json b/doc/grammar-schema.json index 916d5eff..282a51d0 100644 --- a/doc/grammar-schema.json +++ b/doc/grammar-schema.json @@ -182,41 +182,7 @@ "properties": { "type": { "type": "string", - "pattern": "^PREC$" - }, - "value": { - "type": "integer" - }, - "content": { - "$ref": "#/definitions/rule" - } - }, - "required": ["type", "content", "value"] - }, - - "prec-left-rule": { - "type": "object", - "properties": { - "type": { - "type": "string", - "pattern": "^PREC_LEFT$" - }, - "value": { - "type": "integer" - }, - "content": { - "$ref": "#/definitions/rule" - } - }, - "required": ["type", "content", "value"] - }, - - "prec-right-rule": { - "type": "object", - "properties": { - "type": { - "type": "string", - "pattern": "^PREC_RIGHT$" + "pattern": "^(PREC|PREC_LEFT|PREC_RIGHT|PREC_DYNAMIC)$" }, "value": { "type": "integer" @@ -239,9 +205,7 @@ { "$ref": "#/definitions/repeat1-rule" }, { "$ref": "#/definitions/repeat-rule" }, { "$ref": "#/definitions/token-rule" }, - { "$ref": "#/definitions/prec-rule" }, - { "$ref": "#/definitions/prec-left-rule" }, - { "$ref": "#/definitions/prec-right-rule" } + { "$ref": "#/definitions/prec-rule" } ] } } From 107feb79608866a71fe027b3bd2d5505b3a0e624 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Jul 2017 15:58:29 -0700 Subject: [PATCH 04/12] Bump the language version number after adding dynamic precedences --- include/tree_sitter/runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index 503387f6..852f7b7d 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -9,7 +9,7 @@ extern "C" { #include #include -#define TREE_SITTER_LANGUAGE_VERSION 2 +#define TREE_SITTER_LANGUAGE_VERSION 3 typedef unsigned short TSSymbol; typedef struct TSLanguage TSLanguage; From 0de93b3bf2619e284149e393e3464061f319ff1e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 6 Jul 2017 22:21:59 -0700 Subject: [PATCH 05/12] Allow negative dynamic precedences --- src/compiler/prepare_grammar/flatten_grammar.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index ae802287..86c76a86 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -1,6 +1,7 @@ #include "compiler/prepare_grammar/flatten_grammar.h" #include #include +#include #include #include "compiler/prepare_grammar/extract_choices.h" #include "compiler/prepare_grammar/initial_syntax_grammar.h" @@ -42,7 +43,7 @@ class FlattenRule { associativity_stack.push_back(metadata.params.associativity); } - if (metadata.params.dynamic_precedence > production.dynamic_precedence) { + if (abs(metadata.params.dynamic_precedence) > abs(production.dynamic_precedence)) { production.dynamic_precedence = metadata.params.dynamic_precedence; } From c91ceaaa8da8fa4fd99e74972a42aeabd0350db5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 7 Jul 2017 13:59:57 -0700 Subject: [PATCH 06/12] :art: build_parse_table --- .../build_tables/build_parse_table.cc | 54 ++++++++----------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index db540247..4f35c4cf 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -39,53 +39,41 @@ class ParseTableBuilder { ParseItemSetBuilder item_set_builder; set fragile_productions; vector> incompatible_tokens_by_index; - bool allow_any_conflict; + bool processing_recovery_states; public: - ParseTableBuilder(const SyntaxGrammar &grammar, - const LexicalGrammar &lex_grammar) - : grammar(grammar), - lexical_grammar(lex_grammar), - item_set_builder(grammar, lex_grammar), - allow_any_conflict(false) {} + ParseTableBuilder(const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) + : grammar(grammar), + lexical_grammar(lex_grammar), + item_set_builder(grammar, lex_grammar), + processing_recovery_states(false) {} pair build() { Symbol start_symbol = grammar.variables.empty() ? Symbol::terminal(0) : Symbol::non_terminal(0); + Production start_production{{{start_symbol, 0, rules::AssociativityNone}}, 0}; - Production start_production{ - {ProductionStep{start_symbol, 0, rules::AssociativityNone}}, - 0 - }; - - // Placeholder for error state - add_parse_state(ParseItemSet()); - + ParseStateId error_state_id = add_parse_state(ParseItemSet()); add_parse_state(ParseItemSet({ { ParseItem(rules::START(), start_production, 0), - LookaheadSet({ END_OF_INPUT() }), + LookaheadSet({END_OF_INPUT()}), }, })); CompileError error = process_part_state_queue(); - if (error.type != TSCompileErrorTypeNone) { - return { parse_table, error }; - } + if (error.type != TSCompileErrorTypeNone) return {parse_table, error}; compute_unmergable_token_pairs(); - build_error_parse_state(); - - allow_any_conflict = true; + processing_recovery_states = true; + build_error_parse_state(error_state_id); process_part_state_queue(); - allow_any_conflict = false; mark_fragile_actions(); remove_duplicate_parse_states(); - - return { parse_table, CompileError::none() }; + return {parse_table, CompileError::none()}; } private: @@ -107,7 +95,7 @@ class ParseTableBuilder { return CompileError::none(); } - void build_error_parse_state() { + void build_error_parse_state(ParseStateId state_id) { ParseState error_state; for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { @@ -142,7 +130,7 @@ class ParseTableBuilder { } error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); - parse_table.states[0] = error_state; + parse_table.states[state_id] = error_state; } void add_out_of_context_parse_state(ParseState *error_state, @@ -198,7 +186,7 @@ class ParseTableBuilder { parse_table.add_terminal_action(state_id, lookahead, action); } else { ParseAction &existing_action = entry.actions[0]; - if (existing_action.type == ParseActionTypeAccept || allow_any_conflict) { + if (existing_action.type == ParseActionTypeAccept || processing_recovery_states) { entry.actions.push_back(action); } else { int existing_precedence = existing_action.precedence(); @@ -241,9 +229,10 @@ class ParseTableBuilder { ParseState &state = parse_table.states[state_id]; bool had_existing_action = !state.terminal_entries[lookahead].actions.empty(); parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id)); - if (!allow_any_conflict) { - if (had_existing_action) + if (!processing_recovery_states) { + if (had_existing_action) { lookaheads_with_conflicts.insert(lookahead); + } recovery_states[lookahead].add(next_item_set); } } @@ -254,8 +243,9 @@ class ParseTableBuilder { ParseItemSet &next_item_set = pair.second; ParseStateId next_state = add_parse_state(next_item_set); parse_table.set_nonterminal_action(state_id, lookahead, next_state); - if (!allow_any_conflict) + if (!processing_recovery_states) { recovery_states[Symbol::non_terminal(lookahead)].add(next_item_set); + } } for (Symbol lookahead : lookaheads_with_conflicts) { @@ -267,7 +257,7 @@ class ParseTableBuilder { ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { if (!state.terminal_entries.count(extra_symbol) || - state.has_shift_action() || allow_any_conflict) { + state.has_shift_action() || processing_recovery_states) { parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } From a98abde529e5201b54dfdc533d1d7ae61f5aa0e5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 7 Jul 2017 14:42:18 -0700 Subject: [PATCH 07/12] Provide all preceding symbols as context when reporting conflicts --- .../build_tables/build_parse_table.cc | 91 +++++++++++-------- .../expected_error.txt | 6 +- 2 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 4f35c4cf..e6a93099 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -16,11 +17,13 @@ namespace tree_sitter { namespace build_tables { +using std::deque; using std::find; using std::pair; using std::vector; using std::set; using std::map; +using std::move; using std::string; using std::to_string; using std::unordered_map; @@ -28,12 +31,20 @@ using rules::Associativity; using rules::Symbol; using rules::END_OF_INPUT; +using SymbolSequence = vector; + +struct ParseStateQueueEntry { + SymbolSequence preceding_symbols; + ParseItemSet item_set; + ParseStateId state_id; +}; + class ParseTableBuilder { const SyntaxGrammar grammar; const LexicalGrammar lexical_grammar; unordered_map recovery_states; unordered_map parse_state_ids; - vector> item_sets_to_process; + deque parse_state_queue; ParseTable parse_table; set conflicts; ParseItemSetBuilder item_set_builder; @@ -54,8 +65,8 @@ class ParseTableBuilder { Symbol::non_terminal(0); Production start_production{{{start_symbol, 0, rules::AssociativityNone}}, 0}; - ParseStateId error_state_id = add_parse_state(ParseItemSet()); - add_parse_state(ParseItemSet({ + ParseStateId error_state_id = add_parse_state({}, ParseItemSet()); + add_parse_state({}, ParseItemSet({ { ParseItem(rules::START(), start_production, 0), LookaheadSet({END_OF_INPUT()}), @@ -78,14 +89,16 @@ class ParseTableBuilder { private: CompileError process_part_state_queue() { - while (!item_sets_to_process.empty()) { - auto pair = item_sets_to_process.back(); - ParseItemSet &item_set = pair.first; - ParseStateId state_id = pair.second; - item_sets_to_process.pop_back(); + while (!parse_state_queue.empty()) { + auto entry = parse_state_queue.front(); + parse_state_queue.pop_front(); - item_set_builder.apply_transitive_closure(&item_set); - string conflict = add_actions(item_set, state_id); + item_set_builder.apply_transitive_closure(&entry.item_set); + string conflict = add_actions( + move(entry.preceding_symbols), + move(entry.item_set), + entry.state_id + ); if (!conflict.empty()) { return CompileError(TSCompileErrorTypeParseConflict, conflict); @@ -137,7 +150,7 @@ class ParseTableBuilder { const rules::Symbol &symbol) { const ParseItemSet &item_set = recovery_states[symbol]; if (!item_set.entries.empty()) { - ParseStateId state = add_parse_state(item_set); + ParseStateId state = add_parse_state({}, item_set); if (symbol.is_non_terminal()) { error_state->nonterminal_entries[symbol.index] = state; } else { @@ -146,21 +159,25 @@ class ParseTableBuilder { } } - ParseStateId add_parse_state(const ParseItemSet &item_set) { + ParseStateId add_parse_state(SymbolSequence &&preceding_symbols, const ParseItemSet &item_set) { auto pair = parse_state_ids.find(item_set); if (pair == parse_state_ids.end()) { ParseStateId state_id = parse_table.states.size(); parse_table.states.push_back(ParseState()); parse_state_ids[item_set] = state_id; parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature(); - item_sets_to_process.push_back({ std::move(item_set), state_id }); + parse_state_queue.push_back({ + move(preceding_symbols), + move(item_set), + state_id + }); return state_id; } else { return pair->second; } } - string add_actions(const ParseItemSet &item_set, ParseStateId state_id) { + string add_actions(SymbolSequence &&sequence, ParseItemSet &&item_set, ParseStateId state_id) { map terminal_successors; map nonterminal_successors; set lookaheads_with_conflicts; @@ -225,7 +242,7 @@ class ParseTableBuilder { for (auto &pair : terminal_successors) { Symbol lookahead = pair.first; ParseItemSet &next_item_set = pair.second; - ParseStateId next_state_id = add_parse_state(next_item_set); + ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); ParseState &state = parse_table.states[state_id]; bool had_existing_action = !state.terminal_entries[lookahead].actions.empty(); parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id)); @@ -239,17 +256,17 @@ class ParseTableBuilder { // Add a Shift action for each non-terminal transition. for (auto &pair : nonterminal_successors) { - Symbol::Index lookahead = pair.first; + Symbol lookahead = Symbol::non_terminal(pair.first); ParseItemSet &next_item_set = pair.second; - ParseStateId next_state = add_parse_state(next_item_set); - parse_table.set_nonterminal_action(state_id, lookahead, next_state); + ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); + parse_table.set_nonterminal_action(state_id, lookahead.index, next_state_id); if (!processing_recovery_states) { - recovery_states[Symbol::non_terminal(lookahead)].add(next_item_set); + recovery_states[lookahead].add(next_item_set); } } for (Symbol lookahead : lookaheads_with_conflicts) { - string conflict = handle_conflict(item_set, state_id, lookahead); + string conflict = handle_conflict(item_set, sequence, state_id, lookahead); if (!conflict.empty()) return conflict; } @@ -453,8 +470,8 @@ class ParseTableBuilder { return true; } - string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, - Symbol lookahead) { + string handle_conflict(const ParseItemSet &item_set, const SymbolSequence &preceding_symbols, + ParseStateId state_id, Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; int reduction_precedence = entry.actions.front().precedence(); set shift_items; @@ -548,24 +565,13 @@ class ParseTableBuilder { if (expected_conflict == actual_conflict) return ""; - ParseItem earliest_starting_item; - for (const ParseAction &action : entry.actions) - if (action.type == ParseActionTypeReduce) - if (action.consumed_symbol_count > earliest_starting_item.step_index) - earliest_starting_item = ParseItem(action.symbol, *action.production, action.consumed_symbol_count); - - for (const ParseItem &shift_item : shift_items) - if (shift_item.step_index > earliest_starting_item.step_index) - earliest_starting_item = shift_item; - string description = "Unresolved conflict for symbol sequence:\n\n"; - for (size_t i = 0; i < earliest_starting_item.step_index; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (auto &symbol : preceding_symbols) { + description += " " + symbol_name(symbol); } description += " \u2022 " + symbol_name(lookahead) + " \u2026"; description += "\n\n"; - description += "Possible interpretations:\n\n"; size_t interpretation_count = 1; @@ -573,8 +579,8 @@ class ParseTableBuilder { if (action.type == ParseActionTypeReduce) { description += " " + to_string(interpretation_count++) + ":"; - for (size_t i = 0; i < earliest_starting_item.step_index - action.consumed_symbol_count; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (size_t i = 0; i < preceding_symbols.size() - action.consumed_symbol_count; i++) { + description += " " + symbol_name(preceding_symbols[i]); } description += " (" + symbol_name(action.symbol); @@ -590,8 +596,8 @@ class ParseTableBuilder { for (const ParseItem &shift_item : shift_items) { description += " " + to_string(interpretation_count++) + ":"; - for (size_t i = 0; i < earliest_starting_item.step_index - shift_item.step_index; i++) { - description += " " + symbol_name(earliest_starting_item.production->at(i).symbol); + for (size_t i = 0; i < preceding_symbols.size() - shift_item.step_index; i++) { + description += " " + symbol_name(preceding_symbols[i]); } description += " (" + symbol_name(shift_item.lhs()); @@ -682,6 +688,13 @@ class ParseTableBuilder { bool has_fragile_production(const Production *production) { return fragile_productions.find(production) != fragile_productions.end(); } + + SymbolSequence append_symbol(const SymbolSequence &sequence, const Symbol &symbol) { + SymbolSequence result(sequence.size() + 1); + result.assign(sequence.begin(), sequence.end()); + result.push_back(symbol); + return result; + } }; pair build_parse_table( diff --git a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt index b1be0828..6ee80f23 100644 --- a/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt +++ b/test/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt @@ -1,11 +1,11 @@ Unresolved conflict for symbol sequence: - identifier • '{' … + identifier identifier • '{' … Possible interpretations: - 1: (expression identifier) • '{' … - 2: (function_call identifier • block) + 1: identifier (expression identifier) • '{' … + 2: identifier (function_call identifier • block) Possible resolutions: From 1586d70cbe4a70e3f02ea28cf149199096d75106 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 7 Jul 2017 17:54:24 -0700 Subject: [PATCH 08/12] Compute conflicting tokens more precisely While generating the parse table, keep track of which tokens can follow one another. Then use this information to evaluate token conflicts more precisely. This will result in a smaller parse table than the previous, overly-conservative approach. --- .../build_tables/build_parse_table.cc | 24 ++++- .../build_tables/lex_table_builder.cc | 102 +++++++++++------- src/compiler/build_tables/lex_table_builder.h | 8 +- .../build_tables/parse_item_set_builder.cc | 33 +++++- .../build_tables/parse_item_set_builder.h | 2 + .../build_tables/lex_table_builder_test.cc | 52 +++++---- 6 files changed, 160 insertions(+), 61 deletions(-) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index e6a93099..bcb602d5 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -50,6 +50,7 @@ class ParseTableBuilder { ParseItemSetBuilder item_set_builder; set fragile_productions; vector> incompatible_tokens_by_index; + vector> following_terminals_by_terminal_index; bool processing_recovery_states; public: @@ -57,6 +58,8 @@ class ParseTableBuilder { : grammar(grammar), lexical_grammar(lex_grammar), item_set_builder(grammar, lex_grammar), + incompatible_tokens_by_index(lexical_grammar.variables.size()), + following_terminals_by_terminal_index(lexical_grammar.variables.size()), processing_recovery_states(false) {} pair build() { @@ -314,8 +317,6 @@ class ParseTableBuilder { } void compute_unmergable_token_pairs() { - incompatible_tokens_by_index.resize(lexical_grammar.variables.size()); - auto lex_table_builder = LexTableBuilder::create(lexical_grammar); for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { Symbol token = Symbol::terminal(i); @@ -323,7 +324,7 @@ class ParseTableBuilder { for (unsigned j = 0; j < n; j++) { if (i == j) continue; - if (lex_table_builder->detect_conflict(i, j)) { + if (lex_table_builder->detect_conflict(i, j, following_terminals_by_terminal_index)) { incompatible_indices.insert(Symbol::terminal(j)); } } @@ -690,6 +691,23 @@ class ParseTableBuilder { } SymbolSequence append_symbol(const SymbolSequence &sequence, const Symbol &symbol) { + if (!sequence.empty()) { + const LookaheadSet &left_tokens = item_set_builder.get_last_set(sequence.back()); + const LookaheadSet &right_tokens = item_set_builder.get_first_set(symbol); + + if (!left_tokens.empty() && !right_tokens.empty()) { + for (const Symbol &left_symbol : *left_tokens.entries) { + if (left_symbol.is_terminal() && !left_symbol.is_built_in()) { + for (const Symbol &right_symbol : *right_tokens.entries) { + if (right_symbol.is_terminal() && !right_symbol.is_built_in()) { + following_terminals_by_terminal_index[left_symbol.index].insert(right_symbol.index); + } + } + } + } + } + } + SymbolSequence result(sequence.size() + 1); result.assign(sequence.begin(), sequence.end()); result.push_back(symbol); diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 4101f854..4f59a374 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "compiler/build_tables/lex_conflict_manager.h" #include "compiler/build_tables/lex_item.h" @@ -70,14 +71,16 @@ class LexTableBuilderImpl : public LexTableBuilder { LexTable lex_table; const LexicalGrammar grammar; vector separator_rules; - CharacterSet separator_start_characters; - CharacterSet token_start_characters; LexConflictManager conflict_manager; unordered_map lex_state_ids; - public: - vector shadowed_token_indices; + map following_characters_by_token_index; + CharacterSet separator_start_characters; + CharacterSet current_conflict_detection_following_characters; + Symbol::Index current_conflict_detection_token_index; + bool current_conflict_value; + public: LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) { StartingCharacterAggregator separator_character_aggregator; for (const auto &rule : grammar.separators) { @@ -86,20 +89,6 @@ class LexTableBuilderImpl : public LexTableBuilder { } separator_rules.push_back(Blank{}); separator_start_characters = separator_character_aggregator.result; - - StartingCharacterAggregator token_start_character_aggregator; - for (const auto &variable : grammar.variables) { - token_start_character_aggregator.apply(variable.rule); - } - token_start_characters = token_start_character_aggregator.result; - token_start_characters - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_') - .exclude('$'); - - shadowed_token_indices.resize(grammar.variables.size()); } LexTable build(ParseTable *parse_table) { @@ -113,7 +102,10 @@ class LexTableBuilderImpl : public LexTableBuilder { return lex_table; } - bool detect_conflict(Symbol::Index left, Symbol::Index right) { + bool detect_conflict(Symbol::Index left, Symbol::Index right, + const vector> &following_terminals_by_terminal_index) { + clear(); + StartingCharacterAggregator left_starting_characters; StartingCharacterAggregator right_starting_characters; left_starting_characters.apply(grammar.variables[left].rule); @@ -124,12 +116,47 @@ class LexTableBuilderImpl : public LexTableBuilder { return false; } - clear(); - map terminals; - terminals[Symbol::terminal(left)]; - terminals[Symbol::terminal(right)]; - add_lex_state(item_set_for_terminals(terminals)); - return shadowed_token_indices[right]; + auto following_characters_entry = following_characters_by_token_index.find(right); + if (following_characters_entry == following_characters_by_token_index.end()) { + StartingCharacterAggregator aggregator; + for (auto following_token_index : following_terminals_by_terminal_index[right]) { + aggregator.apply(grammar.variables[following_token_index].rule); + } + following_characters_entry = + following_characters_by_token_index.insert({right, aggregator.result}).first; + + // TODO - Refactor this. In general, a keyword token cannot be followed immediately by + // another alphanumeric character. But this requirement is currently not expressed anywhere in + // the grammar. So without this hack, we would be overly conservative about merging parse + // states because we would often consider `identifier` tokens to *conflict* with keyword + // tokens. + if (is_keyword(grammar.variables[right])) { + following_characters_entry->second + .exclude('a', 'z') + .exclude('A', 'Z') + .exclude('0', '9') + .exclude('_') + .exclude('$'); + } + } + + current_conflict_detection_token_index = right; + current_conflict_detection_following_characters = following_characters_entry->second; + add_lex_state(item_set_for_terminals({{Symbol::terminal(left), {}}, {Symbol::terminal(right), {}}})); + return current_conflict_value; + } + + bool is_keyword(const LexicalVariable &variable) { + return variable.is_string && iswalpha(get_last_character(variable.rule)); + } + + static uint32_t get_last_character(const Rule &rule) { + return rule.match( + [](const Seq &sequence) { return get_last_character(*sequence.right); }, + [](const rules::CharacterSet &rule) { return *rule.included_chars.begin(); }, + [](const rules::Metadata &rule) { return get_last_character(*rule.rule); }, + [](auto) { return 0; } + ); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -149,7 +176,8 @@ class LexTableBuilderImpl : public LexTableBuilder { void clear() { lex_table.states.clear(); lex_state_ids.clear(); - shadowed_token_indices.assign(grammar.variables.size(), false); + current_conflict_detection_following_characters = CharacterSet(); + current_conflict_value = false; } private: @@ -166,17 +194,18 @@ class LexTableBuilderImpl : public LexTableBuilder { for (const LexItem &item : transition.destination.entries) { if (item.lhs == accept_action.symbol) { can_advance_for_accepted_token = true; - } else if (!prefer_advancing && !transition.in_main_token && !item.lhs.is_built_in()) { - shadowed_token_indices[item.lhs.index] = true; + } else if (item.lhs.index == current_conflict_detection_token_index && + !prefer_advancing && !transition.in_main_token) { + current_conflict_value = true; } } - if (!can_advance_for_accepted_token) { - if (characters.intersects(separator_start_characters) || - (grammar.variables[accept_action.symbol.index].is_string && - characters.intersects(token_start_characters))) { - shadowed_token_indices[accept_action.symbol.index] = true; - } + if (accept_action.symbol.index == current_conflict_detection_token_index && + !can_advance_for_accepted_token && + (characters.intersects(separator_start_characters) || + (characters.intersects(current_conflict_detection_following_characters) && + grammar.variables[accept_action.symbol.index].is_string))) { + current_conflict_value = true; } if (!prefer_advancing) continue; @@ -346,8 +375,9 @@ LexTable LexTableBuilder::build(ParseTable *parse_table) { return static_cast(this)->build(parse_table); } -bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right) { - return static_cast(this)->detect_conflict(left, right); +bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right, + const vector> &following_terminals) { + return static_cast(this)->detect_conflict(left, right, following_terminals); } } // namespace build_tables diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 91f24f70..3b896bb7 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -2,6 +2,8 @@ #define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ #include +#include +#include #include "compiler/lex_table.h" namespace tree_sitter { @@ -15,7 +17,11 @@ class LexTableBuilder { public: static std::unique_ptr create(const LexicalGrammar &); LexTable build(ParseTable *); - bool detect_conflict(rules::Symbol::Index, rules::Symbol::Index); + bool detect_conflict( + rules::Symbol::Index, + rules::Symbol::Index, + const std::vector> &following_terminals_by_terminal_index + ); protected: LexTableBuilder() = default; }; diff --git a/src/compiler/build_tables/parse_item_set_builder.cc b/src/compiler/build_tables/parse_item_set_builder.cc index 36c3942f..77fde864 100644 --- a/src/compiler/build_tables/parse_item_set_builder.cc +++ b/src/compiler/build_tables/parse_item_set_builder.cc @@ -1,4 +1,5 @@ #include "compiler/build_tables/parse_item_set_builder.h" +#include #include #include #include @@ -26,18 +27,20 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::terminal(i); - first_sets.insert({symbol, LookaheadSet({ symbol })}); + first_sets.insert({symbol, LookaheadSet({symbol})}); + last_sets.insert({symbol, LookaheadSet({symbol})}); } for (size_t i = 0, n = grammar.external_tokens.size(); i < n; i++) { Symbol symbol = Symbol::external(i); - first_sets.insert({symbol, LookaheadSet({ symbol })}); + first_sets.insert({symbol, LookaheadSet({symbol})}); + last_sets.insert({symbol, LookaheadSet({symbol})}); } for (size_t i = 0, n = grammar.variables.size(); i < n; i++) { Symbol symbol = Symbol::non_terminal(i); - LookaheadSet first_set; + LookaheadSet first_set; processed_non_terminals.clear(); symbols_to_process.clear(); symbols_to_process.push_back(symbol); @@ -57,6 +60,26 @@ ParseItemSetBuilder::ParseItemSetBuilder(const SyntaxGrammar &grammar, } first_sets.insert({symbol, first_set}); + + LookaheadSet last_set; + processed_non_terminals.clear(); + symbols_to_process.clear(); + symbols_to_process.push_back(symbol); + while (!symbols_to_process.empty()) { + Symbol current_symbol = symbols_to_process.back(); + symbols_to_process.pop_back(); + + if (!current_symbol.is_non_terminal()) { + last_set.insert(current_symbol); + } else if (processed_non_terminals.insert(current_symbol.index).second) { + for (const Production &production : grammar.variables[current_symbol.index].productions) { + if (!production.empty()) { + symbols_to_process.push_back(production.back().symbol); + } + } + } + } + last_sets.insert({symbol, last_set}); } vector components_to_process; @@ -161,5 +184,9 @@ LookaheadSet ParseItemSetBuilder::get_first_set(const rules::Symbol &symbol) con return first_sets.find(symbol)->second; } +LookaheadSet ParseItemSetBuilder::get_last_set(const rules::Symbol &symbol) const { + return last_sets.find(symbol)->second; +} + } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/parse_item_set_builder.h b/src/compiler/build_tables/parse_item_set_builder.h index a319d698..b0334e68 100644 --- a/src/compiler/build_tables/parse_item_set_builder.h +++ b/src/compiler/build_tables/parse_item_set_builder.h @@ -20,6 +20,7 @@ class ParseItemSetBuilder { }; std::map first_sets; + std::map last_sets; std::map> component_cache; std::vector> item_set_buffer; @@ -27,6 +28,7 @@ class ParseItemSetBuilder { ParseItemSetBuilder(const SyntaxGrammar &, const LexicalGrammar &); void apply_transitive_closure(ParseItemSet *); LookaheadSet get_first_set(const rules::Symbol &) const; + LookaheadSet get_last_set(const rules::Symbol &) const; }; } // namespace build_tables diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc index 7376bd2c..e9f70aee 100644 --- a/test/compiler/build_tables/lex_table_builder_test.cc +++ b/test/compiler/build_tables/lex_table_builder_test.cc @@ -16,7 +16,7 @@ describe("LexTableBuilder::detect_conflict", []() { auto builder = LexTableBuilder::create(LexicalGrammar{ { LexicalVariable{ - "token_1", + "token_0", VariableTypeNamed, Rule::seq({ CharacterSet({ 'a' }), @@ -26,7 +26,7 @@ describe("LexTableBuilder::detect_conflict", []() { false }, LexicalVariable{ - "token_2", + "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ 'b' }), @@ -39,22 +39,22 @@ describe("LexTableBuilder::detect_conflict", []() { separators }); - AssertThat(builder->detect_conflict(0, 1), IsFalse()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsFalse()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); }); - it("returns true when one token matches a string that the other matches, " - "plus some addition content that begins with a separator character", [&]() { + it("returns true when the left token can match a string that the right token matches, " + "plus a separator character", [&]() { LexicalGrammar grammar{ { LexicalVariable{ - "token_1", + "token_0", VariableTypeNamed, Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/ false }, LexicalVariable{ - "token_2", + "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc' true @@ -64,24 +64,32 @@ describe("LexTableBuilder::detect_conflict", []() { }; auto builder = LexTableBuilder::create(grammar); - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); grammar.variables[1].is_string = false; - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); }); - it("returns true when one token matches a string that the other matches, " - "plus some addition content that matches another one-character token", [&]() { + it("returns true when the left token matches a string that the right token matches, " + "plus the first character of some token that can follow the right token", [&]() { LexicalGrammar grammar{ { + LexicalVariable{ + "token_0", + VariableTypeNamed, + Rule::seq({ + CharacterSet({ '>' }), + CharacterSet({ '=' }), + }), + true + }, LexicalVariable{ "token_1", VariableTypeNamed, Rule::seq({ CharacterSet({ '>' }), - CharacterSet({ '>' }), }), true }, @@ -89,7 +97,7 @@ describe("LexTableBuilder::detect_conflict", []() { "token_2", VariableTypeNamed, Rule::seq({ - CharacterSet({ '>' }), + CharacterSet({ '=' }), }), true }, @@ -97,9 +105,17 @@ describe("LexTableBuilder::detect_conflict", []() { separators }; + // If no tokens can follow token_1, then there's no conflict auto builder = LexTableBuilder::create(grammar); - AssertThat(builder->detect_conflict(0, 1), IsTrue()); - AssertThat(builder->detect_conflict(1, 0), IsFalse()); + vector> following_tokens_by_token_index(3); + AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsFalse()); + AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); + + // If token_2 can follow token_1, then token_0 conflicts with token_1 + builder = LexTableBuilder::create(grammar); + following_tokens_by_token_index[1].insert(2); + AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsTrue()); + AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); }); }); From 2755b07222573c2589cfbe01c454aced44be8654 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Jul 2017 10:39:16 -0700 Subject: [PATCH 09/12] Don't store unfinished item signature on ParseStates --- externals/crypto-algorithms | 2 +- .../build_tables/build_parse_table.cc | 45 +++++++++---------- src/compiler/build_tables/parse_item.cc | 13 +++--- src/compiler/parse_table.h | 1 - 4 files changed, 29 insertions(+), 32 deletions(-) diff --git a/externals/crypto-algorithms b/externals/crypto-algorithms index cfbde484..c7e5c23a 160000 --- a/externals/crypto-algorithms +++ b/externals/crypto-algorithms @@ -1 +1 @@ -Subproject commit cfbde48414baacf51fc7c74f275190881f037d32 +Subproject commit c7e5c23ab04ecfb5465cbefbe17ba23d4cb3bc9d diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index bcb602d5..5dd12fa8 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -168,7 +168,6 @@ class ParseTableBuilder { ParseStateId state_id = parse_table.states.size(); parse_table.states.push_back(ParseState()); parse_state_ids[item_set] = state_id; - parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature(); parse_state_queue.push_back({ move(preceding_symbols), move(item_set), @@ -340,11 +339,12 @@ class ParseTableBuilder { } void remove_duplicate_parse_states() { - map> state_indices_by_signature; + unordered_map> state_indices_by_signature; - for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - ParseState &state = parse_table.states[i]; - state_indices_by_signature[state.shift_actions_signature].insert(i); + for (auto &pair : parse_state_ids) { + const ParseItemSet &item_set = pair.first; + ParseStateId state_id = pair.second; + state_indices_by_signature[item_set.unfinished_item_signature()].insert(state_id); } set deleted_states; @@ -353,14 +353,18 @@ class ParseTableBuilder { map state_replacements; for (auto &pair : state_indices_by_signature) { - auto &state_group = pair.second; + auto &state_indices = pair.second; - for (ParseStateId i : state_group) { - for (ParseStateId j : state_group) { - if (j == i) break; - if (!state_replacements.count(j) && merge_parse_state(j, i)) { - state_replacements.insert({ i, j }); - deleted_states.insert(i); + for (auto i = state_indices.begin(), end = state_indices.end(); i != end;) { + for (ParseStateId j : state_indices) { + if (j == *i) { + ++i; + break; + } + if (!state_replacements.count(j) && merge_parse_state(j, *i)) { + state_replacements.insert({*i, j}); + deleted_states.insert(*i); + i = state_indices.erase(i); break; } } @@ -370,11 +374,8 @@ class ParseTableBuilder { if (state_replacements.empty()) break; for (ParseStateId i = 0, n = parse_table.states.size(); i < n; i++) { - ParseState &state = parse_table.states[i]; - - if (state_replacements.count(i)) { - state_indices_by_signature[state.shift_actions_signature].erase(i); - } else { + if (!state_replacements.count(i)) { + ParseState &state = parse_table.states[i]; state.each_referenced_state([&state_replacements](ParseStateId *state_index) { auto replacement = state_replacements.find(*state_index); if (replacement != state_replacements.end()) { @@ -414,7 +415,7 @@ class ParseTableBuilder { static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { for (const auto &pair : state.terminal_entries) - if (pair.second == entry) + if (pair.second.actions == entry.actions) return true; return false; } @@ -427,13 +428,12 @@ class ParseTableBuilder { for (auto &entry : state.terminal_entries) { Symbol lookahead = entry.first; - const auto &other_entry = other.terminal_entries.find(lookahead); + const auto &other_entry = other.terminal_entries.find(lookahead); if (other_entry == other.terminal_entries.end()) { + if (lookahead.is_external()) return false; if (entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(other, entry.second)) return false; - - if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) { if (other.terminal_entries.count(incompatible_token)) return false; @@ -450,10 +450,9 @@ class ParseTableBuilder { Symbol lookahead = entry.first; if (!state.terminal_entries.count(lookahead)) { + if (lookahead.is_external()) return false; if (entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_entry(state, entry.second)) return false; - - if (lookahead.is_external()) return false; if (!lookahead.is_built_in()) { for (const Symbol &incompatible_token : incompatible_tokens_by_index[lookahead.index]) { if (state.terminal_entries.count(incompatible_token)) return false; diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index b672bb1b..99acb3fc 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -97,13 +97,12 @@ size_t ParseItemSet::unfinished_item_signature() const { ParseItem previous_item; for (auto &pair : entries) { const ParseItem &item = pair.first; - if (item.step_index < item.production->size()) { - if (item.variable_index != previous_item.variable_index && - item.step_index != previous_item.step_index) { - hash_combine(&result, item.variable_index); - hash_combine(&result, item.step_index); - previous_item = item; - } + if (item.step_index < item.production->size() && + (item.variable_index != previous_item.variable_index || + item.step_index != previous_item.step_index)) { + hash_combine(&result, item.variable_index); + hash_combine(&result, item.step_index); + previous_item = item; } } return result; diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 555dad31..8a127bed 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -69,7 +69,6 @@ struct ParseState { std::map terminal_entries; std::map nonterminal_entries; LexStateId lex_state_id; - size_t shift_actions_signature; }; struct ParseTableSymbolMetadata { From 59236d2ed105907bb76ae27fdcca6a1631dbf8c3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Jul 2017 14:09:31 -0700 Subject: [PATCH 10/12] Avoid redundant character comparisons in generated lex function --- src/compiler/generate_code/c_code.cc | 115 +++++++++++++++------- src/compiler/rules/character_set.cc | 38 +++---- test/compiler/rules/character_set_test.cc | 22 +---- 3 files changed, 101 insertions(+), 74 deletions(-) diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index b2359233..1d5ff690 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -401,52 +401,106 @@ class CCodeGenerator { add_accept_token_action(lex_state.accept_action); } + set ruled_out_characters; for (const auto &pair : lex_state.advance_actions) { - if (!pair.first.is_empty()) { - _if([&]() { add_character_set_condition(pair.first); }, - [&]() { add_advance_action(pair.second); }); + if (pair.first.is_empty()) continue; + + size_t current_length = buffer.size(); + + line("if ("); + if (add_character_set_condition(pair.first, ruled_out_characters)) { + add(")"); + indent([&]() { add_advance_action(pair.second); }); + ruled_out_characters.insert(pair.first.included_chars.begin(), pair.first.included_chars.end()); + } else { + buffer.resize(current_length); + add_advance_action(pair.second); } } line("END_STATE();"); } - void add_character_set_condition(const rules::CharacterSet &rule) { + bool add_character_set_condition(const rules::CharacterSet &rule, const set &ruled_out_characters) { if (rule.includes_all) { - add("!("); - add_character_range_conditions(rule.excluded_ranges()); - add(")"); + return add_character_range_conditions(rule.excluded_ranges(), ruled_out_characters, true); } else { - add_character_range_conditions(rule.included_ranges()); + return add_character_range_conditions(rule.included_ranges(), ruled_out_characters, false); } } - void add_character_range_conditions(const vector &ranges) { - if (ranges.size() == 1) { - add_character_range_condition(*ranges.begin()); - } else { - bool first = true; - for (const auto &range : ranges) { - if (!first) { - add(" ||"); - line(" "); + bool add_character_range_conditions(const vector &ranges, + const set &ruled_out_characters, + bool is_negated) { + bool first = true; + for (auto iter = ranges.begin(), end = ranges.end(); iter != end;) { + auto range = *iter; + + bool range_is_ruled_out = true; + for (uint32_t c = range.min; c <= range.max; c++) { + if (!ruled_out_characters.count(c)) { + range_is_ruled_out = false; + break; + } + } + + if (range_is_ruled_out) { + ++iter; + continue; + } + + auto next_iter = iter + 1; + while (next_iter != end) { + bool can_join_ranges = true; + for (uint32_t character = range.max + 1; character < next_iter->min; character++) { + if (!ruled_out_characters.count(character)) { + can_join_ranges = false; + break; + } } - add("("); - add_character_range_condition(range); - add(")"); - - first = false; + if (can_join_ranges) { + range.max = next_iter->max; + ++next_iter; + } else { + break; + } } + + if (!first) { + add(is_negated ? " &&" : " ||"); + line(" "); + } + + add_character_range_condition(range, is_negated); + first = false; + iter = next_iter; } + + return !first; } - void add_character_range_condition(const rules::CharacterRange &range) { - if (range.min == range.max) { - add("lookahead == " + escape_char(range.min)); + void add_character_range_condition(const rules::CharacterRange &range, bool is_negated) { + auto min = escape_char(range.min); + auto max = escape_char(range.max); + if (is_negated) { + if (range.max == range.min) { + add("lookahead != " + min); + } else if (range.max == range.min + 1) { + add("lookahead != " + min + " &&"); + line(" lookahead != " + max); + } else { + add("(lookahead < " + min + " || lookahead > " + max + ")"); + } } else { - add(escape_char(range.min) + string(" <= lookahead && lookahead <= ") + - escape_char(range.max)); + if (range.max == range.min) { + add("lookahead == " + min); + } else if (range.max == range.min + 1) { + add("lookahead == " + min + " ||"); + line(" lookahead == " + max); + } else { + add("(" + min + " <= lookahead && lookahead <= " + max + ")"); + } } } @@ -599,13 +653,6 @@ class CCodeGenerator { indent(body); } - void _if(function condition, function body) { - line("if ("); - indent(condition); - add(")"); - indent(body); - } - string sanitize_name_for_string(string name) { util::str_replace(&name, "\\", "\\\\"); util::str_replace(&name, "\n", "\\n"); diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc index 5b0c3464..c199368d 100644 --- a/src/compiler/rules/character_set.cc +++ b/src/compiler/rules/character_set.cc @@ -38,20 +38,11 @@ static set add_chars(set *left, const set &right) return result; } -static vector consolidate_ranges(const set &chars) { +static vector consolidate_ranges(const set &characters) { vector result; - for (uint32_t c : chars) { - auto size = result.size(); - if (size >= 2 && result[size - 2].max == (c - 2)) { - result.pop_back(); + for (uint32_t c : characters) { + if (!result.empty() && result.back().max == c - 1) { result.back().max = c; - } else if (size >= 1) { - CharacterRange &last = result.back(); - if (last.min < last.max && last.max == (c - 1)) { - last.max = c; - } else { - result.push_back(CharacterRange(c)); - } } else { result.push_back(CharacterRange(c)); } @@ -70,15 +61,17 @@ bool CharacterSet::operator==(const CharacterSet &other) const { } bool CharacterSet::operator<(const CharacterSet &other) const { - if (!includes_all && other.includes_all) - return true; - if (includes_all && !other.includes_all) - return false; - if (included_chars < other.included_chars) - return true; - if (other.included_chars < included_chars) - return false; - return excluded_chars < other.excluded_chars; + if (!includes_all && other.includes_all) return true; + if (includes_all && !other.includes_all) return false; + if (includes_all) { + if (excluded_chars.size() > other.excluded_chars.size()) return true; + if (excluded_chars.size() < other.excluded_chars.size()) return false; + return excluded_chars < other.excluded_chars; + } else { + if (included_chars.size() < other.included_chars.size()) return true; + if (included_chars.size() > other.included_chars.size()) return false; + return included_chars < other.included_chars; + } } CharacterSet &CharacterSet::include_all() { @@ -131,8 +124,7 @@ void CharacterSet::add_set(const CharacterSet &other) { excluded_chars.insert(c); included_chars.clear(); } else { - for (uint32_t c : other.included_chars) - included_chars.insert(c); + included_chars.insert(other.included_chars.begin(), other.included_chars.end()); } } } diff --git a/test/compiler/rules/character_set_test.cc b/test/compiler/rules/character_set_test.cc index f7c2e632..dfe67604 100644 --- a/test/compiler/rules/character_set_test.cc +++ b/test/compiler/rules/character_set_test.cc @@ -305,29 +305,17 @@ describe("CharacterSet", []() { }); describe("::included_ranges", [&]() { - it("consolidates sequences of 3 or more consecutive characters into ranges", [&]() { + it("consolidates consecutive sequences of characters into ranges", [&]() { CharacterSet set1 = CharacterSet() .include('a', 'c') - .include('g') + .include('e', 'j') + .include('m') .include('z'); AssertThat(set1.included_ranges(), Equals(vector({ CharacterRange{'a', 'c'}, - CharacterRange('g'), - CharacterRange('z'), - }))); - }); - - it("doesn't consolidate sequences of 2 consecutive characters", [&]() { - CharacterSet set1 = CharacterSet() - .include('a', 'b') - .include('g') - .include('z'); - - AssertThat(set1.included_ranges(), Equals(vector({ - CharacterRange('a'), - CharacterRange('b'), - CharacterRange('g'), + CharacterRange{'e', 'j'}, + CharacterRange('m'), CharacterRange('z'), }))); }); From bf4b8bf55b7fbb830e216455ea9980913cfce0d5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Jul 2017 14:29:14 -0700 Subject: [PATCH 11/12] Use my fork of crypto-algorithms --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 72fbdb67..bdbfaaf8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,4 +12,4 @@ url = https://github.com/udp/json-parser.git [submodule "externals/crypto-algorithms"] path = externals/crypto-algorithms - url = https://github.com/B-Con/crypto-algorithms.git + url = https://github.com/maxbrunsfeld/crypto-algorithms.git From 5bd5b4bb052ccd250c7af0d3f7f0f8160ad59841 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 10 Jul 2017 14:35:14 -0700 Subject: [PATCH 12/12] Replace -> --- src/compiler/build_tables/lex_table_builder.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 4f59a374..78413baa 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include "compiler/build_tables/lex_conflict_manager.h" #include "compiler/build_tables/lex_item.h" @@ -16,6 +16,7 @@ namespace tree_sitter { namespace build_tables { +using std::iswalpha; using std::map; using std::pair; using std::set;