From 68a0e16d1ec6c3ec7759681fd0be912ec193e892 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sat, 17 Jan 2015 14:10:56 -0800 Subject: [PATCH 1/4] Add void specialization of RuleFn template --- src/compiler/rules/visitor.h | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/compiler/rules/visitor.h b/src/compiler/rules/visitor.h index 66dfd4b2..7507e55d 100644 --- a/src/compiler/rules/visitor.h +++ b/src/compiler/rules/visitor.h @@ -90,6 +90,59 @@ class RuleFn : private Visitor { T value_; }; +template <> +class RuleFn : private Visitor { + public: + void apply(const rule_ptr &rule) { + rule->accept(this); + } + + protected: + virtual void default_apply(const Rule *rule) {} + + virtual void apply_to(const Blank *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const CharacterSet *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Choice *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Metadata *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Pattern *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Repeat *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Seq *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const String *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const NamedSymbol *rule) { + return default_apply((const Rule *)rule); + } + virtual void apply_to(const Symbol *rule) { + return default_apply((const Rule *)rule); + } + + void visit(const Blank *rule) { apply_to(rule); } + void visit(const CharacterSet *rule) { apply_to(rule); } + void visit(const Choice *rule) { apply_to(rule); } + void visit(const Metadata *rule) { apply_to(rule); } + void visit(const Pattern *rule) { apply_to(rule); } + void visit(const Repeat *rule) { apply_to(rule); } + void visit(const Seq *rule) { apply_to(rule); } + void visit(const String *rule) { apply_to(rule); } + void visit(const NamedSymbol *rule) { apply_to(rule); } + void visit(const Symbol *rule) { apply_to(rule); } +}; + class IdentityRuleFn : public RuleFn { protected: virtual rule_ptr default_apply(const Rule *rule); From 52daffb3f3e5f9183de9e7eb77f382f3d06e0e35 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 11 Jan 2015 23:21:58 -0800 Subject: [PATCH 2/4] Separate syntax rules into flat lists of symbols This way, every ParseItem can be associated with a particular production for its non-terminal. That lets us keep track of which productions are involved in shift/reduce conflicts. --- project.gyp | 4 +- .../action_takes_precedence_spec.cc | 1 - .../build_tables/build_conflict_spec.cc | 20 +-- .../build_tables/build_parse_table_spec.cc | 22 ++- .../build_tables/first_symbols_spec.cc | 103 ------------- .../build_tables/item_set_closure_spec.cc | 69 ++++++--- .../build_tables/item_set_transitions_spec.cc | 104 +++++++------ .../build_tables/rule_can_be_blank_spec.cc | 22 --- spec/compiler/helpers/containers.h | 9 ++ spec/compiler/helpers/stream_methods.h | 4 +- .../prepare_grammar/expand_repeats_spec.cc | 13 +- .../prepare_grammar/extract_choices_spec.cc | 58 +++++++ .../prepare_grammar/extract_tokens_spec.cc | 5 +- .../prepare_grammar/flatten_grammar_spec.cc | 109 +++++++++++++ .../build_tables/action_takes_precedence.h | 1 - .../build_tables/build_parse_table.cc | 23 +-- src/compiler/build_tables/first_symbols.cc | 68 -------- src/compiler/build_tables/first_symbols.h | 24 --- src/compiler/build_tables/item_set_closure.cc | 42 +++-- .../build_tables/item_set_transitions.cc | 36 +++-- src/compiler/build_tables/parse_item.cc | 21 ++- src/compiler/build_tables/parse_item.h | 14 +- .../build_tables/rule_can_be_blank.cc | 31 ---- src/compiler/build_tables/rule_can_be_blank.h | 5 - .../prepare_grammar/expand_repeats.cc | 6 +- src/compiler/prepare_grammar/expand_repeats.h | 7 +- .../prepare_grammar/extract_choices.cc | 58 +++++++ .../prepare_grammar/extract_choices.h | 15 ++ .../prepare_grammar/extract_tokens.cc | 10 +- src/compiler/prepare_grammar/extract_tokens.h | 5 +- .../prepare_grammar/flatten_grammar.cc | 145 ++++++++++++++++++ .../prepare_grammar/flatten_grammar.h | 13 ++ .../prepare_grammar/initial_syntax_grammar.cc | 37 +++++ .../prepare_grammar/initial_syntax_grammar.h | 36 +++++ .../prepare_grammar/prepare_grammar.cc | 8 +- src/compiler/syntax_grammar.cc | 83 ++++++++-- src/compiler/syntax_grammar.h | 37 +++-- 37 files changed, 842 insertions(+), 426 deletions(-) delete mode 100644 spec/compiler/build_tables/first_symbols_spec.cc create mode 100644 spec/compiler/prepare_grammar/extract_choices_spec.cc create mode 100644 spec/compiler/prepare_grammar/flatten_grammar_spec.cc delete mode 100644 src/compiler/build_tables/first_symbols.cc delete mode 100644 src/compiler/build_tables/first_symbols.h create mode 100644 src/compiler/prepare_grammar/extract_choices.cc create mode 100644 src/compiler/prepare_grammar/extract_choices.h create mode 100644 src/compiler/prepare_grammar/flatten_grammar.cc create mode 100644 src/compiler/prepare_grammar/flatten_grammar.h create mode 100644 src/compiler/prepare_grammar/initial_syntax_grammar.cc create mode 100644 src/compiler/prepare_grammar/initial_syntax_grammar.h diff --git a/project.gyp b/project.gyp index e9737874..19dda780 100644 --- a/project.gyp +++ b/project.gyp @@ -15,7 +15,6 @@ 'src/compiler/build_tables/build_lex_table.cc', 'src/compiler/build_tables/build_parse_table.cc', 'src/compiler/build_tables/build_tables.cc', - 'src/compiler/build_tables/first_symbols.cc', 'src/compiler/build_tables/get_metadata.cc', 'src/compiler/build_tables/item.cc', 'src/compiler/build_tables/item_set_closure.cc', @@ -33,12 +32,15 @@ 'src/compiler/parse_table.cc', 'src/compiler/prepare_grammar/expand_repeats.cc', 'src/compiler/prepare_grammar/expand_tokens.cc', + 'src/compiler/prepare_grammar/extract_choices.cc', 'src/compiler/prepare_grammar/extract_tokens.cc', + 'src/compiler/prepare_grammar/flatten_grammar.cc', 'src/compiler/prepare_grammar/intern_symbols.cc', 'src/compiler/prepare_grammar/is_token.cc', 'src/compiler/prepare_grammar/parse_regex.cc', 'src/compiler/prepare_grammar/prepare_grammar.cc', 'src/compiler/prepare_grammar/token_description.cc', + 'src/compiler/prepare_grammar/initial_syntax_grammar.cc', 'src/compiler/syntax_grammar.cc', 'src/compiler/rules/blank.cc', 'src/compiler/rules/built_in_symbols.cc', diff --git a/spec/compiler/build_tables/action_takes_precedence_spec.cc b/spec/compiler/build_tables/action_takes_precedence_spec.cc index fa4ee904..892e65cd 100644 --- a/spec/compiler/build_tables/action_takes_precedence_spec.cc +++ b/spec/compiler/build_tables/action_takes_precedence_spec.cc @@ -2,7 +2,6 @@ #include "compiler/rules/built_in_symbols.h" #include "compiler/parse_table.h" #include "compiler/build_tables/action_takes_precedence.h" -#include "compiler/syntax_grammar.h" using namespace rules; using namespace build_tables; diff --git a/spec/compiler/build_tables/build_conflict_spec.cc b/spec/compiler/build_tables/build_conflict_spec.cc index 1f30ac12..ad2f66d9 100644 --- a/spec/compiler/build_tables/build_conflict_spec.cc +++ b/spec/compiler/build_tables/build_conflict_spec.cc @@ -12,11 +12,11 @@ describe("build_conflict", []() { Conflict conflict(""); SyntaxGrammar parse_grammar({ - { "in_progress_rule1", i_token(0) }, - { "in_progress_rule2", i_token(0) }, - { "reduced_rule", i_token(0) }, - { "other_ruel1", i_token(0) }, - { "other_rule2", i_token(0) }, + { "in_progress_rule1", {} }, + { "in_progress_rule2", {} }, + { "reduced_rule", {} }, + { "other_ruel1", {} }, + { "other_rule2", {} }, }, {}, { Symbol(2, SymbolOptionToken) }); LexicalGrammar lex_grammar({ @@ -30,15 +30,15 @@ describe("build_conflict", []() { ParseAction::Reduce(Symbol(2), 1, 0), // reduced_rule ParseItemSet({ { - ParseItem(Symbol(0), blank(), 2), // in_progress_rule1 + ParseItem(Symbol(0), 0, 0, 2), // in_progress_rule1 set({ Symbol(2, SymbolOptionToken) }) }, { - ParseItem(Symbol(1), blank(), 2), // in_progress_rule2 + ParseItem(Symbol(1), 0, 0, 2), // in_progress_rule2 set({ Symbol(2, SymbolOptionToken) }) }, { - ParseItem(Symbol(3), blank(), 0), // other_rule1 + ParseItem(Symbol(3), 0, 0, 0), // other_rule1 set({ Symbol(2, SymbolOptionToken) }) }, }), @@ -58,11 +58,11 @@ describe("build_conflict", []() { ParseAction::Shift(2, set()), ParseItemSet({ { - ParseItem(Symbol(0), blank(), 2), // in_progress_rule1 + ParseItem(Symbol(0), 0, 0, 2), // in_progress_rule1 set({ Symbol(2, SymbolOptionToken) }) }, { - ParseItem(Symbol(1), blank(), 2), // in_progress_rule2 + ParseItem(Symbol(1), 0, 0, 2), // in_progress_rule2 set({ Symbol(2, SymbolOptionToken) }) }, }), diff --git a/spec/compiler/build_tables/build_parse_table_spec.cc b/spec/compiler/build_tables/build_parse_table_spec.cc index 4a6874c8..c67a9418 100644 --- a/spec/compiler/build_tables/build_parse_table_spec.cc +++ b/spec/compiler/build_tables/build_parse_table_spec.cc @@ -12,9 +12,25 @@ START_TEST describe("build_parse_table", []() { SyntaxGrammar parse_grammar({ - { "rule0", choice({ i_sym(1), i_sym(2) }) }, - { "rule1", i_token(0) }, - { "rule2", i_token(1) }, + { + "rule0", + { + Production({ {Symbol(1), 0, 1} }, 0), + Production({ {Symbol(2), 0, 2} }, 0) + } + }, + { + "rule1", + { + Production({ {Symbol(0, SymbolOptionToken), 0, 3} }, 0) + } + }, + { + "rule2", + { + Production({ {Symbol(1, SymbolOptionToken), 0, 4} }, 0) + } + }, }, {}, { Symbol(2, SymbolOptionToken) }); LexicalGrammar lex_grammar({ diff --git a/spec/compiler/build_tables/first_symbols_spec.cc b/spec/compiler/build_tables/first_symbols_spec.cc deleted file mode 100644 index 111e4910..00000000 --- a/spec/compiler/build_tables/first_symbols_spec.cc +++ /dev/null @@ -1,103 +0,0 @@ -#include "compiler/compiler_spec_helper.h" -#include "compiler/syntax_grammar.h" -#include "compiler/build_tables/first_symbols.h" -#include "compiler/rules/metadata.h" - -using namespace build_tables; -using namespace rules; - -START_TEST - -describe("first_symbols", []() { - const SyntaxGrammar null_grammar; - - describe("for a sequence AB", [&]() { - it("ignores B when A cannot be blank", [&]() { - auto rule = seq({ i_token(0), i_token(1) }); - - AssertThat(first_symbols(rule, null_grammar), Equals(set({ - Symbol(0, SymbolOptionToken), - }))); - }); - - it("includes first_symbols(B) when A can be blank", [&]() { - auto rule = seq({ - choice({ - i_token(0), - blank() }), - i_token(1) }); - - AssertThat(first_symbols(rule, null_grammar), Equals(set({ - Symbol(0, SymbolOptionToken), - Symbol(1, SymbolOptionToken) - }))); - }); - - it("includes first_symbols(A's right hand side) when A is a non-terminal", [&]() { - auto rule = choice({ - seq({ - i_token(0), - i_token(1) }), - i_sym(0) }); - - SyntaxGrammar grammar({ - { "rule0", seq({ - i_token(2), - i_token(3), - i_token(4) }) } - }, {}); - - AssertThat(first_symbols(rule, grammar), Equals(set({ - Symbol(0), - Symbol(0, SymbolOptionToken), - Symbol(2, SymbolOptionToken), - }))); - }); - - it("includes first_symbols(B) when A is a non-terminal and its expansion can be blank", [&]() { - auto rule = seq({ - i_sym(0), - i_token(1) }); - - SyntaxGrammar grammar({ - { "rule0", choice({ - i_token(0), - blank() }) } - }, {}); - - AssertThat(first_symbols(rule, grammar), Equals(set({ - Symbol(0), - Symbol(0, SymbolOptionToken), - Symbol(1, SymbolOptionToken), - }))); - }); - }); - - describe("when there are left-recursive rules", [&]() { - it("terminates", [&]() { - SyntaxGrammar grammar({ - { "rule0", choice({ - seq({ i_sym(0), i_token(10) }), - i_token(11), - }) }, - }, {}); - - auto rule = i_sym(0); - - AssertThat(first_symbols(rule, grammar), Equals(set({ - Symbol(0), - Symbol(11, SymbolOptionToken) - }))); - }); - }); - - it("ignores metadata rules", [&]() { - auto rule = make_shared(i_token(3), map()); - - AssertThat(first_symbols(rule, null_grammar), Equals(set({ - Symbol(3, SymbolOptionToken), - }))); - }); -}); - -END_TEST diff --git a/spec/compiler/build_tables/item_set_closure_spec.cc b/spec/compiler/build_tables/item_set_closure_spec.cc index 0b81b66a..866e7381 100644 --- a/spec/compiler/build_tables/item_set_closure_spec.cc +++ b/spec/compiler/build_tables/item_set_closure_spec.cc @@ -10,30 +10,61 @@ START_TEST describe("item_set_closure", []() { SyntaxGrammar grammar({ - { "E", seq({ - i_sym(1), - i_token(11) }) }, - { "T", seq({ - i_token(12), - i_token(13) }) }, - }, {}); + { + "rule0", + { + Production({ + {Symbol(1), 0, 100}, + {Symbol(11, SymbolOptionToken), 0, 101} + }, 107), + } + }, + { + "rule1", + { + Production({ + {Symbol(12, SymbolOptionToken), 0, 102}, + {Symbol(13, SymbolOptionToken), 0, 103} + }, 108), + Production({ + {Symbol(2), 0, 104}, + }, 109) + } + }, + { + "rule2", + { + Production({ + {Symbol(14, SymbolOptionToken), 0, 105}, + {Symbol(15, SymbolOptionToken), 0, 106} + }, 110) + } + }, + }, {}, set()); it("adds items at the beginnings of referenced rules", [&]() { ParseItemSet item_set = item_set_closure( - ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0), - set({ Symbol(10, SymbolOptionToken) }), - grammar - ); + ParseItem(Symbol(0), 0, 100, 0), + set({ Symbol(10, SymbolOptionToken) }), + grammar); AssertThat(item_set, Equals(ParseItemSet({ - { - ParseItem(Symbol(1), grammar.rule(Symbol(1)), 0), - set({ Symbol(11, SymbolOptionToken) }), - }, - { - ParseItem(Symbol(0), grammar.rule(Symbol(0)), 0), - set({ Symbol(10, SymbolOptionToken) }), - }, + { + ParseItem(Symbol(0), 0, 100, 0), + set({ Symbol(10, SymbolOptionToken) }) + }, + { + ParseItem(Symbol(1), 0, 102, 0), + set({ Symbol(11, SymbolOptionToken) }) + }, + { + ParseItem(Symbol(1), 1, 104, 0), + set({ Symbol(11, SymbolOptionToken) }) + }, + { + ParseItem(Symbol(2), 0, 105, 0), + set({ Symbol(11, SymbolOptionToken) }) + }, }))); }); }); diff --git a/spec/compiler/build_tables/item_set_transitions_spec.cc b/spec/compiler/build_tables/item_set_transitions_spec.cc index 977054e4..05df4c44 100644 --- a/spec/compiler/build_tables/item_set_transitions_spec.cc +++ b/spec/compiler/build_tables/item_set_transitions_spec.cc @@ -12,63 +12,79 @@ describe("char_transitions(LexItemSet)", []() { describe("when two items in the set have transitions on the same character", [&]() { it("merges the transitions by computing the union of the two item sets", [&]() { LexItemSet set1({ - LexItem(Symbol(1), CharacterSet().include('a', 'f').copy()), - LexItem(Symbol(2), CharacterSet().include('e', 'x').copy()) + LexItem(Symbol(1), CharacterSet().include('a', 'f').copy()), + LexItem(Symbol(2), CharacterSet().include('e', 'x').copy()) }); AssertThat(char_transitions(set1), Equals(map({ - { - CharacterSet().include('a', 'd'), - LexItemSet({ - LexItem(Symbol(1), blank()), - }) - }, - { - CharacterSet().include('e', 'f'), - LexItemSet({ - LexItem(Symbol(1), blank()), - LexItem(Symbol(2), blank()), - }) - }, - { - CharacterSet().include('g', 'x'), - LexItemSet({ - LexItem(Symbol(2), blank()), - }) - }, + { + CharacterSet().include('a', 'd'), + LexItemSet({ + LexItem(Symbol(1), blank()), + }) + }, + { + CharacterSet().include('e', 'f'), + LexItemSet({ + LexItem(Symbol(1), blank()), + LexItem(Symbol(2), blank()), + }) + }, + { + CharacterSet().include('g', 'x'), + LexItemSet({ + LexItem(Symbol(2), blank()), + }) + }, }))); }); }); }); -describe("sym_transitions(ParseItemSet, SyntaxGrammar)", [&]() { - SyntaxGrammar grammar({ - { "A", blank() }, - { "B", i_token(21) }, - }, {}, set()); - +describe("sym_transitions(ParseItemSet, InitialSyntaxGrammar)", [&]() { it("computes the closure of the new item sets", [&]() { - ParseItemSet set1({ - { - ParseItem(Symbol(0), seq({ i_token(22), i_sym(1) }), 3), - set({ Symbol(23, SymbolOptionToken) }) + SyntaxGrammar grammar({ + { + "A", { + Production({ + {Symbol(11, SymbolOptionToken), 0, 101}, + {Symbol(12, SymbolOptionToken), 0, 102}, + {Symbol(13, SymbolOptionToken), 0, 103}, + {Symbol(1), 0, 104}, + {Symbol(14, SymbolOptionToken), 0, 105}, + }, 1) }, + }, + { + "B", { + Production({ + {Symbol(15, SymbolOptionToken), 0, 106}, + }, 2) + }, + } + }, {}, set()); + + ParseItemSet set1({ + { + ParseItem(Symbol(0), 0, 103, 2), + set({ Symbol(16, SymbolOptionToken) }) + } }); AssertThat(sym_transitions(set1, grammar), Equals(map({ - { - Symbol(22, SymbolOptionToken), - ParseItemSet({ - { - ParseItem(Symbol(0), i_sym(1), 4), - set({ Symbol(23, SymbolOptionToken) }), - }, - { - ParseItem(Symbol(1), i_token(21), 0), - set({ Symbol(23, SymbolOptionToken) }) - }, - }) - }, + { + Symbol(13, SymbolOptionToken), + ParseItemSet({ + { + ParseItem(Symbol(0), 0, 104, 3), + set({ Symbol(16, SymbolOptionToken) }) + }, + { + ParseItem(Symbol(1), 0, 106, 0), + set({ Symbol(14, SymbolOptionToken) }) + }, + }) + }, }))); }); }); diff --git a/spec/compiler/build_tables/rule_can_be_blank_spec.cc b/spec/compiler/build_tables/rule_can_be_blank_spec.cc index 4c0d03fd..313ca5f9 100644 --- a/spec/compiler/build_tables/rule_can_be_blank_spec.cc +++ b/spec/compiler/build_tables/rule_can_be_blank_spec.cc @@ -1,7 +1,6 @@ #include "compiler/compiler_spec_helper.h" #include "compiler/build_tables/rule_can_be_blank.h" #include "compiler/rules/metadata.h" -#include "compiler/syntax_grammar.h" using namespace rules; using build_tables::rule_can_be_blank; @@ -54,27 +53,6 @@ describe("rule_can_be_blank", [&]() { rule = make_shared(sym("one"), map()); AssertThat(rule_can_be_blank(rule), IsFalse()); }); - - describe("checking recursively (by expanding non-terminals)", [&]() { - SyntaxGrammar grammar({ - { "A", choice({ - seq({ i_sym(0), i_token(11) }), - blank() }) }, - { "B", choice({ - seq({ i_sym(1), i_token(12) }), - i_token(13) }) }, - }, {}, set()); - - it("terminates for left-recursive rules that can be blank", [&]() { - rule = i_sym(0); - AssertThat(rule_can_be_blank(rule, grammar), IsTrue()); - }); - - it("terminates for left-recursive rules that can't be blank", [&]() { - rule = i_sym(1); - AssertThat(rule_can_be_blank(rule, grammar), IsFalse()); - }); - }); }); END_TEST diff --git a/spec/compiler/helpers/containers.h b/spec/compiler/helpers/containers.h index 6c1ecb37..0d80c150 100644 --- a/spec/compiler/helpers/containers.h +++ b/spec/compiler/helpers/containers.h @@ -15,6 +15,15 @@ using std::initializer_list; using std::pair; using tree_sitter::rules::rule_ptr; +template +std::vector::type> +collect(const std::vector &v, Func f) { + vector::type> result; + for (const T &item : v) + result.push_back(f(item)); + return result; +} + template class rule_map : public map { public: diff --git a/spec/compiler/helpers/stream_methods.h b/spec/compiler/helpers/stream_methods.h index 5d3fc663..e8093973 100644 --- a/spec/compiler/helpers/stream_methods.h +++ b/spec/compiler/helpers/stream_methods.h @@ -36,8 +36,8 @@ inline std::ostream& operator<<(std::ostream &stream, const std::set &set) { return stream << ")"; } -template -inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set &set) { +template +inline std::ostream& operator<<(std::ostream &stream, const std::unordered_set &set) { stream << std::string("(set: "); bool started = false; for (auto item : set) { diff --git a/spec/compiler/prepare_grammar/expand_repeats_spec.cc b/spec/compiler/prepare_grammar/expand_repeats_spec.cc index 62f12b8b..069529a8 100644 --- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc +++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc @@ -1,16 +1,17 @@ #include "compiler/compiler_spec_helper.h" -#include "compiler/syntax_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/prepare_grammar/expand_repeats.h" #include "compiler/helpers/containers.h" START_TEST using namespace rules; +using prepare_grammar::InitialSyntaxGrammar; using prepare_grammar::expand_repeats; describe("expand_repeats", []() { it("replaces repeat rules with pairs of recursive rules", [&]() { - SyntaxGrammar grammar({ + InitialSyntaxGrammar grammar({ { "rule0", repeat(i_token(0)) }, }, {}, set()); @@ -28,7 +29,7 @@ describe("expand_repeats", []() { }); it("replaces repeats inside of sequences", [&]() { - SyntaxGrammar grammar({ + InitialSyntaxGrammar grammar({ { "rule0", seq({ i_token(10), repeat(i_token(11)) }) }, @@ -50,7 +51,7 @@ describe("expand_repeats", []() { }); it("replaces repeats inside of choices", [&]() { - SyntaxGrammar grammar({ + InitialSyntaxGrammar grammar({ { "rule0", choice({ i_token(10), repeat(i_token(11)) }) }, }, {}, set()); @@ -68,7 +69,7 @@ describe("expand_repeats", []() { }); it("can replace multiple repeats in the same rule", [&]() { - SyntaxGrammar grammar({ + InitialSyntaxGrammar grammar({ { "rule0", seq({ repeat(i_token(10)), repeat(i_token(11)) }) }, @@ -93,7 +94,7 @@ describe("expand_repeats", []() { }); it("can replace repeats in multiple rules", [&]() { - SyntaxGrammar grammar({ + InitialSyntaxGrammar grammar({ { "rule0", repeat(i_token(10)) }, { "rule1", repeat(i_token(11)) }, }, {}, set()); diff --git a/spec/compiler/prepare_grammar/extract_choices_spec.cc b/spec/compiler/prepare_grammar/extract_choices_spec.cc new file mode 100644 index 00000000..bb61c2eb --- /dev/null +++ b/spec/compiler/prepare_grammar/extract_choices_spec.cc @@ -0,0 +1,58 @@ +#include "compiler/compiler_spec_helper.h" +#include "compiler/prepare_grammar/extract_choices.h" +#include "compiler/helpers/containers.h" + +START_TEST + +using namespace rules; +using prepare_grammar::extract_choices; + +describe("extract_choices", []() { + it("expands rules containing choices into multiple rules", [&]() { + auto rule = seq({ + sym("a"), + choice({ sym("b"), sym("c"), sym("d") }), + sym("e") + }); + + AssertThat(extract_choices(rule), Equals(rule_vector({ + seq({ sym("a"), sym("b"), sym("e") }), + seq({ sym("a"), sym("c"), sym("e") }), + seq({ sym("a"), sym("d"), sym("e") }), + }))); + }); + + it("handles metadata rules", [&]() { + auto rule = prec(5, choice({ sym("b"), sym("c"), sym("d") })); + + AssertThat(extract_choices(rule), Equals(rule_vector({ + prec(5, sym("b")), + prec(5, sym("c")), + prec(5, sym("d")), + }))); + }); + + it("handles nested choices", [&]() { + auto rule = choice({ + seq({ choice({ sym("a"), sym("b") }), sym("c") }), + sym("d") + }); + + AssertThat(extract_choices(rule), Equals(rule_vector({ + seq({ sym("a"), sym("c") }), + seq({ sym("b"), sym("c") }), + sym("d"), + }))); + }); + + it("handles repeats", [&]() { + auto rule = repeat(choice({ sym("a"), sym("b") })); + + AssertThat(extract_choices(rule), Equals(rule_vector({ + repeat(sym("a")), + repeat(sym("b")), + }))); + }); +}); + +END_TEST diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 40d614d4..eab63109 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -1,6 +1,6 @@ #include "compiler/compiler_spec_helper.h" #include "compiler/lexical_grammar.h" -#include "compiler/syntax_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/prepare_grammar/extract_tokens.h" #include "compiler/helpers/containers.h" @@ -8,10 +8,11 @@ START_TEST using namespace rules; using prepare_grammar::extract_tokens; +using prepare_grammar::InitialSyntaxGrammar; describe("extract_tokens", []() { it("moves string rules into the lexical grammar", [&]() { - tuple result = + tuple result = extract_tokens(Grammar({ { "rule_A", seq({ str("ab"), i_sym(0) }) } })); diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc new file mode 100644 index 00000000..99872010 --- /dev/null +++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc @@ -0,0 +1,109 @@ +#include "compiler/compiler_spec_helper.h" +#include "compiler/prepare_grammar/flatten_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" +#include "compiler/syntax_grammar.h" +#include "compiler/helpers/containers.h" + +START_TEST + +using namespace rules; +using prepare_grammar::flatten_grammar; +using prepare_grammar::InitialSyntaxGrammar; + +describe("flatten_grammar", []() { + InitialSyntaxGrammar input_grammar({ + { "rule1", seq({ + i_sym(1), + choice({ i_sym(2), i_sym(3) }), + i_sym(4) }) }, + { "rule2", seq({ + i_sym(1), + prec(50, seq({ + i_sym(2), + choice({ + prec(100, seq({ + i_sym(3), + i_sym(4) + })), + i_sym(5), + }), + i_sym(6) })), + i_sym(7) }) }, + }, {}); + + it("turns each rule into a list of possible symbol sequences", [&]() { + SyntaxGrammar grammar = flatten_grammar(input_grammar); + + auto get_symbol_lists = [&](int rule_index) { + return collect(grammar.rules[rule_index].second, [](Production p) { + return collect(p.entries, [](ProductionEntry e) { + return e.symbol; + }); + }); + }; + + AssertThat(grammar.rules[0].first, Equals("rule1")); + AssertThat(grammar.rules[1].first, Equals("rule2")); + + AssertThat( + get_symbol_lists(0), + Equals(vector>({ + { Symbol(1), Symbol(2), Symbol(4) }, + { Symbol(1), Symbol(3), Symbol(4) } + }))); + + AssertThat( + get_symbol_lists(1), + Equals(vector>({ + { Symbol(1), Symbol(2), Symbol(3), Symbol(4), Symbol(6), Symbol(7) }, + { Symbol(1), Symbol(2), Symbol(5), Symbol(6), Symbol(7) } + }))); + }); + + it("associates each symbol with the precedence binding it to its previous neighbor", [&]() { + SyntaxGrammar grammar = flatten_grammar(input_grammar); + + auto get_precedence_lists = [&](int rule_index) { + return collect(grammar.rules[rule_index].second, [](Production p) { + return collect(p.entries, [](ProductionEntry e) { + return e.precedence; + }); + }); + }; + + AssertThat( + get_precedence_lists(0), + Equals(vector>({ + { 0, 0, 0 }, + { 0, 0, 0 } + }))); + + AssertThat( + get_precedence_lists(1), + Equals(vector>({ + { 0, 0, 50, 100, 50, 0 }, + { 0, 0, 50, 50, 0 } + }))); + }); + + it("associates each unique subsequence of symbols and precedences with a rule_id", [&]() { + SyntaxGrammar grammar = flatten_grammar(input_grammar); + + auto rule_id = [&](int rule_index, int production_index, int symbol_index) { + return grammar.rules[rule_index].second[production_index].rule_id_at(symbol_index); + }; + + // Rule 1: last symbol is the same for both productions. + AssertThat(rule_id(0, 0, 0), !Equals(rule_id(0, 1, 0))); + AssertThat(rule_id(0, 0, 1), !Equals(rule_id(0, 1, 1))); + AssertThat(rule_id(0, 0, 2), Equals(rule_id(0, 1, 2))); + + // Rule 2: last two symbols are the same for both productions. + AssertThat(rule_id(1, 0, 0), !Equals(rule_id(1, 1, 0))); + AssertThat(rule_id(1, 0, 1), !Equals(rule_id(1, 1, 1))); + AssertThat(rule_id(1, 0, 4), Equals(rule_id(1, 1, 3))); + AssertThat(rule_id(1, 0, 5), Equals(rule_id(1, 1, 4))); + }); +}); + +END_TEST diff --git a/src/compiler/build_tables/action_takes_precedence.h b/src/compiler/build_tables/action_takes_precedence.h index 79603611..34c188d7 100644 --- a/src/compiler/build_tables/action_takes_precedence.h +++ b/src/compiler/build_tables/action_takes_precedence.h @@ -5,7 +5,6 @@ #include "tree_sitter/compiler.h" #include "compiler/parse_table.h" #include "compiler/rules/symbol.h" -#include "compiler/syntax_grammar.h" namespace tree_sitter { namespace build_tables { diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 9ba6e3f8..0c6a0282 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -40,12 +40,8 @@ class ParseTableBuilder { : grammar(grammar), lex_grammar(lex_grammar) {} pair> build() { - auto start_symbol = grammar.rules.empty() - ? make_shared(0, rules::SymbolOptionToken) - : make_shared(0); - ParseItem start_item(rules::START(), start_symbol, 0); - add_parse_state( - item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar)); + ParseItem start_item(rules::START(), 0, -2, 0); + add_parse_state(item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar)); while (!item_sets_to_process.empty()) { auto pair = item_sets_to_process.back(); @@ -100,12 +96,13 @@ class ParseTableBuilder { const ParseItem &item = pair.first; const set &lookahead_symbols = pair.second; - if (item.is_done()) { + + if (item_is_done(item)) { ParseAction action = (item.lhs == rules::START()) ? ParseAction::Accept() : ParseAction::Reduce(item.lhs, item.consumed_symbol_count, - item.precedence()); + item_precedence(item)); for (const auto &lookahead_sym : lookahead_symbols) if (should_add_action(state_id, lookahead_sym, action, ParseItemSet())) @@ -170,11 +167,19 @@ class ParseTableBuilder { for (const auto &pair : item_set) { const ParseItem &item = pair.first; if (item.consumed_symbol_count > 0) - result.insert(item.precedence()); + result.insert(item_precedence(item)); } return result; } + bool item_is_done(const ParseItem &item) { + return item.consumed_symbol_count == grammar.productions(item.lhs)[item.production_index].size(); + } + + int item_precedence(const ParseItem &item) { + return grammar.productions(item.lhs)[item.production_index].precedence_at(item.consumed_symbol_count - 1); + } + void record_conflict(const Symbol &sym, const ParseAction &left, const ParseAction &right, const ParseItemSet &item_set) { conflicts.insert( diff --git a/src/compiler/build_tables/first_symbols.cc b/src/compiler/build_tables/first_symbols.cc deleted file mode 100644 index f90d7886..00000000 --- a/src/compiler/build_tables/first_symbols.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include "compiler/build_tables/first_symbols.h" -#include "compiler/build_tables/rule_can_be_blank.h" -#include "compiler/syntax_grammar.h" -#include "compiler/rules/choice.h" -#include "compiler/rules/metadata.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/symbol.h" -#include "compiler/rules/visitor.h" -#include "tree_sitter/compiler.h" - -namespace tree_sitter { -namespace build_tables { - -using std::set; -using rules::Symbol; - -class FirstSymbols : public rules::RuleFn> { - const SyntaxGrammar *grammar; - set visited_symbols; - - public: - explicit FirstSymbols(const SyntaxGrammar *grammar) : grammar(grammar) {} - - private: - set apply_to(const Symbol *rule) { - auto insertion_result = visited_symbols.insert(*rule); - if (!insertion_result.second) - return set(); - - set result({ *rule }); - if (!rule->is_token()) { - set &&symbols = apply(grammar->rule(*rule)); - result.insert(symbols.begin(), symbols.end()); - } - - return result; - } - - set apply_to(const rules::Metadata *rule) { - return apply(rule->rule); - } - - set apply_to(const rules::Choice *rule) { - set result; - for (const auto &element : rule->elements) { - auto &&element_symbols = apply(element); - result.insert(element_symbols.begin(), element_symbols.end()); - } - return result; - } - - set apply_to(const rules::Seq *rule) { - auto &&result = apply(rule->left); - if (rule_can_be_blank(rule->left, *grammar)) { - auto &&right_symbols = apply(rule->right); - result.insert(right_symbols.begin(), right_symbols.end()); - } - return result; - } -}; - -set first_symbols(const rules::rule_ptr &rule, - const SyntaxGrammar &grammar) { - return FirstSymbols(&grammar).apply(rule); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/first_symbols.h b/src/compiler/build_tables/first_symbols.h deleted file mode 100644 index 09c32be5..00000000 --- a/src/compiler/build_tables/first_symbols.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_ -#define COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_ - -#include -#include "compiler/rules/symbol.h" -#include "tree_sitter/compiler.h" - -namespace tree_sitter { - -class SyntaxGrammar; - -namespace build_tables { - -/* - * Returns the set of symbols that can appear at the beginning of a sentential - * form derivable from a given rule in a given grammar. - */ -std::set first_symbols(const rules::rule_ptr &rule, - const SyntaxGrammar &grammar); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_FIRST_SYMBOLS_H_ diff --git a/src/compiler/build_tables/item_set_closure.cc b/src/compiler/build_tables/item_set_closure.cc index b84104b3..c4d981da 100644 --- a/src/compiler/build_tables/item_set_closure.cc +++ b/src/compiler/build_tables/item_set_closure.cc @@ -3,7 +3,6 @@ #include #include #include "tree_sitter/compiler.h" -#include "compiler/build_tables/first_symbols.h" #include "compiler/build_tables/rule_transitions.h" #include "compiler/build_tables/rule_can_be_blank.h" #include "compiler/build_tables/item.h" @@ -34,24 +33,41 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item, size_t previous_size = lookahead_symbols.size(); lookahead_symbols.insert(new_lookahead_symbols.begin(), new_lookahead_symbols.end()); - if (lookahead_symbols.size() == previous_size) continue; - for (const auto &pair : sym_transitions(item.rule)) { - const Symbol &symbol = pair.first; - const rule_ptr &next_rule = pair.second; + const Production &item_production = grammar.productions(item.lhs)[item.production_index]; + if (item_production.size() <= item.consumed_symbol_count) + continue; - if (symbol.is_token() || symbol.is_built_in()) - continue; + Symbol symbol = item_production.symbol_at(item.consumed_symbol_count); + if (symbol.is_token() || symbol.is_built_in()) + continue; - set next_lookahead_symbols = first_symbols(next_rule, grammar); - if (rule_can_be_blank(next_rule, grammar)) - next_lookahead_symbols.insert(lookahead_symbols.begin(), - lookahead_symbols.end()); + set next_lookahead_symbols; + if (item.consumed_symbol_count + 1 >= item_production.size()) { + next_lookahead_symbols = lookahead_symbols; + } else { + vector symbols_to_process({ item_production.symbol_at(item.consumed_symbol_count + 1) }); - items_to_process.push_back({ ParseItem(symbol, grammar.rule(symbol), 0), - next_lookahead_symbols }); + while (!symbols_to_process.empty()) { + Symbol following_symbol = symbols_to_process.back(); + symbols_to_process.pop_back(); + if (!next_lookahead_symbols.insert(following_symbol).second) + continue; + + for (const auto &production : grammar.productions(following_symbol)) + symbols_to_process.push_back(production.symbol_at(0)); + } + } + + size_t i = 0; + for (const Production &production : grammar.productions(symbol)) { + items_to_process.push_back({ + ParseItem(symbol, i, production.rule_id_at(0), 0), + next_lookahead_symbols + }); + i++; } } diff --git a/src/compiler/build_tables/item_set_transitions.cc b/src/compiler/build_tables/item_set_transitions.cc index 93dcba18..abacd5cd 100644 --- a/src/compiler/build_tables/item_set_transitions.cc +++ b/src/compiler/build_tables/item_set_transitions.cc @@ -21,18 +21,30 @@ map sym_transitions(const ParseItemSet &item_set, for (const auto &pair : item_set) { const ParseItem &item = pair.first; const set &lookahead_symbols = pair.second; - for (auto &transition : sym_transitions(item.rule)) { - ParseItem new_item(item.lhs, transition.second, - item.consumed_symbol_count + 1); - merge_sym_transition( - &result, { transition.first, - item_set_closure(new_item, lookahead_symbols, grammar) }, - [](ParseItemSet *left, const ParseItemSet *right) { - for (auto &pair : *right) - left->operator[](pair.first) - .insert(pair.second.begin(), pair.second.end()); - }); - } + const auto &productions = grammar.productions(item.lhs); + if (productions.empty()) + continue; + + const Production &production = grammar.productions(item.lhs)[item.production_index]; + if (production.size() <= item.consumed_symbol_count) + continue; + + const Symbol &symbol = production.symbol_at(item.consumed_symbol_count); + ParseItem new_item( + item.lhs, + item.production_index, + production.rule_id_at(item.consumed_symbol_count + 1), + item.consumed_symbol_count + 1 + ); + + merge_sym_transition( + &result, + { symbol, item_set_closure(new_item, { lookahead_symbols }, grammar) }, + [](ParseItemSet *left, const ParseItemSet *right) { + for (auto &pair : *right) + left->operator[](pair.first) + .insert(pair.second.begin(), pair.second.end()); + }); } return result; } diff --git a/src/compiler/build_tables/parse_item.cc b/src/compiler/build_tables/parse_item.cc index 840eb1d4..6da35b7d 100644 --- a/src/compiler/build_tables/parse_item.cc +++ b/src/compiler/build_tables/parse_item.cc @@ -1,25 +1,32 @@ #include "compiler/build_tables/parse_item.h" +#include "compiler/syntax_grammar.h" #include "tree_sitter/compiler.h" namespace tree_sitter { namespace build_tables { using std::string; +using std::to_string; using std::ostream; -ParseItem::ParseItem(const rules::Symbol &lhs, const rules::rule_ptr rule, - size_t consumed_symbol_count) - : Item(lhs, rule), consumed_symbol_count(consumed_symbol_count) {} +ParseItem::ParseItem(const rules::Symbol &lhs, size_t production_index, + int rule_id, size_t consumed_symbol_count) + : lhs(lhs), production_index(production_index), + rule_id(rule_id), + consumed_symbol_count(consumed_symbol_count) {} bool ParseItem::operator==(const ParseItem &other) const { return (lhs == other.lhs) && - (consumed_symbol_count == other.consumed_symbol_count) && - (rule == other.rule || rule->operator==(*other.rule)); + (rule_id == other.rule_id) && + (consumed_symbol_count == other.consumed_symbol_count); } ostream &operator<<(ostream &stream, const ParseItem &item) { - return stream << string("(item ") << item.lhs << string(" ") << *item.rule - << string(")"); + return stream << string("(item lhs:") << item.lhs + << string(" index:") << to_string(item.production_index) + << string(" remaining_rule:") << to_string(item.rule_id) + << string(" consumed:") << to_string(item.consumed_symbol_count) + << string(")"); } } // namespace build_tables diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index f3ec4413..ccae88df 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -9,11 +9,15 @@ namespace tree_sitter { namespace build_tables { -class ParseItem : public Item { +class ParseItem { public: - ParseItem(const rules::Symbol &lhs, rules::rule_ptr rule, - const size_t consumed_symbol_count); + ParseItem(const rules::Symbol &lhs, size_t production_index, + int rule_id, size_t consumed_symbol_count); bool operator==(const ParseItem &other) const; + + rules::Symbol lhs; + size_t production_index; + int rule_id; size_t consumed_symbol_count; }; @@ -30,8 +34,8 @@ template <> struct hash { size_t operator()(const tree_sitter::build_tables::ParseItem &item) const { return hash()(item.lhs) ^ - hash()(item.rule) ^ - hash()(item.consumed_symbol_count); + hash()(item.rule_id) ^ + hash()(item.consumed_symbol_count); } }; diff --git a/src/compiler/build_tables/rule_can_be_blank.cc b/src/compiler/build_tables/rule_can_be_blank.cc index 687bbeb6..d3d692f0 100644 --- a/src/compiler/build_tables/rule_can_be_blank.cc +++ b/src/compiler/build_tables/rule_can_be_blank.cc @@ -1,7 +1,5 @@ #include "compiler/build_tables/rule_can_be_blank.h" -#include #include "tree_sitter/compiler.h" -#include "compiler/syntax_grammar.h" #include "compiler/rules/symbol.h" #include "compiler/rules/visitor.h" #include "compiler/rules/seq.h" @@ -12,8 +10,6 @@ namespace tree_sitter { namespace build_tables { -using std::set; - class CanBeBlank : public rules::RuleFn { protected: bool apply_to(const rules::Blank *) { return true; } @@ -34,36 +30,9 @@ class CanBeBlank : public rules::RuleFn { bool apply_to(const rules::Metadata *rule) { return apply(rule->rule); } }; -class CanBeBlankRecursive : public CanBeBlank { - const SyntaxGrammar *grammar; - set visited_symbols; - using CanBeBlank::visit; - - public: - explicit CanBeBlankRecursive(const SyntaxGrammar *grammar) - : grammar(grammar) {} - - private: - using CanBeBlank::apply_to; - - bool apply_to(const rules::Symbol *rule) { - if (visited_symbols.find(*rule) == visited_symbols.end()) { - visited_symbols.insert(*rule); - return !rule->is_token() && apply(grammar->rule(*rule)); - } else { - return false; - } - } -}; - bool rule_can_be_blank(const rules::rule_ptr &rule) { return CanBeBlank().apply(rule); } -bool rule_can_be_blank(const rules::rule_ptr &rule, - const SyntaxGrammar &grammar) { - return CanBeBlankRecursive(&grammar).apply(rule); -} - } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/rule_can_be_blank.h b/src/compiler/build_tables/rule_can_be_blank.h index f7d0d7b3..39c046ab 100644 --- a/src/compiler/build_tables/rule_can_be_blank.h +++ b/src/compiler/build_tables/rule_can_be_blank.h @@ -4,14 +4,9 @@ #include "tree_sitter/compiler.h" namespace tree_sitter { - -class SyntaxGrammar; - namespace build_tables { bool rule_can_be_blank(const rules::rule_ptr &rule); -bool rule_can_be_blank(const rules::rule_ptr &rule, - const SyntaxGrammar &grammar); } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index 4273d153..1972f8de 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -2,7 +2,7 @@ #include #include #include -#include "compiler/syntax_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/rules/visitor.h" #include "compiler/rules/seq.h" #include "compiler/rules/symbol.h" @@ -50,7 +50,7 @@ class ExpandRepeats : public rules::IdentityRuleFn { vector> aux_rules; }; -SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) { +InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) { vector> rules, aux_rules(grammar.aux_rules); for (auto &pair : grammar.rules) { @@ -60,7 +60,7 @@ SyntaxGrammar expand_repeats(const SyntaxGrammar &grammar) { expander.aux_rules.end()); } - return SyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens); + return InitialSyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens); } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/expand_repeats.h b/src/compiler/prepare_grammar/expand_repeats.h index 19c687f2..90d96952 100644 --- a/src/compiler/prepare_grammar/expand_repeats.h +++ b/src/compiler/prepare_grammar/expand_repeats.h @@ -4,12 +4,11 @@ #include "tree_sitter/compiler.h" namespace tree_sitter { - -class SyntaxGrammar; - namespace prepare_grammar { -SyntaxGrammar expand_repeats(const SyntaxGrammar &); +class InitialSyntaxGrammar; + +InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &); } // namespace prepare_grammar } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_choices.cc b/src/compiler/prepare_grammar/extract_choices.cc new file mode 100644 index 00000000..4413fe16 --- /dev/null +++ b/src/compiler/prepare_grammar/extract_choices.cc @@ -0,0 +1,58 @@ +#include "compiler/prepare_grammar/extract_choices.h" +#include +#include +#include "compiler/rules/visitor.h" +#include "compiler/rules/seq.h" +#include "compiler/rules/choice.h" +#include "compiler/rules/metadata.h" +#include "compiler/rules/repeat.h" + +namespace tree_sitter { +namespace prepare_grammar { + +using std::make_shared; +using std::vector; +using rules::rule_ptr; + +class ExtractChoices : public rules::RuleFn> { + vector default_apply(const rules::Rule *rule) { + return vector({ rule->copy() }); + } + + vector apply_to(const rules::Seq *rule) { + vector result; + for (auto left_entry : apply(rule->left)) + for (auto right_entry : apply(rule->right)) + result.push_back(rules::Seq::build({ left_entry, right_entry })); + return result; + } + + vector apply_to(const rules::Metadata *rule) { + vector result; + for (auto entry : apply(rule->rule)) + result.push_back(make_shared(entry, rule->value)); + return result; + } + + vector apply_to(const rules::Choice *rule) { + vector result; + for (auto element : rule->elements) + for (auto entry : apply(element)) + result.push_back(entry); + return result; + } + + vector apply_to(const rules::Repeat *rule) { + vector result; + for (auto element : apply(rule->content)) + result.push_back(make_shared(element)); + return result; + } +}; + +std::vector extract_choices(const rules::rule_ptr &rule) { + return ExtractChoices().apply(rule); +} + +} // namespace prepare_grammar +} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/extract_choices.h b/src/compiler/prepare_grammar/extract_choices.h new file mode 100644 index 00000000..c833124d --- /dev/null +++ b/src/compiler/prepare_grammar/extract_choices.h @@ -0,0 +1,15 @@ +#ifndef COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ +#define COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ + +#include +#include "tree_sitter/compiler.h" + +namespace tree_sitter { +namespace prepare_grammar { + +std::vector extract_choices(const rules::rule_ptr &); + +} // namespace prepare_grammar +} // namespace tree_sitter + +#endif // COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_ diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 4dbf854b..d7785bac 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -5,7 +5,7 @@ #include #include "tree_sitter/compiler.h" #include "compiler/lexical_grammar.h" -#include "compiler/syntax_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/rules/visitor.h" #include "compiler/rules/symbol.h" #include "compiler/rules/string.h" @@ -92,14 +92,14 @@ class TokenExtractor : public rules::IdentityRuleFn { vector> tokens; }; -static tuple ubiq_token_err( +static tuple ubiq_token_err( const string &msg) { - return make_tuple(SyntaxGrammar(), LexicalGrammar(), + return make_tuple(InitialSyntaxGrammar(), LexicalGrammar(), new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, "Not a token: " + msg)); } -tuple extract_tokens( +tuple extract_tokens( const Grammar &grammar) { vector> rules, tokens; vector separators; @@ -139,7 +139,7 @@ tuple extract_tokens( } } - return make_tuple(SyntaxGrammar(rules, {}, ubiquitous_tokens), + return make_tuple(InitialSyntaxGrammar(rules, {}, ubiquitous_tokens), LexicalGrammar(tokens, extractor.tokens, separators), nullptr); } diff --git a/src/compiler/prepare_grammar/extract_tokens.h b/src/compiler/prepare_grammar/extract_tokens.h index 1f3b3413..aeb63b15 100644 --- a/src/compiler/prepare_grammar/extract_tokens.h +++ b/src/compiler/prepare_grammar/extract_tokens.h @@ -7,12 +7,13 @@ namespace tree_sitter { class Grammar; -class SyntaxGrammar; class LexicalGrammar; namespace prepare_grammar { -std::tuple extract_tokens( +class InitialSyntaxGrammar; + +std::tuple extract_tokens( const Grammar &); } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc new file mode 100644 index 00000000..cc368d25 --- /dev/null +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -0,0 +1,145 @@ +#include "compiler/prepare_grammar/flatten_grammar.h" +#include "compiler/prepare_grammar/extract_choices.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" +#include "compiler/rules/visitor.h" +#include "compiler/rules/seq.h" +#include "compiler/rules/symbol.h" +#include "compiler/rules/metadata.h" +#include +#include + +namespace tree_sitter { +namespace prepare_grammar { + +using std::find; +using std::pair; +using std::string; +using std::vector; +using rules::rule_ptr; + +class FlattenRule : public rules::RuleFn { + public: + bool has_pending_precedence; + int pending_precedence; + vector precedence_stack; + vector entries; + + FlattenRule() : has_pending_precedence(false), pending_precedence(0) {} + + void apply_to(const rules::Symbol *sym) { + entries.push_back({ *sym, current_precedence(), 0 }); + if (has_pending_precedence) { + precedence_stack.push_back(pending_precedence); + has_pending_precedence = false; + } + } + + void apply_to(const rules::Metadata *metadata) { + int precedence = metadata->value_for(rules::PRECEDENCE); + if (precedence != 0) { + pending_precedence = precedence; + has_pending_precedence = true; + apply(metadata->rule); + precedence_stack.pop_back(); + } else { + apply(metadata->rule); + } + } + + void apply_to(const rules::Seq *seq) { + apply(seq->left); + apply(seq->right); + } + + private: + int current_precedence() { + if (precedence_stack.empty()) + return 0; + else + return *precedence_stack.rbegin(); + } +}; + +Production flatten_rule(const rule_ptr &rule) { + FlattenRule flattener; + flattener.apply(rule); + return Production(flattener.entries, 0); +} + +struct ProductionSlice { + vector::const_iterator start; + vector::const_iterator end; + int end_precedence; + + bool operator==(const ProductionSlice &other) const { + if (end_precedence != other.end_precedence) return false; + if (end - start != other.end - other.start) return false; + for (auto iter1 = start, iter2 = other.start; iter1 != end; ++iter1, ++iter2) + if (!(iter1->symbol == iter2->symbol) || iter1->precedence != iter2->precedence) + return false; + return true; + } +}; + +void assign_rule_ids(Production *production, vector *unique_slices) { + auto &entries = production->entries; + auto end = entries.end(); + + for (auto iter = entries.begin(); iter != end; ++iter) { + ProductionSlice slice{iter, end, 0}; + auto existing_id = find(unique_slices->cbegin(), unique_slices->cend(), slice); + if (existing_id == unique_slices->end()) { + unique_slices->push_back(slice); + iter->rule_id = unique_slices->size() - 1; + } else { + iter->rule_id = existing_id - unique_slices->cbegin(); + } + } + + ProductionSlice slice{end, end, production->precedence_at(production->size() - 1)}; + auto existing_id = find(unique_slices->cbegin(), unique_slices->cend(), slice); + if (existing_id == unique_slices->end()) { + unique_slices->push_back(slice); + production->end_rule_id = unique_slices->size() - 1; + } else { + production->end_rule_id = existing_id - unique_slices->cbegin(); + } +} + +SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) { + vector>> rules, aux_rules; + + for (const auto &pair : grammar.rules) { + vector productions; + for (const auto &rule_component : extract_choices(pair.second)) + productions.push_back(flatten_rule(rule_component)); + rules.push_back({ pair.first, productions }); + } + + for (const auto &pair : grammar.aux_rules) { + vector productions; + for (const auto &rule_component : extract_choices(pair.second)) + productions.push_back(flatten_rule(rule_component)); + aux_rules.push_back({ pair.first, productions }); + } + + if (rules.empty()) { + rules.push_back({ + "START", + { Production({ {rules::Symbol(0, rules::SymbolOptionToken), 0, 0} }, 0) } + }); + } + + vector unique_slices; + for (auto &pair : rules) + for (Production &production : pair.second) + assign_rule_ids(&production, &unique_slices); + for (auto &pair : aux_rules) + for (Production &production : pair.second) + assign_rule_ids(&production, &unique_slices); + + return SyntaxGrammar(rules, aux_rules, grammar.ubiquitous_tokens); +} + +} // namespace prepare_grammar +} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/flatten_grammar.h b/src/compiler/prepare_grammar/flatten_grammar.h new file mode 100644 index 00000000..aaee9be0 --- /dev/null +++ b/src/compiler/prepare_grammar/flatten_grammar.h @@ -0,0 +1,13 @@ +#include +#include "tree_sitter/compiler.h" +#include "compiler/syntax_grammar.h" + +namespace tree_sitter { +namespace prepare_grammar { + +class InitialSyntaxGrammar; + +SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &); + +} // namespace prepare_grammar +} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.cc b/src/compiler/prepare_grammar/initial_syntax_grammar.cc new file mode 100644 index 00000000..9303d03c --- /dev/null +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.cc @@ -0,0 +1,37 @@ +#include "compiler/prepare_grammar/initial_syntax_grammar.h" +#include +#include +#include +#include "compiler/rules/symbol.h" + +namespace tree_sitter { +namespace prepare_grammar { + +using std::string; +using std::pair; +using std::vector; +using std::set; + +InitialSyntaxGrammar::InitialSyntaxGrammar() {} + +InitialSyntaxGrammar::InitialSyntaxGrammar(const vector> &rules, + const vector> &aux_rules) + : rules(rules), aux_rules(aux_rules) {} + +InitialSyntaxGrammar::InitialSyntaxGrammar(const vector> &rules, + const vector> &aux_rules, + const set &ubiquitous_tokens) + : rules(rules), aux_rules(aux_rules), ubiquitous_tokens(ubiquitous_tokens) {} + +const rules::rule_ptr &InitialSyntaxGrammar::rule(const rules::Symbol &symbol) const { + return symbol.is_auxiliary() ? aux_rules[symbol.index].second + : rules[symbol.index].second; +} + +const string &InitialSyntaxGrammar::rule_name(const rules::Symbol &symbol) const { + return symbol.is_auxiliary() ? aux_rules[symbol.index].first + : rules[symbol.index].first; +} + +} // namespace prepare_grammar +} // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h new file mode 100644 index 00000000..485883dc --- /dev/null +++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h @@ -0,0 +1,36 @@ +#ifndef COMPILER_INITIAL_SYNTAX_GRAMMAR_H_ +#define COMPILER_INITIAL_SYNTAX_GRAMMAR_H_ + +#include +#include +#include +#include +#include "tree_sitter/compiler.h" +#include "compiler/rules/symbol.h" + +namespace tree_sitter { +namespace prepare_grammar { + +class InitialSyntaxGrammar { + public: + InitialSyntaxGrammar(); + InitialSyntaxGrammar( + const std::vector> &rules, + const std::vector> &aux_rules); + InitialSyntaxGrammar( + const std::vector> &rules, + const std::vector> &aux_rules, + const std::set &ubiquitous_tokens); + + const std::string &rule_name(const rules::Symbol &symbol) const; + const rules::rule_ptr &rule(const rules::Symbol &symbol) const; + + const std::vector> rules; + const std::vector> aux_rules; + std::set ubiquitous_tokens; +}; + +} // namespace prepare_grammar +} // namespace tree_sitter + +#endif // COMPILER_INITIAL_SYNTAX_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/prepare_grammar.cc b/src/compiler/prepare_grammar/prepare_grammar.cc index d5fa73e9..b9f8abd6 100644 --- a/src/compiler/prepare_grammar/prepare_grammar.cc +++ b/src/compiler/prepare_grammar/prepare_grammar.cc @@ -1,9 +1,11 @@ +#include "compiler/prepare_grammar/prepare_grammar.h" #include "compiler/prepare_grammar/expand_repeats.h" #include "compiler/prepare_grammar/expand_tokens.h" #include "compiler/prepare_grammar/extract_tokens.h" #include "compiler/prepare_grammar/intern_symbols.h" -#include "compiler/prepare_grammar/prepare_grammar.h" +#include "compiler/prepare_grammar/flatten_grammar.h" #include "compiler/lexical_grammar.h" +#include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/syntax_grammar.h" namespace tree_sitter { @@ -29,7 +31,7 @@ tuple prepare_grammar( return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); // Replace `Repeat` rules with pairs of recursive rules - const SyntaxGrammar &syntax_grammar = expand_repeats(get<0>(extract_result)); + const InitialSyntaxGrammar &syntax_grammar = expand_repeats(get<0>(extract_result)); // Expand `String` and `Pattern` rules into full rule trees auto expand_tokens_result = expand_tokens(get<1>(extract_result)); @@ -38,7 +40,7 @@ tuple prepare_grammar( if (error) return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); - return make_tuple(syntax_grammar, lex_grammar, nullptr); + return make_tuple(flatten_grammar(syntax_grammar), lex_grammar, nullptr); } } // namespace prepare_grammar diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 88192065..7a237cdd 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -3,33 +3,92 @@ #include #include #include "compiler/rules/symbol.h" +#include "compiler/rules/built_in_symbols.h" namespace tree_sitter { using std::string; +using std::to_string; using std::pair; using std::vector; using std::set; +static const vector START_PRODUCTIONS({ + Production({ {rules::Symbol(0), 0, -1} }, 2) +}); + +static const vector NO_PRODUCTIONS({}); + +bool ProductionEntry::operator==(const ProductionEntry &other) const { + return symbol == other.symbol && precedence == other.precedence && + rule_id == other.rule_id; +} + +Production::Production(const vector &entries, int last_rule_id) : + entries(entries), end_rule_id(last_rule_id) {} + +int Production::precedence_at(size_t index) const { + if (index >= size()) + return 0; + else + return entries[index].precedence; +} + +int Production::rule_id_at(size_t index) const { + if (index >= size()) + return end_rule_id; + else + return entries[index].rule_id; +} + +const rules::Symbol &Production::symbol_at(size_t index) const { + return entries[index].symbol; +} + +size_t Production::size() const { + return entries.size(); +} + SyntaxGrammar::SyntaxGrammar() {} -SyntaxGrammar::SyntaxGrammar(const vector> &rules, - const vector> &aux_rules) - : rules(rules), aux_rules(aux_rules) {} - -SyntaxGrammar::SyntaxGrammar(const vector> &rules, - const vector> &aux_rules, - const set &ubiquitous_tokens) +SyntaxGrammar::SyntaxGrammar( + const vector>> &rules, + const vector>> &aux_rules, + const set &ubiquitous_tokens) : rules(rules), aux_rules(aux_rules), ubiquitous_tokens(ubiquitous_tokens) {} -const rules::rule_ptr &SyntaxGrammar::rule(const rules::Symbol &symbol) const { - return symbol.is_auxiliary() ? aux_rules[symbol.index].second - : rules[symbol.index].second; -} - const string &SyntaxGrammar::rule_name(const rules::Symbol &symbol) const { return symbol.is_auxiliary() ? aux_rules[symbol.index].first : rules[symbol.index].first; } +const vector &SyntaxGrammar::productions(const rules::Symbol &symbol) const { + if (symbol == rules::START()) + return START_PRODUCTIONS; + if (symbol.is_built_in() || symbol.is_token()) + return NO_PRODUCTIONS; + if (symbol.is_auxiliary()) + return aux_rules[symbol.index].second; + else + return rules[symbol.index].second; +} + +std::ostream &operator<<(std::ostream &stream, const ProductionEntry &entry) { + return stream << string("(entry symbol:") << entry.symbol << + string(" precedence: ") << to_string(entry.precedence) << + string(" id: ") << to_string(entry.rule_id) << string(")"); +} + +std::ostream &operator<<(std::ostream &stream, const Production &production) { + stream << string("(production entries: ("); + bool started = false; + for (const auto &entry : production.entries) { + if (started) stream << string(" "); + stream << entry; + started = true; + } + return stream << string(") end_rule_id: ") << + to_string(production.end_rule_id) << string(")"); +} + } // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index 101ca78f..d0f2d157 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -10,22 +10,41 @@ namespace tree_sitter { +struct ProductionEntry { + rules::Symbol symbol; + int precedence; + int rule_id; + + bool operator==(const ProductionEntry &) const; +}; + +class Production { +public: + std::vector entries; + int end_rule_id; + Production(const std::vector &, int); + size_t size() const; + const rules::Symbol &symbol_at(size_t) const; + int precedence_at(size_t) const; + int rule_id_at(size_t) const; +}; + +std::ostream &operator<<(std::ostream &, const ProductionEntry &); +std::ostream &operator<<(std::ostream &, const Production &); + class SyntaxGrammar { public: SyntaxGrammar(); SyntaxGrammar( - const std::vector> &rules, - const std::vector> &aux_rules); - SyntaxGrammar( - const std::vector> &rules, - const std::vector> &aux_rules, + const std::vector>> &rules, + const std::vector>> &aux_rules, const std::set &ubiquitous_tokens); const std::string &rule_name(const rules::Symbol &symbol) const; - const rules::rule_ptr &rule(const rules::Symbol &symbol) const; - - const std::vector> rules; - const std::vector> aux_rules; + const std::vector &productions(const rules::Symbol &) const; + + std::vector>> rules; + std::vector>> aux_rules; std::set ubiquitous_tokens; }; From 1ba8701ada802865ab0dee3ed7e13c81f0696a48 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 27 Jan 2015 08:28:48 -0800 Subject: [PATCH 3/4] Compute fewer item set closures in item set transitions function --- .../build_tables/item_set_closure_spec.cc | 12 ++++++--- .../build_tables/build_parse_table.cc | 7 +++-- src/compiler/build_tables/item_set_closure.cc | 12 +++------ src/compiler/build_tables/item_set_closure.h | 5 +--- .../build_tables/item_set_transitions.cc | 26 ++++++------------- 5 files changed, 26 insertions(+), 36 deletions(-) diff --git a/spec/compiler/build_tables/item_set_closure_spec.cc b/spec/compiler/build_tables/item_set_closure_spec.cc index 866e7381..07d9aab8 100644 --- a/spec/compiler/build_tables/item_set_closure_spec.cc +++ b/spec/compiler/build_tables/item_set_closure_spec.cc @@ -43,10 +43,14 @@ describe("item_set_closure", []() { }, {}, set()); it("adds items at the beginnings of referenced rules", [&]() { - ParseItemSet item_set = item_set_closure( - ParseItem(Symbol(0), 0, 100, 0), - set({ Symbol(10, SymbolOptionToken) }), - grammar); + ParseItemSet item_set({ + { + ParseItem(Symbol(0), 0, 100, 0), + set({ Symbol(10, SymbolOptionToken) }), + } + }); + + item_set_closure(&item_set, grammar); AssertThat(item_set, Equals(ParseItemSet({ { diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 0c6a0282..8d386064 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -40,8 +40,11 @@ class ParseTableBuilder { : grammar(grammar), lex_grammar(lex_grammar) {} pair> build() { - ParseItem start_item(rules::START(), 0, -2, 0); - add_parse_state(item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar)); + ParseItemSet start_item_set({ + { ParseItem(rules::START(), 0, -2, 0), { rules::END_OF_INPUT() } } + }); + item_set_closure(&start_item_set, grammar); + add_parse_state(start_item_set); while (!item_sets_to_process.empty()) { auto pair = item_sets_to_process.back(); diff --git a/src/compiler/build_tables/item_set_closure.cc b/src/compiler/build_tables/item_set_closure.cc index c4d981da..6f22b7d3 100644 --- a/src/compiler/build_tables/item_set_closure.cc +++ b/src/compiler/build_tables/item_set_closure.cc @@ -17,19 +17,17 @@ using std::pair; using rules::Symbol; using rules::rule_ptr; -const ParseItemSet item_set_closure(const ParseItem &starting_item, - const set &starting_lookahead_symbols, - const SyntaxGrammar &grammar) { - ParseItemSet result; +void item_set_closure(ParseItemSet *item_set, const SyntaxGrammar &grammar) { vector>> items_to_process; - items_to_process.push_back({ starting_item, starting_lookahead_symbols }); + items_to_process.insert(items_to_process.end(), item_set->begin(), item_set->end()); + item_set->clear(); while (!items_to_process.empty()) { ParseItem item = items_to_process.back().first; set new_lookahead_symbols = items_to_process.back().second; items_to_process.pop_back(); - set &lookahead_symbols = result[item]; + set &lookahead_symbols = item_set->operator[](item); size_t previous_size = lookahead_symbols.size(); lookahead_symbols.insert(new_lookahead_symbols.begin(), new_lookahead_symbols.end()); @@ -70,8 +68,6 @@ const ParseItemSet item_set_closure(const ParseItem &starting_item, i++; } } - - return result; } } // namespace build_tables diff --git a/src/compiler/build_tables/item_set_closure.h b/src/compiler/build_tables/item_set_closure.h index b16fc99a..02ed4871 100644 --- a/src/compiler/build_tables/item_set_closure.h +++ b/src/compiler/build_tables/item_set_closure.h @@ -1,7 +1,6 @@ #ifndef COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ #define COMPILER_BUILD_TABLES_ITEM_SET_CLOSURE_H_ -#include #include "compiler/build_tables/parse_item.h" #include "compiler/rules/symbol.h" @@ -11,9 +10,7 @@ class SyntaxGrammar; namespace build_tables { -const ParseItemSet item_set_closure(const ParseItem &, - const std::set &, - const SyntaxGrammar &); +void item_set_closure(ParseItemSet *, const SyntaxGrammar &); } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/item_set_transitions.cc b/src/compiler/build_tables/item_set_transitions.cc index abacd5cd..15032028 100644 --- a/src/compiler/build_tables/item_set_transitions.cc +++ b/src/compiler/build_tables/item_set_transitions.cc @@ -12,6 +12,7 @@ namespace build_tables { using std::map; using std::set; +using std::vector; using rules::CharacterSet; using rules::Symbol; @@ -21,31 +22,20 @@ map sym_transitions(const ParseItemSet &item_set, for (const auto &pair : item_set) { const ParseItem &item = pair.first; const set &lookahead_symbols = pair.second; - const auto &productions = grammar.productions(item.lhs); - if (productions.empty()) - continue; - const Production &production = grammar.productions(item.lhs)[item.production_index]; if (production.size() <= item.consumed_symbol_count) continue; const Symbol &symbol = production.symbol_at(item.consumed_symbol_count); - ParseItem new_item( - item.lhs, - item.production_index, - production.rule_id_at(item.consumed_symbol_count + 1), - item.consumed_symbol_count + 1 - ); + int rule_id = production.rule_id_at(item.consumed_symbol_count + 1); + ParseItem new_item(item.lhs, item.production_index, rule_id, item.consumed_symbol_count + 1); - merge_sym_transition( - &result, - { symbol, item_set_closure(new_item, { lookahead_symbols }, grammar) }, - [](ParseItemSet *left, const ParseItemSet *right) { - for (auto &pair : *right) - left->operator[](pair.first) - .insert(pair.second.begin(), pair.second.end()); - }); + result[symbol][new_item].insert(lookahead_symbols.begin(), lookahead_symbols.end()); } + + for (auto &pair : result) + item_set_closure(&pair.second, grammar); + return result; } From 8ac4b9fc1785719fbce38743d5be7f73a50b3159 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 27 Jan 2015 19:56:49 -0800 Subject: [PATCH 4/4] Store productions' end rule ids in the vector --- .../build_tables/build_parse_table_spec.cc | 20 ++++++++++--- .../build_tables/item_set_closure_spec.cc | 19 +++++++----- .../build_tables/item_set_transitions_spec.cc | 4 +-- .../prepare_grammar/flatten_grammar_spec.cc | 19 ++++++------ .../build_tables/build_parse_table.cc | 5 ++-- src/compiler/build_tables/item_set_closure.cc | 12 ++++---- .../build_tables/item_set_transitions.cc | 6 ++-- .../prepare_grammar/flatten_grammar.cc | 25 +++++----------- src/compiler/rules/built_in_symbols.cc | 1 + src/compiler/rules/built_in_symbols.h | 1 + src/compiler/syntax_grammar.cc | 30 +++++-------------- src/compiler/syntax_grammar.h | 13 ++++---- 12 files changed, 74 insertions(+), 81 deletions(-) diff --git a/spec/compiler/build_tables/build_parse_table_spec.cc b/spec/compiler/build_tables/build_parse_table_spec.cc index c67a9418..dd670799 100644 --- a/spec/compiler/build_tables/build_parse_table_spec.cc +++ b/spec/compiler/build_tables/build_parse_table_spec.cc @@ -15,20 +15,32 @@ describe("build_parse_table", []() { { "rule0", { - Production({ {Symbol(1), 0, 1} }, 0), - Production({ {Symbol(2), 0, 2} }, 0) + Production({ + {Symbol(1), 0, 1}, + {rules::NONE(), 0, 5} + }), + Production({ + {Symbol(2), 0, 2}, + {rules::NONE(), 0, 6} + }) } }, { "rule1", { - Production({ {Symbol(0, SymbolOptionToken), 0, 3} }, 0) + Production({ + {Symbol(0, SymbolOptionToken), 0, 3}, + {rules::NONE(), 0, 7} + }) } }, { "rule2", { - Production({ {Symbol(1, SymbolOptionToken), 0, 4} }, 0) + Production({ + {Symbol(1, SymbolOptionToken), 0, 4}, + {rules::NONE(), 0, 8} + }) } }, }, {}, { Symbol(2, SymbolOptionToken) }); diff --git a/spec/compiler/build_tables/item_set_closure_spec.cc b/spec/compiler/build_tables/item_set_closure_spec.cc index 07d9aab8..4f395c28 100644 --- a/spec/compiler/build_tables/item_set_closure_spec.cc +++ b/spec/compiler/build_tables/item_set_closure_spec.cc @@ -2,6 +2,7 @@ #include "compiler/syntax_grammar.h" #include "compiler/build_tables/item_set_closure.h" #include "compiler/build_tables/item_set_transitions.h" +#include "compiler/rules/built_in_symbols.h" using namespace build_tables; using namespace rules; @@ -15,8 +16,9 @@ describe("item_set_closure", []() { { Production({ {Symbol(1), 0, 100}, - {Symbol(11, SymbolOptionToken), 0, 101} - }, 107), + {Symbol(11, SymbolOptionToken), 0, 101}, + {rules::NONE(), 0, 107} + }), } }, { @@ -24,11 +26,13 @@ describe("item_set_closure", []() { { Production({ {Symbol(12, SymbolOptionToken), 0, 102}, - {Symbol(13, SymbolOptionToken), 0, 103} - }, 108), + {Symbol(13, SymbolOptionToken), 0, 103}, + {rules::NONE(), 0, 108} + }), Production({ {Symbol(2), 0, 104}, - }, 109) + {rules::NONE(), 0, 109} + }) } }, { @@ -36,8 +40,9 @@ describe("item_set_closure", []() { { Production({ {Symbol(14, SymbolOptionToken), 0, 105}, - {Symbol(15, SymbolOptionToken), 0, 106} - }, 110) + {Symbol(15, SymbolOptionToken), 0, 106}, + {rules::NONE(), 0, 110} + }) } }, }, {}, set()); diff --git a/spec/compiler/build_tables/item_set_transitions_spec.cc b/spec/compiler/build_tables/item_set_transitions_spec.cc index 05df4c44..64fe6d5a 100644 --- a/spec/compiler/build_tables/item_set_transitions_spec.cc +++ b/spec/compiler/build_tables/item_set_transitions_spec.cc @@ -52,14 +52,14 @@ describe("sym_transitions(ParseItemSet, InitialSyntaxGrammar)", [&]() { {Symbol(13, SymbolOptionToken), 0, 103}, {Symbol(1), 0, 104}, {Symbol(14, SymbolOptionToken), 0, 105}, - }, 1) + }) }, }, { "B", { Production({ {Symbol(15, SymbolOptionToken), 0, 106}, - }, 2) + }) }, } }, {}, set()); diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc index 99872010..988cc45b 100644 --- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc +++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc @@ -3,6 +3,7 @@ #include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/syntax_grammar.h" #include "compiler/helpers/containers.h" +#include "compiler/rules/built_in_symbols.h" START_TEST @@ -48,15 +49,15 @@ describe("flatten_grammar", []() { AssertThat( get_symbol_lists(0), Equals(vector>({ - { Symbol(1), Symbol(2), Symbol(4) }, - { Symbol(1), Symbol(3), Symbol(4) } + { Symbol(1), Symbol(2), Symbol(4), rules::NONE() }, + { Symbol(1), Symbol(3), Symbol(4), rules::NONE() } }))); AssertThat( get_symbol_lists(1), Equals(vector>({ - { Symbol(1), Symbol(2), Symbol(3), Symbol(4), Symbol(6), Symbol(7) }, - { Symbol(1), Symbol(2), Symbol(5), Symbol(6), Symbol(7) } + { Symbol(1), Symbol(2), Symbol(3), Symbol(4), Symbol(6), Symbol(7), rules::NONE() }, + { Symbol(1), Symbol(2), Symbol(5), Symbol(6), Symbol(7), rules::NONE() } }))); }); @@ -74,15 +75,15 @@ describe("flatten_grammar", []() { AssertThat( get_precedence_lists(0), Equals(vector>({ - { 0, 0, 0 }, - { 0, 0, 0 } + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 } }))); AssertThat( get_precedence_lists(1), Equals(vector>({ - { 0, 0, 50, 100, 50, 0 }, - { 0, 0, 50, 50, 0 } + { 0, 0, 50, 100, 50, 0, 0 }, + { 0, 0, 50, 50, 0, 0 } }))); }); @@ -90,7 +91,7 @@ describe("flatten_grammar", []() { SyntaxGrammar grammar = flatten_grammar(input_grammar); auto rule_id = [&](int rule_index, int production_index, int symbol_index) { - return grammar.rules[rule_index].second[production_index].rule_id_at(symbol_index); + return grammar.rules[rule_index].second[production_index][symbol_index].rule_id; }; // Rule 1: last symbol is the same for both productions. diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 8d386064..a3001b42 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -176,11 +176,12 @@ class ParseTableBuilder { } bool item_is_done(const ParseItem &item) { - return item.consumed_symbol_count == grammar.productions(item.lhs)[item.production_index].size(); + return item.consumed_symbol_count == + grammar.productions(item.lhs)[item.production_index].symbol_count(); } int item_precedence(const ParseItem &item) { - return grammar.productions(item.lhs)[item.production_index].precedence_at(item.consumed_symbol_count - 1); + return grammar.productions(item.lhs)[item.production_index][item.consumed_symbol_count - 1].precedence; } void record_conflict(const Symbol &sym, const ParseAction &left, diff --git a/src/compiler/build_tables/item_set_closure.cc b/src/compiler/build_tables/item_set_closure.cc index 6f22b7d3..f2a431a9 100644 --- a/src/compiler/build_tables/item_set_closure.cc +++ b/src/compiler/build_tables/item_set_closure.cc @@ -35,18 +35,18 @@ void item_set_closure(ParseItemSet *item_set, const SyntaxGrammar &grammar) { continue; const Production &item_production = grammar.productions(item.lhs)[item.production_index]; - if (item_production.size() <= item.consumed_symbol_count) + if (item.consumed_symbol_count >= item_production.symbol_count()) continue; - Symbol symbol = item_production.symbol_at(item.consumed_symbol_count); + Symbol symbol = item_production[item.consumed_symbol_count].symbol; if (symbol.is_token() || symbol.is_built_in()) continue; set next_lookahead_symbols; - if (item.consumed_symbol_count + 1 >= item_production.size()) { + if (item.consumed_symbol_count + 1 >= item_production.symbol_count()) { next_lookahead_symbols = lookahead_symbols; } else { - vector symbols_to_process({ item_production.symbol_at(item.consumed_symbol_count + 1) }); + vector symbols_to_process({ item_production[item.consumed_symbol_count + 1].symbol }); while (!symbols_to_process.empty()) { Symbol following_symbol = symbols_to_process.back(); @@ -55,14 +55,14 @@ void item_set_closure(ParseItemSet *item_set, const SyntaxGrammar &grammar) { continue; for (const auto &production : grammar.productions(following_symbol)) - symbols_to_process.push_back(production.symbol_at(0)); + symbols_to_process.push_back(production[0].symbol); } } size_t i = 0; for (const Production &production : grammar.productions(symbol)) { items_to_process.push_back({ - ParseItem(symbol, i, production.rule_id_at(0), 0), + ParseItem(symbol, i, production[0].rule_id, 0), next_lookahead_symbols }); i++; diff --git a/src/compiler/build_tables/item_set_transitions.cc b/src/compiler/build_tables/item_set_transitions.cc index 15032028..f1f3abc9 100644 --- a/src/compiler/build_tables/item_set_transitions.cc +++ b/src/compiler/build_tables/item_set_transitions.cc @@ -23,11 +23,11 @@ map sym_transitions(const ParseItemSet &item_set, const ParseItem &item = pair.first; const set &lookahead_symbols = pair.second; const Production &production = grammar.productions(item.lhs)[item.production_index]; - if (production.size() <= item.consumed_symbol_count) + if (item.consumed_symbol_count >= production.symbol_count()) continue; - const Symbol &symbol = production.symbol_at(item.consumed_symbol_count); - int rule_id = production.rule_id_at(item.consumed_symbol_count + 1); + const Symbol &symbol = production[item.consumed_symbol_count].symbol; + int rule_id = production[item.consumed_symbol_count + 1].rule_id; ParseItem new_item(item.lhs, item.production_index, rule_id, item.consumed_symbol_count + 1); result[symbol][new_item].insert(lookahead_symbols.begin(), lookahead_symbols.end()); diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index cc368d25..1c2b82b3 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -5,6 +5,7 @@ #include "compiler/rules/seq.h" #include "compiler/rules/symbol.h" #include "compiler/rules/metadata.h" +#include "compiler/rules/built_in_symbols.h" #include #include @@ -63,16 +64,16 @@ class FlattenRule : public rules::RuleFn { Production flatten_rule(const rule_ptr &rule) { FlattenRule flattener; flattener.apply(rule); - return Production(flattener.entries, 0); + int end_precedence = flattener.entries.back().precedence; + flattener.entries.push_back({ rules::NONE(), end_precedence, 0 }); + return Production(flattener.entries); } struct ProductionSlice { vector::const_iterator start; vector::const_iterator end; - int end_precedence; bool operator==(const ProductionSlice &other) const { - if (end_precedence != other.end_precedence) return false; if (end - start != other.end - other.start) return false; for (auto iter1 = start, iter2 = other.start; iter1 != end; ++iter1, ++iter2) if (!(iter1->symbol == iter2->symbol) || iter1->precedence != iter2->precedence) @@ -82,11 +83,10 @@ struct ProductionSlice { }; void assign_rule_ids(Production *production, vector *unique_slices) { - auto &entries = production->entries; - auto end = entries.end(); + auto end = production->entries.end(); - for (auto iter = entries.begin(); iter != end; ++iter) { - ProductionSlice slice{iter, end, 0}; + for (auto iter = production->entries.begin(); iter != end; ++iter) { + ProductionSlice slice{iter, end}; auto existing_id = find(unique_slices->cbegin(), unique_slices->cend(), slice); if (existing_id == unique_slices->end()) { unique_slices->push_back(slice); @@ -95,15 +95,6 @@ void assign_rule_ids(Production *production, vector *unique_sli iter->rule_id = existing_id - unique_slices->cbegin(); } } - - ProductionSlice slice{end, end, production->precedence_at(production->size() - 1)}; - auto existing_id = find(unique_slices->cbegin(), unique_slices->cend(), slice); - if (existing_id == unique_slices->end()) { - unique_slices->push_back(slice); - production->end_rule_id = unique_slices->size() - 1; - } else { - production->end_rule_id = existing_id - unique_slices->cbegin(); - } } SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) { @@ -126,7 +117,7 @@ SyntaxGrammar flatten_grammar(const InitialSyntaxGrammar &grammar) { if (rules.empty()) { rules.push_back({ "START", - { Production({ {rules::Symbol(0, rules::SymbolOptionToken), 0, 0} }, 0) } + { Production({ {rules::Symbol(0, rules::SymbolOptionToken), 0, 0} }) } }); } diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc index 7a648a3d..4ca355a2 100644 --- a/src/compiler/rules/built_in_symbols.cc +++ b/src/compiler/rules/built_in_symbols.cc @@ -7,6 +7,7 @@ Symbol END_OF_INPUT() { return Symbol(-1, SymbolOptionToken); } Symbol ERROR() { return Symbol(-2, SymbolOptionToken); } Symbol START() { return Symbol(-3); } Symbol DOCUMENT() { return Symbol(-4); } +Symbol NONE() { return Symbol(-5); } } // namespace rules } // namespace tree_sitter diff --git a/src/compiler/rules/built_in_symbols.h b/src/compiler/rules/built_in_symbols.h index 63ad3df4..640f99db 100644 --- a/src/compiler/rules/built_in_symbols.h +++ b/src/compiler/rules/built_in_symbols.h @@ -10,6 +10,7 @@ Symbol ERROR(); Symbol START(); Symbol END_OF_INPUT(); Symbol DOCUMENT(); +Symbol NONE(); } // namespace rules } // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index 7a237cdd..82798309 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -14,7 +14,7 @@ using std::vector; using std::set; static const vector START_PRODUCTIONS({ - Production({ {rules::Symbol(0), 0, -1} }, 2) + Production({ {rules::Symbol(0), 0, -1}, { rules::NONE(), 0, -2} }) }); static const vector NO_PRODUCTIONS({}); @@ -24,29 +24,14 @@ bool ProductionEntry::operator==(const ProductionEntry &other) const { rule_id == other.rule_id; } -Production::Production(const vector &entries, int last_rule_id) : - entries(entries), end_rule_id(last_rule_id) {} +Production::Production(const vector &entries) : entries(entries) {} -int Production::precedence_at(size_t index) const { - if (index >= size()) - return 0; - else - return entries[index].precedence; +size_t Production::symbol_count() const { + return entries.size() - 1; } -int Production::rule_id_at(size_t index) const { - if (index >= size()) - return end_rule_id; - else - return entries[index].rule_id; -} - -const rules::Symbol &Production::symbol_at(size_t index) const { - return entries[index].symbol; -} - -size_t Production::size() const { - return entries.size(); +const ProductionEntry &Production::operator[](int i) const { + return entries[i]; } SyntaxGrammar::SyntaxGrammar() {} @@ -87,8 +72,7 @@ std::ostream &operator<<(std::ostream &stream, const Production &production) { stream << entry; started = true; } - return stream << string(") end_rule_id: ") << - to_string(production.end_rule_id) << string(")"); + return stream << string(")"); } } // namespace tree_sitter diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index d0f2d157..e192309c 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -18,15 +18,12 @@ struct ProductionEntry { bool operator==(const ProductionEntry &) const; }; -class Production { -public: +struct Production { + Production(); + Production(const std::vector &); + size_t symbol_count() const; + const ProductionEntry &operator[](int) const; std::vector entries; - int end_rule_id; - Production(const std::vector &, int); - size_t size() const; - const rules::Symbol &symbol_at(size_t) const; - int precedence_at(size_t) const; - int rule_id_at(size_t) const; }; std::ostream &operator<<(std::ostream &, const ProductionEntry &);