From 0985fa300854346f315bbdc111d4d3c86affe47b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 5 Jan 2014 01:19:32 -0800 Subject: [PATCH] Collapse rules that contain only a single token --- .../compiler/grammar/prepare_grammar_spec.cpp | 19 +++++ spec/fixtures/parsers/arithmetic.c | 76 ++++++------------- src/compiler/grammar/extract_tokens.cpp | 28 +++++-- src/compiler/grammar/grammar.cpp | 2 +- src/compiler/lr/table_builder.cpp | 1 - src/compiler/rules/rule.cpp | 4 + src/compiler/rules/rule.h | 1 + 7 files changed, 70 insertions(+), 61 deletions(-) diff --git a/spec/compiler/grammar/prepare_grammar_spec.cpp b/spec/compiler/grammar/prepare_grammar_spec.cpp index 8613b9d7..52859d9c 100644 --- a/spec/compiler/grammar/prepare_grammar_spec.cpp +++ b/spec/compiler/grammar/prepare_grammar_spec.cpp @@ -34,6 +34,25 @@ describe("preparing a grammar", []() { rules::character('b') }) } }))); }); + + it("turns entire rules into tokens when they contain no symbols", [&]() { + auto result = prepare_grammar(Grammar({ + { "rule1", sym("rule2") }, + { "rule2", seq({ + character('a'), + character('b') }) } + })); + + AssertThat(result.first, Equals(Grammar({ + { "rule1", sym("rule2") } + }))); + + AssertThat(result.second, Equals(Grammar("", { + { "rule2", seq({ + character('a'), + character('b') }) } + }))); + }); }); END_TEST \ No newline at end of file diff --git a/spec/fixtures/parsers/arithmetic.c b/spec/fixtures/parsers/arithmetic.c index fca3d303..a600c694 100644 --- a/spec/fixtures/parsers/arithmetic.c +++ b/spec/fixtures/parsers/arithmetic.c @@ -4,15 +4,13 @@ typedef enum { ts_symbol_expression, ts_symbol_term, - ts_symbol_number, ts_symbol_factor, - ts_symbol_variable, - ts_symbol_6, - ts_symbol_5, ts_symbol_4, ts_symbol_3, - ts_symbol_2, ts_symbol_1, + ts_symbol_2, + ts_symbol_number, + ts_symbol_variable, ts_symbol___END__ } ts_symbol; @@ -20,23 +18,23 @@ static void ts_lex(TSParser *parser) { START_LEXER(); switch (LEX_STATE()) { case 0: - if (isdigit(LOOKAHEAD_CHAR())) - ADVANCE(3); - if (LOOKAHEAD_CHAR() == '(') - ADVANCE(2); if (isalnum(LOOKAHEAD_CHAR())) + ADVANCE(3); + if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(2); + if (LOOKAHEAD_CHAR() == '(') ADVANCE(1); LEX_ERROR(); case 1: - if (isalnum(LOOKAHEAD_CHAR())) - ADVANCE(1); ACCEPT_TOKEN(ts_symbol_1); case 2: - ACCEPT_TOKEN(ts_symbol_2); - case 3: if (isdigit(LOOKAHEAD_CHAR())) + ADVANCE(2); + ACCEPT_TOKEN(ts_symbol_number); + case 3: + if (isalnum(LOOKAHEAD_CHAR())) ADVANCE(3); - ACCEPT_TOKEN(ts_symbol_4); + ACCEPT_TOKEN(ts_symbol_variable); case 4: LEX_ERROR(); case 5: @@ -44,19 +42,19 @@ static void ts_lex(TSParser *parser) { ADVANCE(6); LEX_ERROR(); case 6: - ACCEPT_TOKEN(ts_symbol_6); + ACCEPT_TOKEN(ts_symbol_4); case 7: if (LOOKAHEAD_CHAR() == '*') ADVANCE(8); LEX_ERROR(); case 8: - ACCEPT_TOKEN(ts_symbol_5); + ACCEPT_TOKEN(ts_symbol_3); case 9: if (LOOKAHEAD_CHAR() == ')') ADVANCE(10); LEX_ERROR(); case 10: - ACCEPT_TOKEN(ts_symbol_3); + ACCEPT_TOKEN(ts_symbol_2); default: LEX_ERROR(); } @@ -69,8 +67,6 @@ TSTree ts_parse_arithmetic(const char *input) { case 0: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { - case ts_symbol_4: - SHIFT(13); case ts_symbol_variable: SHIFT(8); case ts_symbol_factor: @@ -78,8 +74,6 @@ TSTree ts_parse_arithmetic(const char *input) { case ts_symbol_number: SHIFT(8); case ts_symbol_1: - SHIFT(12); - case ts_symbol_2: SHIFT(9); case ts_symbol_term: SHIFT(2); @@ -99,7 +93,7 @@ TSTree ts_parse_arithmetic(const char *input) { case 2: SET_LEX_STATE(5); switch (LOOKAHEAD_SYM()) { - case ts_symbol_6: + case ts_symbol_4: SHIFT(3); default: REDUCE(ts_symbol_expression, 1); @@ -107,20 +101,16 @@ TSTree ts_parse_arithmetic(const char *input) { case 3: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { - case ts_symbol_4: - SHIFT(13); - case ts_symbol_1: - SHIFT(12); - case ts_symbol_term: - SHIFT(4); - case ts_symbol_2: - SHIFT(9); case ts_symbol_variable: SHIFT(8); case ts_symbol_factor: SHIFT(5); + case ts_symbol_1: + SHIFT(9); case ts_symbol_number: SHIFT(8); + case ts_symbol_term: + SHIFT(4); default: PARSE_ERROR(); } @@ -133,7 +123,7 @@ TSTree ts_parse_arithmetic(const char *input) { case 5: SET_LEX_STATE(7); switch (LOOKAHEAD_SYM()) { - case ts_symbol_5: + case ts_symbol_3: SHIFT(6); default: REDUCE(ts_symbol_term, 1); @@ -141,12 +131,8 @@ TSTree ts_parse_arithmetic(const char *input) { case 6: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { - case ts_symbol_4: - SHIFT(13); - case ts_symbol_2: - SHIFT(9); case ts_symbol_1: - SHIFT(12); + SHIFT(9); case ts_symbol_number: SHIFT(8); case ts_symbol_variable: @@ -171,8 +157,6 @@ TSTree ts_parse_arithmetic(const char *input) { case 9: SET_LEX_STATE(0); switch (LOOKAHEAD_SYM()) { - case ts_symbol_4: - SHIFT(13); case ts_symbol_variable: SHIFT(8); case ts_symbol_factor: @@ -180,8 +164,6 @@ TSTree ts_parse_arithmetic(const char *input) { case ts_symbol_number: SHIFT(8); case ts_symbol_1: - SHIFT(12); - case ts_symbol_2: SHIFT(9); case ts_symbol_term: SHIFT(2); @@ -193,7 +175,7 @@ TSTree ts_parse_arithmetic(const char *input) { case 10: SET_LEX_STATE(9); switch (LOOKAHEAD_SYM()) { - case ts_symbol_3: + case ts_symbol_2: SHIFT(11); default: PARSE_ERROR(); @@ -204,18 +186,6 @@ TSTree ts_parse_arithmetic(const char *input) { default: REDUCE(ts_symbol_factor, 3); } - case 12: - SET_LEX_STATE(4); - switch (LOOKAHEAD_SYM()) { - default: - REDUCE(ts_symbol_variable, 1); - } - case 13: - SET_LEX_STATE(4); - switch (LOOKAHEAD_SYM()) { - default: - REDUCE(ts_symbol_number, 1); - } default: PARSE_ERROR(); } diff --git a/src/compiler/grammar/extract_tokens.cpp b/src/compiler/grammar/extract_tokens.cpp index f5c39ae6..6de5a5dc 100644 --- a/src/compiler/grammar/extract_tokens.cpp +++ b/src/compiler/grammar/extract_tokens.cpp @@ -11,8 +11,22 @@ namespace tree_sitter { class TokenExtractor : rules::Visitor { public: rules::rule_ptr value; + size_t anonymous_token_count = 0; unordered_map tokens; + rules::rule_ptr initial_apply(string name, const rules::rule_ptr rule) { + auto result = apply(rule); + auto symbol = std::dynamic_pointer_cast(result); + if (symbol && *symbol != *rule) { + tokens.insert({ name, tokens[symbol->name] }); + tokens.erase(symbol->name); + anonymous_token_count--; + return rules::rule_ptr(); + } else { + return result; + } + } + rules::rule_ptr apply(const rules::rule_ptr rule) { if (search_for_symbols(rule)) { rule->accept(*this); @@ -24,10 +38,10 @@ namespace tree_sitter { } string add_token(const rules::rule_ptr &rule) { - for (auto pair : tokens) { - if (*pair.second == *rule) return pair.first; - } - string name = to_string(tokens.size() + 1); + for (auto pair : tokens) + if (*pair.second == *rule) + return pair.first; + string name = to_string(++anonymous_token_count); tokens.insert({ name, rule }); return name; } @@ -50,8 +64,10 @@ namespace tree_sitter { unordered_map rules; for (auto pair : input_grammar.rules) { - auto new_rule = extractor.apply(pair.second); - rules.insert({ pair.first, new_rule }); + string name = pair.first; + auto new_rule = extractor.initial_apply(name, pair.second); + if (new_rule.get()) + rules.insert({ name, new_rule }); } return { diff --git a/src/compiler/grammar/grammar.cpp b/src/compiler/grammar/grammar.cpp index ca1ad9e7..940c0920 100644 --- a/src/compiler/grammar/grammar.cpp +++ b/src/compiler/grammar/grammar.cpp @@ -16,7 +16,6 @@ namespace tree_sitter { rules(rules), start_rule_name(start_rule_name) {} - const rules::rule_ptr Grammar::rule(const string &name) const { auto iter = rules.find(name); return (iter == rules.end()) ? @@ -38,6 +37,7 @@ namespace tree_sitter { for (auto pair : rules) { auto other_pair = other.rules.find(pair.first); if (other_pair == other.rules.end()) return false; + auto orr = other_pair->second->to_string();; if (!other_pair->second->operator==(*pair.second)) return false; } return true; diff --git a/src/compiler/lr/table_builder.cpp b/src/compiler/lr/table_builder.cpp index 77ac31cd..39455a1e 100644 --- a/src/compiler/lr/table_builder.cpp +++ b/src/compiler/lr/table_builder.cpp @@ -31,7 +31,6 @@ namespace tree_sitter { void add_shift_actions(const ItemSet &item_set, size_t state_index) { auto x = item_set.sym_transitions(grammar); for (auto transition : x) { - rules::Symbol symbol = *transition.first; ItemSet item_set = *transition.second; size_t new_state_index = add_parse_state(item_set); diff --git a/src/compiler/rules/rule.cpp b/src/compiler/rules/rule.cpp index aea595e1..d7b4b875 100644 --- a/src/compiler/rules/rule.cpp +++ b/src/compiler/rules/rule.cpp @@ -6,6 +6,10 @@ using std::string; namespace tree_sitter { namespace rules { + bool Rule::operator!=(const Rule &other) const { + return !this->operator==(other); + } + ostream& operator<<(ostream& stream, const Rule &rule) { return stream << rule.to_string(); } diff --git a/src/compiler/rules/rule.h b/src/compiler/rules/rule.h index 9e52b496..fb90ed04 100644 --- a/src/compiler/rules/rule.h +++ b/src/compiler/rules/rule.h @@ -13,6 +13,7 @@ namespace tree_sitter { class Rule { public: virtual bool operator==(const Rule& other) const = 0; + bool operator!=(const Rule& other) const; virtual size_t hash_code() const = 0; virtual rule_ptr copy() const = 0; virtual std::string to_string() const = 0;