Backfill tests for token extraction in auxiliary rules

This commit is contained in:
Max Brunsfeld 2014-05-12 13:21:14 -07:00
parent 608b5ce02b
commit 5245bc01fe
3 changed files with 19242 additions and 14280 deletions

File diff suppressed because it is too large Load diff

View file

@@ -8,13 +8,13 @@ using namespace rules;
using prepare_grammar::extract_tokens;
describe("extracting tokens from a grammar", []() {
it("moves strings into the lexical grammar", [&]() {
it("moves string rules into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ str("ab"), i_sym(0) }) }
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -22,13 +22,13 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("moves patterns into the lexical grammar", [&]() {
it("moves pattern rules into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ pattern("a+"), i_sym(0) }) }
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -38,13 +38,13 @@ describe("extracting tokens from a grammar", []() {
it("moves other rules marked as tokens into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({
{ "rule_A", seq({
token(choice({ str("a"), str("b") })),
i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -52,13 +52,13 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("does not extract blanks into tokens", [&]() {
it("does not extract blanks", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule1", choice({ i_sym(0), blank() }) },
{ "rule_A", choice({ i_sym(0), blank() }) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule1", choice({ i_sym(0), blank() }) },
{ "rule_A", choice({ i_sym(0), blank() }) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {})));
@@ -66,11 +66,11 @@ describe("extracting tokens from a grammar", []() {
it("does not create duplicate tokens in the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ str("ab"), i_sym(0), str("ab") }) },
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -78,52 +78,85 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("moves entire rules into the lexical grammar when possible, updating referencing symbols", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", i_sym(1) },
{ "rule1", pattern("a|b") },
{ "rule2", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule1", pattern("a|b") },
{ "rule2", token(seq({ str("a"), str("b") })) },
}, {})));
it("extracts tokens from the grammar's auxiliary rules", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "token0", str("ab") },
})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", str("ab") },
{ "rule1", i_sym(0) },
{ "rule2", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule1", i_token(0) },
{ "rule2", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule0", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", str("ab") },
{ "rule1", i_sym(0) },
{ "rule2", i_sym(1) },
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
describe("when an entire rule can be extracted", [&]() {
it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", i_sym(1) },
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_A", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
});
it("extracts entire auxiliary rules", [&]() {
auto result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", str("ab") },
{ "rule_B", i_aux_sym(0) },
{ "rule_C", i_aux_sym(1) },
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_B", i_aux_token(0) },
{ "rule_C", i_aux_sym(0) },
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "rule_A", str("ab") },
})));
});
});
});

View file

@@ -5,10 +5,6 @@
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
#include "compiler/rules/metadata.h"
@@ -23,7 +19,6 @@ namespace tree_sitter {
using std::make_shared;
using rules::rule_ptr;
using rules::Symbol;
using std::dynamic_pointer_cast;
namespace prepare_grammar {
class IsToken : public rules::RuleFn<bool> {
@@ -51,49 +46,46 @@ namespace tree_sitter {
public:
Symbol replace_symbol(const Symbol &rule) {
if (rule.is_built_in()) return rule;
auto replacement_pair = replacements.find(rule);
if (replacement_pair != replacements.end())
return replacement_pair->second;
else if (rule.is_built_in())
return rule;
else
return Symbol(new_index_for_symbol(rule), rule.options);
}
SymbolInliner(const map<Symbol, Symbol> &replacements, size_t rule_count, size_t aux_rule_count) :
replacements(replacements)
{}
SymbolInliner(const map<Symbol, Symbol> &replacements) : replacements(replacements) {}
};
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary);
class TokenExtractor : public rules::IdentityRuleFn {
size_t add_token(rule_ptr rule) {
rule_ptr apply_to_token(const rules::Rule *rule) {
auto result = rule->copy();
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].second->operator==(*rule))
return i;
return make_shared<Symbol>(i, SymbolOptionAuxToken);
size_t index = tokens.size();
tokens.push_back({ "token" + to_string(index), rule });
return index;
}
tokens.push_back({ "token" + to_string(index), result });
return make_shared<Symbol>(index, SymbolOptionAuxToken);
rule_ptr apply_to_token(const rules::rule_ptr rule) {
size_t index = add_token(rule);
return make_shared<rules::Symbol>(index, rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary));
}
rule_ptr default_apply(const rules::Rule *rule) {
auto result = rule->copy();
if (IsToken().apply(result)) {
return apply_to_token(result);
if (IsToken().apply(rule->copy())) {
return apply_to_token(rule);
} else {
return result;
}
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->value_for(rules::IS_TOKEN)) {
return apply_to_token(rule->copy());
auto result = rule->copy();
if (IsToken().apply(rule->copy())) {
return apply_to_token(rule);
} else {
return make_shared<rules::Metadata>(apply(rule->rule), rule->value);
return rules::IdentityRuleFn::apply_to(rule);
}
}
@@ -103,6 +95,8 @@ namespace tree_sitter {
pair<PreparedGrammar, PreparedGrammar> extract_tokens(const PreparedGrammar &input_grammar) {
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
vector<Symbol> ubiquitous_tokens;
TokenExtractor extractor;
map<Symbol, Symbol> symbol_replacements;
@@ -112,7 +106,7 @@ namespace tree_sitter {
tokens.push_back(pair);
symbol_replacements.insert({
Symbol(i),
Symbol(tokens.size() - 1, rules::SymbolOption(rules::SymbolOptionToken))
Symbol(tokens.size() - 1, rules::SymbolOptionToken)
});
} else {
rules.push_back({ pair.first, extractor.apply(pair.second) });
@@ -134,16 +128,13 @@ namespace tree_sitter {
aux_tokens.insert(aux_tokens.end(), extractor.tokens.begin(), extractor.tokens.end());
SymbolInliner inliner(symbol_replacements, input_grammar.rules.size(), input_grammar.aux_rules.size());
vector<Symbol> ubiquitous_tokens;
SymbolInliner inliner(symbol_replacements);
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &pair : aux_rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.options.ubiquitous_tokens) {
for (auto &symbol : input_grammar.options.ubiquitous_tokens)
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
}
PreparedGrammarOptions parse_options(input_grammar.options);
parse_options.ubiquitous_tokens = ubiquitous_tokens;