Expand regex/string rules as part of grammar preparation

This makes it possible to report errors in regex parsing.
2014-05-19 20:54:59 -07:00 · 2014-05-19 20:54:59 -07:00 · 649f200831
commit 649f200831
parent 5245bc01fe
26 changed files with 883 additions and 651 deletions
--- a/src/compiler/build_tables/rule_transitions.cc
+++ b/src/compiler/build_tables/rule_transitions.cc
@ -94,20 +94,6 @@ namespace tree_sitter {
                });
                return result;
            }
-
-            map<T, rule_ptr> apply_to(const rules::String *rule) {
-                rule_ptr result = make_shared<rules::Blank>();
-                for (char val : rule->value)
-                    result = rules::Seq::Build({
-                        result,
-                        CharacterSet({ val }).copy()
-                    });
-                return this->apply(result);
-            }
-
-            map<T, rule_ptr> apply_to(const rules::Pattern *rule) {
-                return this->apply(rule->to_rule_tree());
-            }
        };

        map<CharacterSet, rule_ptr> char_transitions(const rule_ptr &rule) {
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@ -113,8 +113,7 @@ namespace tree_sitter {
                } else if (symbol.is_token() && symbol.is_auxiliary()) {
                    return token_description(grammar_for_symbol(symbol).rule(symbol));
                } else {
-                    string name = grammar_for_symbol(symbol).rule_name(symbol);
-                    return name;
+                    return grammar_for_symbol(symbol).rule_name(symbol);
                }
            }

--- a/src/compiler/grammar.cc
+++ b/src/compiler/grammar.cc
@ -48,6 +48,10 @@ namespace tree_sitter {
    GrammarError::GrammarError(GrammarErrorType type, std::string message) :
        type(type),
        message(message) {}
+    
+    // Two errors compare equal only when both their type and their message match.
+    bool GrammarError::operator==(const GrammarError &other) const {
+        if (!(type == other.type))
+            return false;
+        return message == other.message;
+    }

    ostream& operator<<(ostream &stream, const GrammarError *error) {
        if (error)
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@ -0,0 +1,68 @@
+#include "compiler/prepare_grammar/expand_tokens.h"
+#include <vector>
+#include <string>
+#include <utility>
+#include "compiler/prepared_grammar.h"
+#include "compiler/rules/visitor.h"
+#include "compiler/rules/pattern.h"
+#include "compiler/rules/string.h"
+#include "compiler/rules/blank.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/prepare_grammar/parse_regex.h"
+
+namespace tree_sitter {
+    using std::string;
+    using std::vector;
+    using std::pair;
+    using std::make_shared;
+    using rules::rule_ptr;
+    using rules::String;
+    using rules::Pattern;
+    
+    namespace prepare_grammar {
+        // Rule visitor that replaces String and Pattern rules with equivalent
+        // rule trees; every other rule kind is handled by the inherited
+        // IdentityRuleFn overloads. The first regex error encountered is kept
+        // in `error`.
+        class ExpandTokens : public rules::IdentityRuleFn {
+            using rules::IdentityRuleFn::apply_to;
+
+            // A string literal becomes a sequence with one single-character
+            // set per character of the literal.
+            rule_ptr apply_to(const String *rule) {
+                vector<rule_ptr> characters;
+                for (char character : rule->value) {
+                    rules::CharacterSet single({ character });
+                    characters.push_back(single.copy());
+                }
+                return rules::Seq::Build(characters);
+            }
+
+            // A pattern is parsed as a regex. Only the first error seen is
+            // recorded; later ones do not overwrite it.
+            rule_ptr apply_to(const Pattern *rule) {
+                auto parsed = parse_regex(rule->value);
+                if (error == nullptr)
+                    error = parsed.second;
+                return parsed.first;
+            }
+
+        public:
+            // Null until a pattern fails to parse.
+            const GrammarError *error;
+            ExpandTokens() : error(nullptr) {}
+        };
+        
+        // Expands the String and Pattern rules of `grammar` (regular rules
+        // first, then auxiliary rules). On the first regex error, returns an
+        // empty PreparedGrammar together with that error; otherwise returns the
+        // expanded grammar and a null error.
+        pair<PreparedGrammar, const GrammarError *>
+        expand_tokens(const PreparedGrammar &grammar) {
+            ExpandTokens expander;
+            vector<pair<string, rule_ptr>> expanded_rules;
+            vector<pair<string, rule_ptr>> expanded_aux_rules;
+
+            for (const auto &entry : grammar.rules) {
+                auto expanded = expander.apply(entry.second);
+                if (expander.error != nullptr)
+                    return { PreparedGrammar(), expander.error };
+                expanded_rules.emplace_back(entry.first, expanded);
+            }
+
+            for (const auto &entry : grammar.aux_rules) {
+                auto expanded = expander.apply(entry.second);
+                if (expander.error != nullptr)
+                    return { PreparedGrammar(), expander.error };
+                expanded_aux_rules.emplace_back(entry.first, expanded);
+            }
+
+            return { PreparedGrammar(expanded_rules, expanded_aux_rules, grammar.options), nullptr };
+        }
+    }
+}
--- a/src/compiler/prepare_grammar/expand_tokens.h
+++ b/src/compiler/prepare_grammar/expand_tokens.h
@ -0,0 +1,16 @@
+#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+
+#include "tree_sitter/compiler.h"
+
+namespace tree_sitter {
+    class PreparedGrammar;
+    
+    namespace prepare_grammar {
+        std::pair<PreparedGrammar, const GrammarError *>
+        expand_tokens(const PreparedGrammar &);
+    }
+}
+
+#endif  // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
+
--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@ -0,0 +1,210 @@
+#include "compiler/prepare_grammar/parse_regex.h"
+#include <string>
+#include <utility>
+#include "compiler/rules/choice.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/repeat.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/rules/blank.h"
+#include "compiler/util/string_helpers.h"
+
+namespace tree_sitter {
+    using std::string;
+    using std::vector;
+    using std::pair;
+    using std::make_shared;
+    using rules::rule_ptr;
+    using rules::CharacterSet;
+    using rules::Seq;
+    using rules::Blank;
+    using rules::Choice;
+    using rules::Repeat;
+    using rules::CharacterRange;
+    using rules::blank;
+    
+    namespace prepare_grammar {
+        // Recursive-descent parser that turns a regex pattern string into a
+        // rule tree. Each parsing step returns a pair whose second element is
+        // null on success, or points to a GrammarError allocated with `new`
+        // on failure.
+        class PatternParser {
+        public:
+            explicit PatternParser(const string &input) :
+                input(input),
+                length(input.length()),
+                position(0) {}
+
+            // Parses one or more terms separated by '|'. `nested` is true while
+            // parsing inside parentheses; in that case term() stops at ')' and
+            // this loop ends there, leaving the ')' for the caller to consume.
+            // A single term is returned as-is; several are wrapped in a Choice.
+            pair<rule_ptr, const GrammarError *> rule(bool nested) {
+                vector<rule_ptr> choices = {};
+                do {
+                    // After the first term, only a '|' may continue the rule.
+                    if (!choices.empty()) {
+                        if (peek() == '|')
+                            next();
+                        else
+                            break;
+                    }
+                    auto pair = term(nested);
+                    if (pair.second)
+                        return { blank(), pair.second };
+                    choices.push_back(pair.first);
+                } while (has_more_input());
+                auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
+                return { rule, nullptr };
+            }
+
+        private:
+            // Parses a sequence of factors, stopping at '|', at ')' when
+            // `nested`, or at the end of the input.
+            // NOTE(review): this is a do/while, so the first iteration runs even
+            // when no input remains (empty pattern, or right after a trailing
+            // '|'); peek() then yields '\0', which is parsed as a character.
+            pair<rule_ptr, const GrammarError *> term(bool nested) {
+                rule_ptr result = blank();
+                do {
+                    if (peek() == '|')
+                        break;
+                    if (nested && peek() == ')')
+                        break;
+                    auto pair = factor();
+                    if (pair.second)
+                        return { blank(), pair.second };
+                    result = Seq::Build({ result, pair.first });
+                } while (has_more_input());
+                return { result, nullptr };
+            }
+
+            // Parses an atom followed by an optional quantifier:
+            //   '*' -> Repeat(atom)
+            //   '+' -> Seq(atom, Repeat(atom))
+            //   '?' -> Choice(atom, Blank)
+            pair<rule_ptr, const GrammarError *> factor() {
+                auto pair = atom();
+                if (pair.second)
+                    return { blank(), pair.second };
+                rule_ptr result = pair.first;
+                if (has_more_input()) {
+                    switch (peek()) {
+                        case '*':
+                            next();
+                            result = make_shared<Repeat>(result);
+                            break;
+                        case '+':
+                            next();
+                            result = make_shared<Seq>(result, make_shared<Repeat>(result));
+                            break;
+                        case '?':
+                            next();
+                            result = Choice::Build({ result, make_shared<Blank>() });
+                            break;
+                    }
+                }
+                return { result, nullptr };
+            }
+
+            // Parses a parenthesised group, a bracketed character set, '.'
+            // (any character except '\n'), or a single character. Unbalanced
+            // parentheses or square brackets produce an error.
+            pair<rule_ptr, const GrammarError *> atom() {
+                switch (peek()) {
+                    case '(': {
+                        next();
+                        auto pair = rule(true);
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        if (peek() != ')')
+                            return error("unmatched open paren");
+                        next();
+                        return { pair.first, nullptr };
+                    }
+                    case '[': {
+                        next();
+                        auto pair = char_set();
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        if (peek() != ']')
+                            return error("unmatched open square bracket");
+                        next();
+                        return { pair.first.copy(), nullptr };
+                    }
+                    case ')': {
+                        return error("unmatched close paren");
+                    }
+                    case ']': {
+                        return error("unmatched close square bracket");
+                    }
+                    case '.': {
+                        next();
+                        return { CharacterSet({ '\n' }).complement().copy(), nullptr };
+                    }
+                    default: {
+                        auto pair = single_char();
+                        if (pair.second)
+                            return { blank(), pair.second };
+                        return { pair.first.copy(), nullptr };
+                    }
+                }
+            }
+
+            // Parses the contents of a bracket expression; the caller has
+            // already consumed '['. A leading '^' complements the resulting set.
+            // Stops at ']' or at the end of the input without consuming the ']'.
+            pair<CharacterSet, const GrammarError *> char_set() {
+                bool is_affirmative = true;
+                if (peek() == '^') {
+                    next();
+                    is_affirmative = false;
+                }
+                CharacterSet result;
+                while (has_more_input() && (peek() != ']')) {
+                    auto pair = single_char();
+                    if (pair.second)
+                        return { CharacterSet(), pair.second };
+                    result.add_set(pair.first);
+                }
+                if (!is_affirmative)
+                    result = result.complement();
+                return { result, nullptr };
+            }
+
+            // Parses a backslash escape, an `x-y` character range, or a single
+            // literal character. Ranges are recognised here whether or not the
+            // caller is inside square brackets. Currently never reports an error.
+            pair<CharacterSet, const GrammarError *> single_char() {
+                CharacterSet value;
+                switch (peek()) {
+                    case '\\':
+                        next();
+                        value = escaped_char(peek());
+                        next();
+                        break;
+                    default:
+                        char first_char = peek();
+                        next();
+                        if (peek() == '-') {
+                            next();
+                            value = CharacterSet({ CharacterRange(first_char, peek()) });
+                            next();
+                        } else {
+                            value = CharacterSet({ first_char });
+                        }
+                }
+                return { value, nullptr };
+            }
+
+            // Maps the character following a backslash to a character set:
+            // 'a' -> letters, 'w' -> letters and digits, 'd' -> digits;
+            // any other character stands for itself.
+            CharacterSet escaped_char(char value) {
+                switch (value) {
+                    case 'a':
+                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
+                    case 'w':
+                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
+                    case 'd':
+                        return CharacterSet({ {'0', '9'} });
+                    default:
+                        return CharacterSet({ value });
+                }
+            }
+
+            // Advances one character. Not bounds-checked: callers may move the
+            // position past the end of the input.
+            void next() {
+                position++;
+            }
+
+            // Returns the character at the current position, or '\0' when no
+            // input remains. Because next() is unchecked (e.g. after a trailing
+            // '-' or '\\'), the position can exceed the input length, and
+            // indexing the string there would be undefined behaviour.
+            char peek() {
+                return has_more_input() ? input[position] : '\0';
+            }
+
+            bool has_more_input() {
+                return position < length;
+            }
+
+            // Builds a failed parse result carrying a newly allocated regex
+            // error with the given message.
+            pair<rule_ptr, const GrammarError *> error(string msg) {
+                return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
+            }
+
+            const string input;
+            const size_t length;
+            size_t position;
+        };
+
+        // Parses `input` as a top-level (non-nested) regex and returns the
+        // resulting rule together with an error pointer that is null on success.
+        pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
+            PatternParser parser(input);
+            return parser.rule(false);
+        }
+    }
+}
--- a/src/compiler/prepare_grammar/parse_regex.h
+++ b/src/compiler/prepare_grammar/parse_regex.h
@ -0,0 +1,16 @@
+#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
+#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
+
+#include "tree_sitter/compiler.h"
+#include <string>
+#include <utility>
+
+namespace tree_sitter {
+    namespace prepare_grammar {
+        std::pair<rules::rule_ptr, const GrammarError *>
+        parse_regex(const std::string &);
+    }
+}
+
+
+#endif  // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
--- a/src/compiler/prepare_grammar/prepare_grammar.cc
+++ b/src/compiler/prepare_grammar/prepare_grammar.cc
@ -2,8 +2,11 @@
 #include "compiler/prepared_grammar.h"
 #include "compiler/prepare_grammar/extract_tokens.h"
 #include "compiler/prepare_grammar/expand_repeats.h"
+#include "compiler/prepare_grammar/expand_tokens.h"
 #include "compiler/prepare_grammar/intern_symbols.h"

+#include "stream_methods.h"
+
 namespace tree_sitter {
    using std::tuple;
    using std::make_tuple;
@ -16,12 +19,17 @@ namespace tree_sitter {
            const GrammarError *error = result.second;

            if (error)
-                return make_tuple(PreparedGrammar({}, {}), PreparedGrammar({}, {}), error);
+                return make_tuple(PreparedGrammar(), PreparedGrammar(), error);

            auto grammars = extract_tokens(grammar);
            const PreparedGrammar &rule_grammar = expand_repeats(grammars.first);
-            const PreparedGrammar &lex_grammar = grammars.second;
-
+            auto expand_tokens_result = expand_tokens(grammars.second);
+            const PreparedGrammar &lex_grammar = expand_tokens_result.first;
+            error = expand_tokens_result.second;
+            
+            if (error)
+                return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
+            
            return make_tuple(rule_grammar, lex_grammar, nullptr);
        }
    }
--- a/src/compiler/prepared_grammar.cc
+++ b/src/compiler/prepared_grammar.cc
@ -10,6 +10,8 @@ namespace tree_sitter {
    using std::ostream;
    using rules::rule_ptr;
    using rules::Symbol;
+    
+    // Default constructor: initialises the Grammar base, the auxiliary rules
+    // and the options from empty initializer lists.
+    PreparedGrammar::PreparedGrammar() :
+        Grammar({}),
+        aux_rules({}),
+        options({}) {}

    PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
                                     const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules) :
--- a/src/compiler/prepared_grammar.h
+++ b/src/compiler/prepared_grammar.h
@ -14,6 +14,7 @@ namespace tree_sitter {

    class PreparedGrammar : public Grammar {
    public:
+        PreparedGrammar();
        PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
                        const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
        PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
--- a/src/compiler/rules/metadata.h
+++ b/src/compiler/rules/metadata.h
@ -11,6 +11,7 @@ namespace tree_sitter  {
            START_TOKEN,
            PRECEDENCE,
            IS_TOKEN,
+            DESCRIPTION,
        } MetadataKey;

        class Metadata : public Rule {
--- a/src/compiler/rules/pattern.cc
+++ b/src/compiler/rules/pattern.cc
@ -1,173 +1,12 @@
 #include "compiler/rules/pattern.h"
-#include <set>
 #include <string>
-#include <vector>
 #include "compiler/rules/visitor.h"
-#include "compiler/rules/choice.h"
-#include "compiler/rules/seq.h"
-#include "compiler/rules/repeat.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/rules/blank.h"
 #include "compiler/util/string_helpers.h"

 namespace tree_sitter {
    namespace rules {
        using std::string;
        using std::hash;
-        using std::make_shared;
-        using std::set;
-        using std::vector;
-
-        class PatternParser {
-        public:
-            explicit PatternParser(const string &input) :
-                input(input),
-                length(input.length()),
-                position(0) {}
-
-            rule_ptr rule() {
-                vector<rule_ptr> choices = { term() };
-                while (has_more_input() && peek() == '|') {
-                    next();
-                    choices.push_back(term());
-                }
-                return (choices.size() > 1) ? Choice::Build(choices) : choices.front();
-            }
-
-        private:
-            rule_ptr term() {
-                rule_ptr result = factor();
-                while (has_more_input() && (peek() != '|') && (peek() != ')'))
-                    result = Seq::Build({ result, factor() });
-                return result;
-            }
-
-            rule_ptr factor() {
-                rule_ptr result = atom();
-                if (has_more_input()) {
-                    switch (peek()) {
-                        case '*':
-                            next();
-                            result = make_shared<Repeat>(result);
-                            break;
-                        case '+':
-                            next();
-                            result = make_shared<Seq>(result, make_shared<Repeat>(result));
-                            break;
-                        case '?':
-                            next();
-                            result = Choice::Build({ result, make_shared<Blank>() });
-                            break;
-                    }
-                }
-                return result;
-            }
-
-            rule_ptr atom() {
-                rule_ptr result;
-                switch (peek()) {
-                    case '(':
-                        next();
-                        result = rule();
-                        if (has_error()) return result;
-                        if (peek() != ')') {
-                            error = "mismatched parens";
-                            return result;
-                        }
-                        next();
-                        break;
-                    case '[':
-                        next();
-                        result = char_set().copy();
-                        if (has_error()) return result;
-                        if (peek() != ']') {
-                            error = "mismatched square brackets";
-                            return result;
-                        }
-                        next();
-                        break;
-                    case ')':
-                        error = "mismatched parens";
-                        break;
-                    case '.':
-                        result = CharacterSet({ '\n' }).complement().copy();
-                        next();
-                        break;
-                    default:
-                        result = single_char().copy();
-                }
-                return result;
-            }
-
-            CharacterSet char_set() {
-                bool is_affirmative = true;
-                if (peek() == '^') {
-                    next();
-                    is_affirmative = false;
-                }
-                CharacterSet result;
-                while (has_more_input() && (peek() != ']'))
-                    result.add_set(single_char());
-                return is_affirmative ? result : result.complement();
-            }
-
-            CharacterSet single_char() {
-                CharacterSet value;
-                switch (peek()) {
-                    case '\\':
-                        next();
-                        value = escaped_char(peek());
-                        if (has_error()) return value;
-                        next();
-                        break;
-                    default:
-                        char first_char = peek();
-                        next();
-                        if (peek() == '-') {
-                            next();
-                            value = CharacterSet({ CharacterRange(first_char, peek()) });
-                            next();
-                        } else {
-                            value = CharacterSet({ first_char });
-                        }
-                }
-                return value;
-            }
-
-            CharacterSet escaped_char(char value) {
-                switch (value) {
-                    case 'a':
-                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
-                    case 'w':
-                        return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
-                    case 'd':
-                        return CharacterSet({ {'0', '9'} });
-                    default:
-                        return CharacterSet({ value });
-                }
-            }
-
-            void next() {
-                position++;
-            }
-
-            char peek() {
-                return input[position];
-            }
-
-            bool has_more_input() {
-                return position < length;
-            }
-
-            bool has_error() {
-                return error != "";
-            }
-
-            string error;
-            const string input;
-            const size_t length;
-            size_t position;
-        };

        Pattern::Pattern(const string &string) : value(string) {}

@ -191,9 +30,5 @@ namespace tree_sitter {
        void Pattern::accept(Visitor *visitor) const {
            visitor->visit(this);
        }
-
-        rule_ptr Pattern::to_rule_tree() const {
-            return PatternParser(value).rule();
-        }
    }
 }
--- a/src/compiler/rules/pattern.h
+++ b/src/compiler/rules/pattern.h
@ -17,7 +17,6 @@ namespace tree_sitter {
            void accept(Visitor *visitor) const;

            const std::string value;
-            rule_ptr to_rule_tree() const;
        };
    }
 }