#include "compiler/prepare_grammar/parse_regex.h"
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "compiler/rules/choice.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/blank.h"
#include "compiler/util/string_helpers.h"
#include "utf8proc.h"

namespace tree_sitter {
namespace prepare_grammar {

using std::string;
using std::vector;
using std::pair;
using std::make_shared;
using rules::rule_ptr;
using rules::CharacterSet;
using rules::Seq;
using rules::Blank;
using rules::Choice;
using rules::Repeat;
using rules::blank;

// Recursive-descent parser that turns a regex-like pattern into a rule tree.
//
// Grammar handled by the methods below:
//   rule    := term ('|' term)*
//   term    := factor*
//   factor  := atom ('*' | '+' | '?')?
//   atom    := '(' rule ')' | '[' char_set ']' | '.' | single_char
//
// Every parsing method returns a (value, error) pair. A non-null error means
// parsing failed and the value half must be ignored.
class PatternParser {
 public:
  // Copies `input` and primes the one-code-point lookahead.
  explicit PatternParser(const string &input)
      : input(input),
        // Inside the initializer list a bare `input` names the constructor
        // parameter, so `this->` is required to point the cursor at the
        // member copy (which this object owns) rather than at the caller's
        // string. Members are initialized in declaration order, so
        // `this->input` is already constructed at this point.
        iter(reinterpret_cast<const uint8_t *>(this->input.data())),
        end(iter + this->input.size()),
        lookahead(0) {
    next();
  }

  // Parses a '|'-separated list of terms.
  //
  // `nested` is true when parsing inside parentheses, in which case a ')'
  // ends the rule instead of being treated as an error by atom().
  // Returns a Choice over the alternatives, or the lone term when there is
  // only one.
  pair<rule_ptr, const GrammarError *> rule(bool nested) {
    vector<rule_ptr> choices = {};
    do {
      // After the first alternative, only a '|' lets the loop continue.
      if (!choices.empty()) {
        if (peek() == '|')
          next();
        else
          break;
      }
      auto parsed = term(nested);
      if (parsed.second)
        return { blank(), parsed.second };
      choices.push_back(parsed.first);
    } while (has_more_input());

    // The do-while body ran at least once without an error, so `choices`
    // is never empty here.
    rule_ptr result;
    if (choices.size() > 1)
      result = make_shared<Choice>(choices);
    else
      result = choices.front();
    return { result, nullptr };
  }

 private:
  // Parses a sequence of factors, stopping at '|' (or at ')' when nested).
  // The sequence is seeded with blank() and extended via Seq::build.
  //
  // NOTE(review): because this is a do-while, an empty pattern (lookahead 0)
  // still reaches factor() once and yields a character set containing code
  // point 0. Behavior preserved from the original; confirm it is intended.
  pair<rule_ptr, const GrammarError *> term(bool nested) {
    rule_ptr result = blank();
    do {
      if (peek() == '|')
        break;
      if (nested && peek() == ')')
        break;
      auto parsed = factor();
      if (parsed.second)
        return { blank(), parsed.second };
      result = Seq::build({ result, parsed.first });
    } while (has_more_input());
    return { result, nullptr };
  }

  // Parses an atom followed by at most one postfix operator:
  //   '*' -> Repeat(atom)
  //   '+' -> Seq(atom, Repeat(atom))
  //   '?' -> Choice(atom, Blank)
  pair<rule_ptr, const GrammarError *> factor() {
    auto parsed = atom();
    if (parsed.second)
      return { blank(), parsed.second };

    rule_ptr result = parsed.first;
    if (has_more_input()) {
      switch (peek()) {
        case '*':
          next();
          result = make_shared<Repeat>(result);
          break;
        case '+':
          next();
          result = make_shared<Seq>(result, make_shared<Repeat>(result));
          break;
        case '?':
          next();
          result = Choice::build({ result, make_shared<Blank>() });
          break;
      }
    }
    return { result, nullptr };
  }

  // Parses a parenthesized group, a bracketed character set, '.', or a
  // single (possibly escaped) character. A stray ')' or ']' is an error.
  pair<rule_ptr, const GrammarError *> atom() {
    switch (peek()) {
      case '(': {
        next();
        auto group = rule(true);
        if (group.second)
          return { blank(), group.second };
        if (peek() != ')')
          return error("unmatched open paren");
        next();
        return { group.first, nullptr };
      }

      case '[': {
        next();
        auto set = char_set();
        if (set.second)
          return { blank(), set.second };
        if (peek() != ']')
          return error("unmatched open square bracket");
        next();
        return { set.first.copy(), nullptr };
      }

      case ')':
        return error("unmatched close paren");

      case ']':
        return error("unmatched close square bracket");

      case '.':
        // '.' matches every character except a newline.
        next();
        return { CharacterSet().include_all().exclude('\n').copy(), nullptr };

      default: {
        auto character = single_char();
        if (character.second)
          return { blank(), character.second };
        return { character.first.copy(), nullptr };
      }
    }
  }

  // Parses the contents of a bracket expression up to, but not including,
  // the closing ']'. A leading '^' negates the set: it starts from
  // "all characters" and each listed item is removed instead of added.
  pair<CharacterSet, const GrammarError *> char_set() {
    CharacterSet result;
    bool is_affirmative = true;
    if (peek() == '^') {
      next();
      is_affirmative = false;
      result.include_all();
    }

    while (has_more_input() && (peek() != ']')) {
      auto item = single_char();
      if (item.second)
        return { CharacterSet(), item.second };
      if (is_affirmative)
        result.add_set(item.first);
      else
        result.remove_set(item.first);
    }

    return { result, nullptr };
  }

  // Parses one character, one backslash escape, or an `x-y` range.
  //
  // This is shared by atom() and char_set(), so a range such as `a-z` is
  // recognized outside brackets as well.
  //
  // NOTE(review): a trailing backslash or a dangling '-' consumes whatever
  // the lookahead holds next (0 at end of input, or a closing ']').
  // Behavior preserved from the original; confirm it is intended.
  pair<CharacterSet, const GrammarError *> single_char() {
    CharacterSet value;
    switch (peek()) {
      case '\\':
        next();
        value = escaped_char(peek());
        next();
        break;
      default: {
        uint32_t first_char = peek();
        next();
        if (peek() == '-') {
          next();
          value = CharacterSet().include(first_char, peek());
          next();
        } else {
          value = CharacterSet().include(first_char);
        }
      }
    }
    return { value, nullptr };
  }

  // Maps the character following a backslash to the set it stands for.
  // Any character without a special meaning stands for itself.
  CharacterSet escaped_char(uint32_t value) {
    switch (value) {
      case 'a':  // letters
        return CharacterSet().include('a', 'z').include('A', 'Z');
      case 'w':  // letters, digits and underscore
        return CharacterSet()
          .include('a', 'z')
          .include('A', 'Z')
          .include('0', '9')
          .include('_');
      case 'd':  // digits
        return CharacterSet().include('0', '9');
      case 's':  // space, tab, newline, carriage return
        return CharacterSet().include(' ').include('\t').include('\n').include(
          '\r');
      case 't':
        return CharacterSet().include('\t');
      case 'n':
        return CharacterSet().include('\n');
      case 'r':
        return CharacterSet().include('\r');
      default:
        return CharacterSet().include(value);
    }
  }

  // Decodes the next UTF-8 code point into `lookahead` and advances `iter`.
  //
  // utf8proc_iterate returns the number of bytes consumed, or a negative
  // error code for malformed input. The result must stay signed: storing it
  // in a size_t would turn an error code into a huge offset and move `iter`
  // outside the buffer. Any non-positive result (end of input or invalid
  // bytes) is treated as end of input and leaves `iter` where it is.
  void next() {
    const auto lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
    if (lookahead_size <= 0) {
      lookahead = 0;
      return;
    }
    iter += lookahead_size;
  }

  // Returns the current code point without consuming it (0 at end of input).
  uint32_t peek() const {
    return lookahead;
  }

  // True while a non-zero code point is buffered in `lookahead`.
  bool has_more_input() const {
    return lookahead && iter <= end;
  }

  // Builds a failed result carrying a regex grammar error with `msg`.
  // The error is heap-allocated and returned as a raw pointer.
  // NOTE(review): presumably the caller takes ownership -- confirm.
  pair<rule_ptr, const GrammarError *> error(string msg) {
    return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
  }

  string input;         // owned copy of the pattern text
  const uint8_t *iter;  // first byte after the code point held in `lookahead`
  const uint8_t *end;   // one past the last byte of `input`
  int32_t lookahead;    // current code point, or 0 when no input remains
};

// Parses `input` as a regex and returns the equivalent rule, or a blank rule
// together with a GrammarError when the pattern is malformed.
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
  // NOTE(review): going through c_str() cuts the pattern at the first
  // embedded NUL byte. Behavior preserved from the original; confirm it is
  // intended.
  return PatternParser(input.c_str()).rule(false);
}

}  // namespace prepare_grammar
}  // namespace tree_sitter