Expand regex/string rules as part of grammar preparation
This makes it possible to report errors in regex parsing
This commit is contained in:
parent
5245bc01fe
commit
649f200831
26 changed files with 883 additions and 651 deletions
|
|
@ -94,20 +94,6 @@ namespace tree_sitter {
|
|||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
// Expands a literal string into a left-nested sequence of single-character
// sets (starting from a Blank rule) and then applies this visitor to that
// expanded rule.
map<T, rule_ptr> apply_to(const rules::String *rule) {
    rule_ptr sequence = make_shared<rules::Blank>();
    for (char character : rule->value) {
        auto character_rule = CharacterSet({ character }).copy();
        sequence = rules::Seq::Build({ sequence, character_rule });
    }
    return this->apply(sequence);
}
|
||||
|
||||
// Converts the pattern into its rule tree and applies this visitor to the
// resulting rule.
map<T, rule_ptr> apply_to(const rules::Pattern *rule) {
    rule_ptr expanded = rule->to_rule_tree();
    return this->apply(expanded);
}
|
||||
};
|
||||
|
||||
map<CharacterSet, rule_ptr> char_transitions(const rule_ptr &rule) {
|
||||
|
|
|
|||
|
|
@ -113,8 +113,7 @@ namespace tree_sitter {
|
|||
} else if (symbol.is_token() && symbol.is_auxiliary()) {
|
||||
return token_description(grammar_for_symbol(symbol).rule(symbol));
|
||||
} else {
|
||||
string name = grammar_for_symbol(symbol).rule_name(symbol);
|
||||
return name;
|
||||
return grammar_for_symbol(symbol).rule_name(symbol);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -48,6 +48,10 @@ namespace tree_sitter {
|
|||
// Builds an error from its category and a human-readable message.
GrammarError::GrammarError(GrammarErrorType type, std::string message)
    : type(type), message(message) {}
|
||||
|
||||
// Two errors are equal when both their type and their message match.
bool GrammarError::operator==(const GrammarError &other) const {
    const bool same_type = (type == other.type);
    return same_type && (message == other.message);
}
|
||||
|
||||
ostream& operator<<(ostream &stream, const GrammarError *error) {
|
||||
if (error)
|
||||
|
|
|
|||
68
src/compiler/prepare_grammar/expand_tokens.cc
Normal file
68
src/compiler/prepare_grammar/expand_tokens.cc
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
#include "compiler/prepare_grammar/expand_tokens.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include "compiler/prepared_grammar.h"
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/rules/pattern.h"
|
||||
#include "compiler/rules/string.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/rules/seq.h"
|
||||
#include "compiler/rules/character_set.h"
|
||||
#include "compiler/prepare_grammar/parse_regex.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::pair;
|
||||
using std::make_shared;
|
||||
using rules::rule_ptr;
|
||||
using rules::String;
|
||||
using rules::Pattern;
|
||||
|
||||
namespace prepare_grammar {
|
||||
class ExpandTokens : public rules::IdentityRuleFn {
|
||||
using rules::IdentityRuleFn::apply_to;
|
||||
|
||||
rule_ptr apply_to(const String *rule) {
|
||||
vector<rule_ptr> elements;
|
||||
for (char val : rule->value)
|
||||
elements.push_back(rules::CharacterSet({ val }).copy());
|
||||
return rules::Seq::Build(elements);
|
||||
}
|
||||
|
||||
rule_ptr apply_to(const Pattern *rule) {
|
||||
auto pair = parse_regex(rule->value);
|
||||
if (!error)
|
||||
error = pair.second;
|
||||
return pair.first;
|
||||
}
|
||||
|
||||
public:
|
||||
const GrammarError *error;
|
||||
ExpandTokens() : error(nullptr) {}
|
||||
};
|
||||
|
||||
// Runs ExpandTokens over every rule and auxiliary rule of `grammar`.
//
// Returns the expanded grammar (keeping the original options) paired with
// nullptr on success. As soon as the expander reports an error, returns a
// default-constructed PreparedGrammar paired with that error.
pair<PreparedGrammar, const GrammarError *>
expand_tokens(const PreparedGrammar &grammar) {
    ExpandTokens expander;
    vector<pair<string, rule_ptr>> expanded_rules;
    vector<pair<string, rule_ptr>> expanded_aux_rules;

    for (auto &entry : grammar.rules) {
        auto expanded = expander.apply(entry.second);
        if (expander.error)
            return { PreparedGrammar(), expander.error };
        expanded_rules.push_back({ entry.first, expanded });
    }

    for (auto &entry : grammar.aux_rules) {
        auto expanded = expander.apply(entry.second);
        if (expander.error)
            return { PreparedGrammar(), expander.error };
        expanded_aux_rules.push_back({ entry.first, expanded });
    }

    return {
        PreparedGrammar(expanded_rules, expanded_aux_rules, grammar.options),
        nullptr
    };
}
|
||||
}
|
||||
}
|
||||
16
src/compiler/prepare_grammar/expand_tokens.h
Normal file
16
src/compiler/prepare_grammar/expand_tokens.h
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
|
||||
#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
|
||||
|
||||
#include "tree_sitter/compiler.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
class PreparedGrammar;
|
||||
|
||||
namespace prepare_grammar {
|
||||
std::pair<PreparedGrammar, const GrammarError *>
|
||||
expand_tokens(const PreparedGrammar &);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
|
||||
|
||||
210
src/compiler/prepare_grammar/parse_regex.cc
Normal file
210
src/compiler/prepare_grammar/parse_regex.cc
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
#include "compiler/prepare_grammar/parse_regex.h"
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include "compiler/rules/choice.h"
|
||||
#include "compiler/rules/seq.h"
|
||||
#include "compiler/rules/repeat.h"
|
||||
#include "compiler/rules/character_set.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/util/string_helpers.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::pair;
|
||||
using std::make_shared;
|
||||
using rules::rule_ptr;
|
||||
using rules::CharacterSet;
|
||||
using rules::Seq;
|
||||
using rules::Blank;
|
||||
using rules::Choice;
|
||||
using rules::Repeat;
|
||||
using rules::CharacterRange;
|
||||
using rules::blank;
|
||||
|
||||
namespace prepare_grammar {
|
||||
class PatternParser {
|
||||
public:
|
||||
explicit PatternParser(const string &input) :
|
||||
input(input),
|
||||
length(input.length()),
|
||||
position(0) {}
|
||||
|
||||
pair<rule_ptr, const GrammarError *> rule(bool nested) {
|
||||
vector<rule_ptr> choices = {};
|
||||
do {
|
||||
if (!choices.empty()) {
|
||||
if (peek() == '|')
|
||||
next();
|
||||
else
|
||||
break;
|
||||
}
|
||||
auto pair = term(nested);
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
choices.push_back(pair.first);
|
||||
} while (has_more_input());
|
||||
auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
|
||||
return { rule, nullptr };
|
||||
}
|
||||
|
||||
private:
|
||||
pair<rule_ptr, const GrammarError *> term(bool nested) {
|
||||
rule_ptr result = blank();
|
||||
do {
|
||||
if (peek() == '|')
|
||||
break;
|
||||
if (nested && peek() == ')')
|
||||
break;
|
||||
auto pair = factor();
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
result = Seq::Build({ result, pair.first });
|
||||
} while (has_more_input());
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
pair<rule_ptr, const GrammarError *> factor() {
|
||||
auto pair = atom();
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
rule_ptr result = pair.first;
|
||||
if (has_more_input()) {
|
||||
switch (peek()) {
|
||||
case '*':
|
||||
next();
|
||||
result = make_shared<Repeat>(result);
|
||||
break;
|
||||
case '+':
|
||||
next();
|
||||
result = make_shared<Seq>(result, make_shared<Repeat>(result));
|
||||
break;
|
||||
case '?':
|
||||
next();
|
||||
result = Choice::Build({ result, make_shared<Blank>() });
|
||||
break;
|
||||
}
|
||||
}
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
pair<rule_ptr, const GrammarError *> atom() {
|
||||
switch (peek()) {
|
||||
case '(': {
|
||||
next();
|
||||
auto pair = rule(true);
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
if (peek() != ')')
|
||||
return error("unmatched open paren");
|
||||
next();
|
||||
return { pair.first, nullptr };
|
||||
}
|
||||
case '[': {
|
||||
next();
|
||||
auto pair = char_set();
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
if (peek() != ']')
|
||||
return error("unmatched open square bracket");
|
||||
next();
|
||||
return { pair.first.copy(), nullptr };
|
||||
}
|
||||
case ')': {
|
||||
return error("unmatched close paren");
|
||||
}
|
||||
case ']': {
|
||||
return error("unmatched close square bracket");
|
||||
}
|
||||
case '.': {
|
||||
next();
|
||||
return { CharacterSet({ '\n' }).complement().copy(), nullptr };
|
||||
}
|
||||
default: {
|
||||
auto pair = single_char();
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
return { pair.first.copy(), nullptr };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pair<CharacterSet, const GrammarError *> char_set() {
|
||||
bool is_affirmative = true;
|
||||
if (peek() == '^') {
|
||||
next();
|
||||
is_affirmative = false;
|
||||
}
|
||||
CharacterSet result;
|
||||
while (has_more_input() && (peek() != ']')) {
|
||||
auto pair = single_char();
|
||||
if (pair.second)
|
||||
return { CharacterSet(), pair.second };
|
||||
result.add_set(pair.first);
|
||||
}
|
||||
if (!is_affirmative)
|
||||
result = result.complement();
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
pair<CharacterSet, const GrammarError *> single_char() {
|
||||
CharacterSet value;
|
||||
switch (peek()) {
|
||||
case '\\':
|
||||
next();
|
||||
value = escaped_char(peek());
|
||||
next();
|
||||
break;
|
||||
default:
|
||||
char first_char = peek();
|
||||
next();
|
||||
if (peek() == '-') {
|
||||
next();
|
||||
value = CharacterSet({ CharacterRange(first_char, peek()) });
|
||||
next();
|
||||
} else {
|
||||
value = CharacterSet({ first_char });
|
||||
}
|
||||
}
|
||||
return { value, nullptr };
|
||||
}
|
||||
|
||||
CharacterSet escaped_char(char value) {
|
||||
switch (value) {
|
||||
case 'a':
|
||||
return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
|
||||
case 'w':
|
||||
return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
|
||||
case 'd':
|
||||
return CharacterSet({ {'0', '9'} });
|
||||
default:
|
||||
return CharacterSet({ value });
|
||||
}
|
||||
}
|
||||
|
||||
void next() {
|
||||
position++;
|
||||
}
|
||||
|
||||
char peek() {
|
||||
return input[position];
|
||||
}
|
||||
|
||||
bool has_more_input() {
|
||||
return position < length;
|
||||
}
|
||||
|
||||
pair<rule_ptr, const GrammarError *> error(string msg) {
|
||||
return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
|
||||
}
|
||||
|
||||
const string input;
|
||||
const size_t length;
|
||||
size_t position;
|
||||
};
|
||||
|
||||
// Parses `input` as a regex. Returns the resulting rule tree paired with
// nullptr, or a blank rule paired with an error when parsing fails.
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
    PatternParser parser(input);
    // `false`: the top-level rule is not nested inside parentheses.
    return parser.rule(false);
}
|
||||
}
|
||||
}
|
||||
16
src/compiler/prepare_grammar/parse_regex.h
Normal file
16
src/compiler/prepare_grammar/parse_regex.h
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
|
||||
#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
|
||||
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace prepare_grammar {
|
||||
// Parses a regex string into a rule tree. The result pairs the rule with an
// error pointer; the pointer is null when parsing succeeded.
std::pair<rules::rule_ptr, const GrammarError *> parse_regex(
    const std::string &);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
|
||||
|
|
@ -2,8 +2,11 @@
|
|||
#include "compiler/prepared_grammar.h"
|
||||
#include "compiler/prepare_grammar/extract_tokens.h"
|
||||
#include "compiler/prepare_grammar/expand_repeats.h"
|
||||
#include "compiler/prepare_grammar/expand_tokens.h"
|
||||
#include "compiler/prepare_grammar/intern_symbols.h"
|
||||
|
||||
#include "stream_methods.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using std::tuple;
|
||||
using std::make_tuple;
|
||||
|
|
@ -16,12 +19,17 @@ namespace tree_sitter {
|
|||
const GrammarError *error = result.second;
|
||||
|
||||
if (error)
|
||||
return make_tuple(PreparedGrammar({}, {}), PreparedGrammar({}, {}), error);
|
||||
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
|
||||
|
||||
auto grammars = extract_tokens(grammar);
|
||||
const PreparedGrammar &rule_grammar = expand_repeats(grammars.first);
|
||||
const PreparedGrammar &lex_grammar = grammars.second;
|
||||
|
||||
auto expand_tokens_result = expand_tokens(grammars.second);
|
||||
const PreparedGrammar &lex_grammar = expand_tokens_result.first;
|
||||
error = expand_tokens_result.second;
|
||||
|
||||
if (error)
|
||||
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
|
||||
|
||||
return make_tuple(rule_grammar, lex_grammar, nullptr);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ namespace tree_sitter {
|
|||
using std::ostream;
|
||||
using rules::rule_ptr;
|
||||
using rules::Symbol;
|
||||
|
||||
// Default constructor: initializes the Grammar base, aux_rules and options
// from empty braced lists.
PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {}
|
||||
|
||||
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules) :
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ namespace tree_sitter {
|
|||
|
||||
class PreparedGrammar : public Grammar {
|
||||
public:
|
||||
PreparedGrammar();
|
||||
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
|
||||
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ namespace tree_sitter {
|
|||
START_TOKEN,
|
||||
PRECEDENCE,
|
||||
IS_TOKEN,
|
||||
DESCRIPTION,
|
||||
} MetadataKey;
|
||||
|
||||
class Metadata : public Rule {
|
||||
|
|
|
|||
|
|
@ -1,173 +1,12 @@
|
|||
#include "compiler/rules/pattern.h"
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/rules/choice.h"
|
||||
#include "compiler/rules/seq.h"
|
||||
#include "compiler/rules/repeat.h"
|
||||
#include "compiler/rules/character_set.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/util/string_helpers.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace rules {
|
||||
using std::string;
|
||||
using std::hash;
|
||||
using std::make_shared;
|
||||
using std::set;
|
||||
using std::vector;
|
||||
|
||||
class PatternParser {
|
||||
public:
|
||||
explicit PatternParser(const string &input) :
|
||||
input(input),
|
||||
length(input.length()),
|
||||
position(0) {}
|
||||
|
||||
rule_ptr rule() {
|
||||
vector<rule_ptr> choices = { term() };
|
||||
while (has_more_input() && peek() == '|') {
|
||||
next();
|
||||
choices.push_back(term());
|
||||
}
|
||||
return (choices.size() > 1) ? Choice::Build(choices) : choices.front();
|
||||
}
|
||||
|
||||
private:
|
||||
rule_ptr term() {
|
||||
rule_ptr result = factor();
|
||||
while (has_more_input() && (peek() != '|') && (peek() != ')'))
|
||||
result = Seq::Build({ result, factor() });
|
||||
return result;
|
||||
}
|
||||
|
||||
rule_ptr factor() {
|
||||
rule_ptr result = atom();
|
||||
if (has_more_input()) {
|
||||
switch (peek()) {
|
||||
case '*':
|
||||
next();
|
||||
result = make_shared<Repeat>(result);
|
||||
break;
|
||||
case '+':
|
||||
next();
|
||||
result = make_shared<Seq>(result, make_shared<Repeat>(result));
|
||||
break;
|
||||
case '?':
|
||||
next();
|
||||
result = Choice::Build({ result, make_shared<Blank>() });
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
rule_ptr atom() {
|
||||
rule_ptr result;
|
||||
switch (peek()) {
|
||||
case '(':
|
||||
next();
|
||||
result = rule();
|
||||
if (has_error()) return result;
|
||||
if (peek() != ')') {
|
||||
error = "mismatched parens";
|
||||
return result;
|
||||
}
|
||||
next();
|
||||
break;
|
||||
case '[':
|
||||
next();
|
||||
result = char_set().copy();
|
||||
if (has_error()) return result;
|
||||
if (peek() != ']') {
|
||||
error = "mismatched square brackets";
|
||||
return result;
|
||||
}
|
||||
next();
|
||||
break;
|
||||
case ')':
|
||||
error = "mismatched parens";
|
||||
break;
|
||||
case '.':
|
||||
result = CharacterSet({ '\n' }).complement().copy();
|
||||
next();
|
||||
break;
|
||||
default:
|
||||
result = single_char().copy();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
CharacterSet char_set() {
|
||||
bool is_affirmative = true;
|
||||
if (peek() == '^') {
|
||||
next();
|
||||
is_affirmative = false;
|
||||
}
|
||||
CharacterSet result;
|
||||
while (has_more_input() && (peek() != ']'))
|
||||
result.add_set(single_char());
|
||||
return is_affirmative ? result : result.complement();
|
||||
}
|
||||
|
||||
CharacterSet single_char() {
|
||||
CharacterSet value;
|
||||
switch (peek()) {
|
||||
case '\\':
|
||||
next();
|
||||
value = escaped_char(peek());
|
||||
if (has_error()) return value;
|
||||
next();
|
||||
break;
|
||||
default:
|
||||
char first_char = peek();
|
||||
next();
|
||||
if (peek() == '-') {
|
||||
next();
|
||||
value = CharacterSet({ CharacterRange(first_char, peek()) });
|
||||
next();
|
||||
} else {
|
||||
value = CharacterSet({ first_char });
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
CharacterSet escaped_char(char value) {
|
||||
switch (value) {
|
||||
case 'a':
|
||||
return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
|
||||
case 'w':
|
||||
return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
|
||||
case 'd':
|
||||
return CharacterSet({ {'0', '9'} });
|
||||
default:
|
||||
return CharacterSet({ value });
|
||||
}
|
||||
}
|
||||
|
||||
void next() {
|
||||
position++;
|
||||
}
|
||||
|
||||
char peek() {
|
||||
return input[position];
|
||||
}
|
||||
|
||||
bool has_more_input() {
|
||||
return position < length;
|
||||
}
|
||||
|
||||
bool has_error() {
|
||||
return error != "";
|
||||
}
|
||||
|
||||
string error;
|
||||
const string input;
|
||||
const size_t length;
|
||||
size_t position;
|
||||
};
|
||||
|
||||
// Stores the given pattern text in `value`.
Pattern::Pattern(const string &string) : value(string) {}
|
||||
|
||||
|
|
@ -191,9 +30,5 @@ namespace tree_sitter {
|
|||
// Visitor entry point: passes this Pattern to the visitor's visit().
void Pattern::accept(Visitor *visitor) const {
    visitor->visit(this);
}
|
||||
|
||||
// Parses this pattern's text with PatternParser and returns the resulting
// rule tree.
rule_ptr Pattern::to_rule_tree() const {
    PatternParser parser(value);
    return parser.rule();
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@ namespace tree_sitter {
|
|||
void accept(Visitor *visitor) const;
|
||||
|
||||
const std::string value;
|
||||
rule_ptr to_rule_tree() const;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue