Expand regex/string rules as part of grammar preparation

This makes it possible to report errors encountered while parsing regexes.
This commit is contained in:
Max Brunsfeld 2014-05-19 20:54:59 -07:00
parent 5245bc01fe
commit 649f200831
26 changed files with 883 additions and 651 deletions

View file

@ -94,20 +94,6 @@ namespace tree_sitter {
});
return result;
}
// Rewrites a string literal as a sequence of single-character sets and
// hands that sequence to apply().
map<T, rule_ptr> apply_to(const rules::String *rule) {
  // Seed with a blank rule; for an empty string this blank is what gets
  // passed to apply().
  rule_ptr sequence = make_shared<rules::Blank>();
  for (char character : rule->value) {
    sequence = rules::Seq::Build({ sequence, CharacterSet({ character }).copy() });
  }
  return this->apply(sequence);
}
// Converts the pattern to its rule tree and hands that tree to apply().
map<T, rule_ptr> apply_to(const rules::Pattern *rule) {
  rule_ptr tree = rule->to_rule_tree();
  return this->apply(tree);
}
};
map<CharacterSet, rule_ptr> char_transitions(const rule_ptr &rule) {

View file

@ -113,8 +113,7 @@ namespace tree_sitter {
} else if (symbol.is_token() && symbol.is_auxiliary()) {
return token_description(grammar_for_symbol(symbol).rule(symbol));
} else {
string name = grammar_for_symbol(symbol).rule_name(symbol);
return name;
return grammar_for_symbol(symbol).rule_name(symbol);
}
}

View file

@ -48,6 +48,10 @@ namespace tree_sitter {
// Constructs an error of the given type that carries a copy of `message`.
GrammarError::GrammarError(GrammarErrorType type, std::string message) :
type(type),
message(message) {}
// Two errors compare equal when both their type and their message match.
bool GrammarError::operator==(const GrammarError &other) const {
  if (type != other.type)
    return false;
  return message == other.message;
}
ostream& operator<<(ostream &stream, const GrammarError *error) {
if (error)

View file

@ -0,0 +1,68 @@
#include "compiler/prepare_grammar/expand_tokens.h"
#include <vector>
#include <string>
#include <utility>
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/pattern.h"
#include "compiler/rules/string.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/character_set.h"
#include "compiler/prepare_grammar/parse_regex.h"
namespace tree_sitter {
using std::string;
using std::vector;
using std::pair;
using std::make_shared;
using rules::rule_ptr;
using rules::String;
using rules::Pattern;
namespace prepare_grammar {
// Rule transformer that replaces String rules with sequences of
// single-character sets and Pattern rules with the rule tree produced by
// parse_regex(). The using-declaration keeps the IdentityRuleFn overloads of
// apply_to visible alongside the two overloads defined here.
class ExpandTokens : public rules::IdentityRuleFn {
  using rules::IdentityRuleFn::apply_to;

  // Builds one single-character set per character of the string and joins
  // them with Seq::Build.
  rule_ptr apply_to(const String *rule) {
    vector<rule_ptr> char_rules;
    for (char character : rule->value) {
      char_rules.push_back(rules::CharacterSet({ character }).copy());
    }
    return rules::Seq::Build(char_rules);
  }

  // Parses the pattern text. Only the first error seen by this expander is
  // remembered; the parsed rule is returned either way.
  rule_ptr apply_to(const Pattern *rule) {
    auto parsed = parse_regex(rule->value);
    if (error == nullptr)
      error = parsed.second;
    return parsed.first;
  }

 public:
  // First regex error encountered so far, or nullptr if there was none.
  const GrammarError *error;

  ExpandTokens() : error(nullptr) {}
};
// Runs ExpandTokens over every rule and auxiliary rule of `grammar`.
//
// Returns the rebuilt grammar paired with nullptr on success. As soon as the
// expander records an error, returns a default-constructed PreparedGrammar
// paired with that error.
pair<PreparedGrammar, const GrammarError *>
expand_tokens(const PreparedGrammar &grammar) {
  ExpandTokens expander;
  vector<pair<string, rule_ptr>> expanded_rules;
  vector<pair<string, rule_ptr>> expanded_aux_rules;

  for (auto &entry : grammar.rules) {
    auto expanded = expander.apply(entry.second);
    if (expander.error != nullptr)
      return { PreparedGrammar(), expander.error };
    expanded_rules.push_back({ entry.first, expanded });
  }

  for (auto &entry : grammar.aux_rules) {
    auto expanded = expander.apply(entry.second);
    if (expander.error != nullptr)
      return { PreparedGrammar(), expander.error };
    expanded_aux_rules.push_back({ entry.first, expanded });
  }

  return { PreparedGrammar(expanded_rules, expanded_aux_rules, grammar.options), nullptr };
}
}
}

View file

@ -0,0 +1,16 @@
#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
#include "tree_sitter/compiler.h"
namespace tree_sitter {
class PreparedGrammar;
namespace prepare_grammar {
// Expands the token rules of the given grammar.
// Returns the resulting grammar together with an error pointer; the pointer
// is null when expansion succeeded.
std::pair<PreparedGrammar, const GrammarError *>
expand_tokens(const PreparedGrammar &);
}
}
#endif // COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_

View file

@ -0,0 +1,210 @@
#include "compiler/prepare_grammar/parse_regex.h"
#include <string>
#include <utility>
#include "compiler/rules/choice.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/blank.h"
#include "compiler/util/string_helpers.h"
namespace tree_sitter {
using std::string;
using std::vector;
using std::pair;
using std::make_shared;
using rules::rule_ptr;
using rules::CharacterSet;
using rules::Seq;
using rules::Blank;
using rules::Choice;
using rules::Repeat;
using rules::CharacterRange;
using rules::blank;
namespace prepare_grammar {
// Recursive-descent parser that turns a regex pattern string into a rule tree.
//
// Shape of the grammar implemented by the methods below:
//   rule   := term ('|' term)*
//   term   := factor*
//   factor := atom ('*' | '+' | '?')?
//   atom   := '(' rule ')' | '[' char_set ']' | '.' | single_char
//
// Every parsing method returns a (value, error) pair. The error is nullptr on
// success. On failure it points to a GrammarError allocated with `new` in
// error(); this class never frees it.
class PatternParser {
 public:
  explicit PatternParser(const string &input) :
    input(input),
    length(input.length()),
    position(0) {}

  // Parses one or more terms separated by '|'.
  // `nested` is true while parsing inside parentheses; it is forwarded to
  // term() so that a ')' ends the current term.
  // Returns a Choice when more than one alternative was parsed, otherwise the
  // single term itself.
  pair<rule_ptr, const GrammarError *> rule(bool nested) {
    vector<rule_ptr> choices = {};
    do {
      // After the first alternative, continue only across a '|'.
      if (!choices.empty()) {
        if (peek() == '|')
          next();
        else
          break;
      }
      auto parsed = term(nested);
      if (parsed.second)
        return { blank(), parsed.second };
      choices.push_back(parsed.first);
    } while (has_more_input());
    auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
    return { rule, nullptr };
  }

 private:
  // Parses a run of factors and concatenates them with Seq::Build, starting
  // from a blank rule. Stops at '|', at ')' when nested, or at end of input.
  //
  // The loop tests has_more_input() before parsing each factor. This way an
  // empty term (empty pattern, trailing '|', or '(' at the very end) yields a
  // blank rule instead of trying to parse a character beyond the input.
  pair<rule_ptr, const GrammarError *> term(bool nested) {
    rule_ptr result = blank();
    while (has_more_input()) {
      if (peek() == '|')
        break;
      if (nested && peek() == ')')
        break;
      auto parsed = factor();
      if (parsed.second)
        return { blank(), parsed.second };
      result = Seq::Build({ result, parsed.first });
    }
    return { result, nullptr };
  }

  // Parses an atom followed by an optional '*', '+' or '?' suffix.
  pair<rule_ptr, const GrammarError *> factor() {
    auto parsed = atom();
    if (parsed.second)
      return { blank(), parsed.second };
    rule_ptr result = parsed.first;
    if (has_more_input()) {
      switch (peek()) {
        case '*':
          next();
          result = make_shared<Repeat>(result);
          break;
        case '+':
          // One occurrence followed by a repeat of the same rule.
          next();
          result = make_shared<Seq>(result, make_shared<Repeat>(result));
          break;
        case '?':
          // Either the rule or nothing.
          next();
          result = Choice::Build({ result, make_shared<Blank>() });
          break;
      }
    }
    return { result, nullptr };
  }

  // Parses a parenthesized group, a bracketed character set, '.', or a single
  // character. Reports unmatched parentheses and square brackets as errors.
  pair<rule_ptr, const GrammarError *> atom() {
    switch (peek()) {
      case '(': {
        next();
        auto parsed = rule(true);
        if (parsed.second)
          return { blank(), parsed.second };
        if (peek() != ')')
          return error("unmatched open paren");
        next();
        return { parsed.first, nullptr };
      }
      case '[': {
        next();
        auto parsed = char_set();
        if (parsed.second)
          return { blank(), parsed.second };
        if (peek() != ']')
          return error("unmatched open square bracket");
        next();
        return { parsed.first.copy(), nullptr };
      }
      case ')': {
        return error("unmatched close paren");
      }
      case ']': {
        return error("unmatched close square bracket");
      }
      case '.': {
        // '.' is the complement of the set containing only '\n'.
        next();
        return { CharacterSet({ '\n' }).complement().copy(), nullptr };
      }
      default: {
        auto parsed = single_char();
        if (parsed.second)
          return { blank(), parsed.second };
        return { parsed.first.copy(), nullptr };
      }
    }
  }

  // Parses the contents of a bracketed set, up to but not including ']'.
  // A leading '^' complements the resulting set.
  pair<CharacterSet, const GrammarError *> char_set() {
    bool is_affirmative = true;
    if (peek() == '^') {
      next();
      is_affirmative = false;
    }
    CharacterSet result;
    while (has_more_input() && (peek() != ']')) {
      auto parsed = single_char();
      if (parsed.second)
        return { CharacterSet(), parsed.second };
      result.add_set(parsed.first);
    }
    if (!is_affirmative)
      result = result.complement();
    return { result, nullptr };
  }

  // Parses a backslash escape, a range such as "a-z", or one literal
  // character. Currently never produces an error.
  pair<CharacterSet, const GrammarError *> single_char() {
    CharacterSet value;
    switch (peek()) {
      case '\\':
        next();
        value = escaped_char(peek());
        next();
        break;
      default: {
        char first_char = peek();
        next();
        if (peek() == '-') {
          next();
          value = CharacterSet({ CharacterRange(first_char, peek()) });
          next();
        } else {
          value = CharacterSet({ first_char });
        }
      }
    }
    return { value, nullptr };
  }

  // Maps the character following a backslash to a character set:
  // 'a' -> letters, 'w' -> letters and digits, 'd' -> digits,
  // anything else -> that character itself.
  CharacterSet escaped_char(char value) {
    switch (value) {
      case 'a':
        return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
      case 'w':
        return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
      case 'd':
        return CharacterSet({ {'0', '9'} });
      default:
        return CharacterSet({ value });
    }
  }

  // Advances past the current character.
  void next() {
    position++;
  }

  // Returns the current character, or '\0' once the input is exhausted.
  // `position` can run past `length` (e.g. after a trailing escape or range),
  // so the index must be checked before touching the string.
  char peek() {
    return has_more_input() ? input[position] : '\0';
  }

  bool has_more_input() {
    return position < length;
  }

  // Builds a failed result carrying a newly allocated regex GrammarError.
  pair<rule_ptr, const GrammarError *> error(string msg) {
    return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
  }

  const string input;
  const size_t length;
  size_t position;
};
// Parses `input` as a top-level (non-nested) regex pattern and returns the
// resulting rule together with an error pointer (nullptr on success).
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
  PatternParser parser(input);
  return parser.rule(false);
}
}
}

View file

@ -0,0 +1,16 @@
#ifndef COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
#define COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_
#include "tree_sitter/compiler.h"
#include <string>
#include <utility>
namespace tree_sitter {
namespace prepare_grammar {
// Parses a regex pattern string into a rule.
// Returns the rule together with an error pointer; the pointer is null when
// parsing succeeded.
std::pair<rules::rule_ptr, const GrammarError *>
parse_regex(const std::string &);
}
}
#endif // COMPILER_PREPARE_GRAMMAR_PARSE_REGEX_H_

View file

@ -2,8 +2,11 @@
#include "compiler/prepared_grammar.h"
#include "compiler/prepare_grammar/extract_tokens.h"
#include "compiler/prepare_grammar/expand_repeats.h"
#include "compiler/prepare_grammar/expand_tokens.h"
#include "compiler/prepare_grammar/intern_symbols.h"
#include "stream_methods.h"
namespace tree_sitter {
using std::tuple;
using std::make_tuple;
@ -16,12 +19,17 @@ namespace tree_sitter {
const GrammarError *error = result.second;
if (error)
return make_tuple(PreparedGrammar({}, {}), PreparedGrammar({}, {}), error);
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
auto grammars = extract_tokens(grammar);
const PreparedGrammar &rule_grammar = expand_repeats(grammars.first);
const PreparedGrammar &lex_grammar = grammars.second;
auto expand_tokens_result = expand_tokens(grammars.second);
const PreparedGrammar &lex_grammar = expand_tokens_result.first;
error = expand_tokens_result.second;
if (error)
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
return make_tuple(rule_grammar, lex_grammar, nullptr);
}
}

View file

@ -10,6 +10,8 @@ namespace tree_sitter {
using std::ostream;
using rules::rule_ptr;
using rules::Symbol;
// Default constructor: initializes the Grammar base, aux_rules and options
// from empty initializer lists.
PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {}
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules) :

View file

@ -14,6 +14,7 @@ namespace tree_sitter {
class PreparedGrammar : public Grammar {
public:
PreparedGrammar();
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
const std::vector<std::pair<std::string, rules::rule_ptr>> &aux_rules);
PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,

View file

@ -11,6 +11,7 @@ namespace tree_sitter {
START_TOKEN,
PRECEDENCE,
IS_TOKEN,
DESCRIPTION,
} MetadataKey;
class Metadata : public Rule {

View file

@ -1,173 +1,12 @@
#include "compiler/rules/pattern.h"
#include <set>
#include <string>
#include <vector>
#include "compiler/rules/visitor.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/character_set.h"
#include "compiler/rules/blank.h"
#include "compiler/util/string_helpers.h"
namespace tree_sitter {
namespace rules {
using std::string;
using std::hash;
using std::make_shared;
using std::set;
using std::vector;
// Recursive-descent parser that converts a regex pattern string into a rule
// tree. Failures are recorded in the private `error` string; nothing in this
// class exposes that string, so rule() callers only ever see the returned
// rule_ptr.
class PatternParser {
public:
explicit PatternParser(const string &input) :
input(input),
length(input.length()),
position(0) {}
// Parses one or more terms separated by '|'. Returns Choice::Build of the
// terms when there is more than one, otherwise the single term.
rule_ptr rule() {
vector<rule_ptr> choices = { term() };
while (has_more_input() && peek() == '|') {
next();
choices.push_back(term());
}
return (choices.size() > 1) ? Choice::Build(choices) : choices.front();
}
private:
// Parses at least one factor, then keeps concatenating factors until a '|',
// a ')' or the end of the input is reached.
rule_ptr term() {
rule_ptr result = factor();
while (has_more_input() && (peek() != '|') && (peek() != ')'))
result = Seq::Build({ result, factor() });
return result;
}
// Parses an atom followed by an optional '*', '+' or '?' suffix.
rule_ptr factor() {
rule_ptr result = atom();
if (has_more_input()) {
switch (peek()) {
case '*':
next();
result = make_shared<Repeat>(result);
break;
case '+':
// One occurrence followed by a repeat of the same rule.
next();
result = make_shared<Seq>(result, make_shared<Repeat>(result));
break;
case '?':
// Either the rule or nothing.
next();
result = Choice::Build({ result, make_shared<Blank>() });
break;
}
}
return result;
}
// Parses a parenthesized group, a bracketed character set, '.', or a single
// character. On mismatched delimiters it sets `error` and returns whatever
// `result` holds at that point.
rule_ptr atom() {
rule_ptr result;
switch (peek()) {
case '(':
next();
result = rule();
if (has_error()) return result;
if (peek() != ')') {
error = "mismatched parens";
return result;
}
next();
break;
case '[':
next();
result = char_set().copy();
if (has_error()) return result;
if (peek() != ']') {
error = "mismatched square brackets";
return result;
}
next();
break;
case ')':
// NOTE(review): `result` is still default-constructed here and the ')' is
// not consumed.
error = "mismatched parens";
break;
case '.':
// '.' is the complement of the set containing only '\n'.
result = CharacterSet({ '\n' }).complement().copy();
next();
break;
default:
result = single_char().copy();
}
return result;
}
// Parses the contents of a bracketed set, up to but not including ']'.
// A leading '^' complements the resulting set.
CharacterSet char_set() {
bool is_affirmative = true;
if (peek() == '^') {
next();
is_affirmative = false;
}
CharacterSet result;
while (has_more_input() && (peek() != ']'))
result.add_set(single_char());
return is_affirmative ? result : result.complement();
}
// Parses a backslash escape, a range such as "a-z", or one literal character.
CharacterSet single_char() {
CharacterSet value;
switch (peek()) {
case '\\':
next();
value = escaped_char(peek());
if (has_error()) return value;
next();
break;
default:
char first_char = peek();
next();
if (peek() == '-') {
next();
value = CharacterSet({ CharacterRange(first_char, peek()) });
next();
} else {
value = CharacterSet({ first_char });
}
}
return value;
}
// Maps the character following a backslash to a character set:
// 'a' -> letters, 'w' -> letters and digits, 'd' -> digits,
// anything else -> that character itself.
CharacterSet escaped_char(char value) {
switch (value) {
case 'a':
return CharacterSet({ {'a', 'z'}, {'A', 'Z'} });
case 'w':
return CharacterSet({ {'a', 'z'}, {'A', 'Z'}, {'0', '9'}});
case 'd':
return CharacterSet({ {'0', '9'} });
default:
return CharacterSet({ value });
}
}
// Advances past the current character.
void next() {
position++;
}
// Returns the character at the current position.
// NOTE(review): there is no bounds check here, and several callers invoke
// peek() without first testing has_more_input().
char peek() {
return input[position];
}
bool has_more_input() {
return position < length;
}
// True once any parsing step has stored a message in `error`.
bool has_error() {
return error != "";
}
string error;
const string input;
const size_t length;
size_t position;
};
// Constructs a pattern rule that stores the given pattern text in `value`.
Pattern::Pattern(const string &string) : value(string) {}
@ -191,9 +30,5 @@ namespace tree_sitter {
// Dispatches to the visitor's visit() overload for this rule.
void Pattern::accept(Visitor *visitor) const {
visitor->visit(this);
}
// Parses this pattern's text with PatternParser and returns the resulting rule.
rule_ptr Pattern::to_rule_tree() const {
  PatternParser parser(value);
  return parser.rule();
}
}
}

View file

@ -17,7 +17,6 @@ namespace tree_sitter {
void accept(Visitor *visitor) const;
const std::string value;
rule_ptr to_rule_tree() const;
};
}
}