Handle string and regex rules with non-ASCII characters

This commit is contained in:
Max Brunsfeld 2014-09-28 18:21:22 -07:00
parent e0185f84fc
commit cb5ecbd491
5 changed files with 114 additions and 52 deletions

View file

@ -13,6 +13,7 @@ cxx_flags = [
'-I', 'spec',
'-I', 'include',
'-I', 'externals/bandit',
'-I', 'externals/utf8proc',
'-isystem', '/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/c++/v1',
]

View file

@ -7,6 +7,7 @@
'include_dirs': [
'include',
'src',
'externals/utf8proc',
],
'sources': [
'src/compiler/build_tables/build_lex_table.cc',
@ -54,6 +55,7 @@
'src/compiler/rules/symbol.cc',
'src/compiler/rules/visitor.cc',
'src/compiler/util/string_helpers.cc',
'externals/utf8proc/utf8proc.c',
],
'cflags_cc': [
'-std=c++0x',

View file

@ -9,55 +9,88 @@ using namespace rules;
using prepare_grammar::expand_tokens;
describe("expand_tokens", []() {
it("replaces regex patterns with their expansion", [&]() {
LexicalGrammar grammar({
{ "rule_A", seq({
i_sym(10),
pattern("x*"),
i_sym(11) }) },
}, {});
describe("string rules", [&]() {
it("replaces strings with sequences of character sets", [&]() {
LexicalGrammar grammar({
{ "rule_A", seq({
i_sym(10),
str("xyz"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11) }) },
})));
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
i_sym(11) }) },
})));
});
it("handles strings containing non-ASCII UTF8 characters", [&]() {
  LexicalGrammar grammar({
    // α β
    { "rule_A", str("\u03B1 \u03B2") },
  }, {});

  auto result = expand_tokens(grammar);

  // A well-formed string must expand without reporting an error; checking
  // this first (as the ASCII string test does) makes a decoding failure show
  // up as an error mismatch rather than only as a rule mismatch.
  AssertThat(result.second, Equals((const GrammarError *)nullptr));

  // Each multi-byte UTF-8 sequence must become exactly one character set
  // holding the decoded code point, not one set per byte.
  AssertThat(result.first.rules, Equals(rule_list({
    { "rule_A", seq({
      character({ 945 }),  // U+03B1
      character({ ' ' }),
      character({ 946 }) }) }  // U+03B2
  })));
});
});
it("replaces string rules with a sequence of characters", [&]() {
LexicalGrammar grammar({
{ "rule_A", seq({
i_sym(10),
str("xyz"),
i_sym(11) }) },
}, {});
describe("regexp rules", [&]() {
it("replaces regexps with the equivalent rule tree", [&]() {
LexicalGrammar grammar({
{ "rule_A", seq({
i_sym(10),
pattern("x*"),
i_sym(11) }) },
}, {});
auto result = expand_tokens(grammar);
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
i_sym(11) }) },
})));
});
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", seq({
i_sym(10),
repeat(character({ 'x' })),
i_sym(11) }) },
})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
LexicalGrammar grammar({
{ "rule_A", seq({
pattern("("),
str("xyz"),
pattern("[") }) },
}, {});
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
LexicalGrammar grammar({
// [^α-δ]
{ "rule_A", pattern("[^\u03B1-\u03B4]*") },
}, {});
auto result = expand_tokens(grammar);
auto result = expand_tokens(grammar);
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
AssertThat(result.first.rules, Equals(rule_list({
{ "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
})));
});
it("returns an error when the grammar contains an invalid regex", [&]() {
  // The first pattern has an unmatched "(", which is the error asserted below.
  LexicalGrammar invalid_grammar({
    { "rule_A", seq({
      pattern("("),
      str("xyz"),
      pattern("[") }) },
  }, {});

  auto expansion = expand_tokens(invalid_grammar);
  const GrammarError *reported = expansion.second;

  AssertThat(reported, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
});
});
});

View file

@ -10,6 +10,7 @@
#include "compiler/rules/seq.h"
#include "compiler/rules/character_set.h"
#include "compiler/prepare_grammar/parse_regex.h"
#include "utf8proc.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -27,8 +28,19 @@ class ExpandTokens : public rules::IdentityRuleFn {
rule_ptr apply_to(const String *rule) {
vector<rule_ptr> elements;
for (char val : rule->value)
elements.push_back(rules::CharacterSet().include(val).copy());
// Walk the string's UTF-8 bytes one code point at a time so that every
// character, including multi-byte ones, yields a single CharacterSet element.
const uint8_t *iter = reinterpret_cast<const uint8_t *>(rule->value.data());
const uint8_t *end = iter + rule->value.size();
while (iter < end) {
  int32_t el;
  // utf8proc_iterate returns the number of bytes consumed, or a negative
  // error code for malformed UTF-8. The result must stay signed: stored in a
  // size_t, an error code would become a huge positive value, slip past the
  // check below and push `iter` far outside the buffer.
  const auto size = utf8proc_iterate(iter, (end - iter), &el);
  // Stop at the first byte sequence that cannot be decoded; the elements
  // collected so far are still returned.
  if (size <= 0)
    break;
  iter += size;
  elements.push_back(rules::CharacterSet().include(el).copy());
}
return rules::Seq::Build(elements);
}

View file

@ -8,6 +8,7 @@
#include "compiler/rules/character_set.h"
#include "compiler/rules/blank.h"
#include "compiler/util/string_helpers.h"
#include "utf8proc.h"
namespace tree_sitter {
namespace prepare_grammar {
@ -27,7 +28,10 @@ using rules::blank;
class PatternParser {
public:
explicit PatternParser(const string &input)
: input(input), length(input.length()), position(0) {}
// Inside the mem-initializers a bare `input` names the constructor
// parameter, not the member. The cursor must point into the parser's own
// copy (`this->input`), otherwise `iter`/`end` reference the caller's string
// (possibly a temporary) and dangle as soon as that string is destroyed.
// `input` is declared before `iter` and `end`, so it is already initialized
// when they are computed.
: input(input),
  iter(reinterpret_cast<const uint8_t *>(this->input.data())),
  end(iter + this->input.size())
{ next(); }  // prime `lookahead` with the first code point
pair<rule_ptr, const GrammarError *> rule(bool nested) {
vector<rule_ptr> choices = {};
@ -156,7 +160,7 @@ class PatternParser {
next();
break;
default:
char first_char = peek();
uint32_t first_char = peek();
next();
if (peek() == '-') {
next();
@ -169,7 +173,7 @@ class PatternParser {
return { value, nullptr };
}
CharacterSet escaped_char(char value) {
CharacterSet escaped_char(uint32_t value) {
switch (value) {
case 'a':
return CharacterSet().include('a', 'z').include('A', 'Z');
@ -195,23 +199,33 @@ class PatternParser {
}
}
void next() { position++; }
void next() {
size_t lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
if (!lookahead_size)
lookahead = 0;
iter += lookahead_size;
}
char peek() { return input[position]; }
// Returns the code point currently held in `lookahead` without consuming it.
uint32_t peek() {
  const uint32_t current = lookahead;
  return current;
}
bool has_more_input() { return position < length; }
// Reports whether a decoded code point is still waiting in `lookahead`.
// next() stores 0 there once utf8proc_iterate yields no further bytes, so a
// zero lookahead means the pattern is exhausted.
bool has_more_input() {
  if (!lookahead)
    return false;
  return iter <= end;
}
// Builds a failed parse result: a blank rule paired with a newly allocated
// regex GrammarError carrying `msg`.
pair<rule_ptr, const GrammarError *> error(string msg) {
  rule_ptr placeholder = blank();
  return { placeholder, new GrammarError(GrammarErrorTypeRegex, msg) };
}
const string input;
const size_t length;
size_t position;
string input;
const uint8_t *iter;
const uint8_t *end;
int32_t lookahead;
};
// Parses the regex source in `input` by running a temporary PatternParser
// over it with `nested` set to false, and returns the parser's result: a rule
// paired with a GrammarError pointer (presumably null when parsing succeeds;
// confirm against PatternParser::rule).
// NOTE(review): PatternParser takes a `const string &`, so the
// `input.c_str()` form builds a temporary std::string from the C string,
// which stops at the first NUL byte in `input`.
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
return PatternParser(input).rule(false);
return PatternParser(input.c_str()).rule(false);
}
} // namespace prepare_grammar