Handle string and regex rules with non-ASCII characters
This commit is contained in:
parent
e0185f84fc
commit
cb5ecbd491
5 changed files with 114 additions and 52 deletions
|
|
@ -13,6 +13,7 @@ cxx_flags = [
|
|||
'-I', 'spec',
|
||||
'-I', 'include',
|
||||
'-I', 'externals/bandit',
|
||||
'-I', 'externals/utf8proc',
|
||||
'-isystem', '/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/c++/v1',
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
'include_dirs': [
|
||||
'include',
|
||||
'src',
|
||||
'externals/utf8proc',
|
||||
],
|
||||
'sources': [
|
||||
'src/compiler/build_tables/build_lex_table.cc',
|
||||
|
|
@ -54,6 +55,7 @@
|
|||
'src/compiler/rules/symbol.cc',
|
||||
'src/compiler/rules/visitor.cc',
|
||||
'src/compiler/util/string_helpers.cc',
|
||||
'externals/utf8proc/utf8proc.c',
|
||||
],
|
||||
'cflags_cc': [
|
||||
'-std=c++0x',
|
||||
|
|
|
|||
|
|
@ -9,55 +9,88 @@ using namespace rules;
|
|||
using prepare_grammar::expand_tokens;
|
||||
|
||||
describe("expand_tokens", []() {
|
||||
it("replaces regex patterns with their expansion", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11) }) },
|
||||
}, {});
|
||||
describe("string rules", [&]() {
|
||||
it("replaces strings with sequences of character sets", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11) }) },
|
||||
}, {});
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals((const GrammarError *)nullptr));
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11) }) },
|
||||
})));
|
||||
AssertThat(result.second, Equals((const GrammarError *)nullptr));
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
|
||||
i_sym(11) }) },
|
||||
})));
|
||||
});
|
||||
|
||||
it("handles strings containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
// α β
|
||||
{ "rule_A", str("\u03B1 \u03B2") },
|
||||
}, {});
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", seq({
|
||||
character({ 945 }),
|
||||
character({ ' ' }),
|
||||
character({ 946 }) }) }
|
||||
})));
|
||||
});
|
||||
});
|
||||
|
||||
it("replaces string rules with a sequence of characters", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
str("xyz"),
|
||||
i_sym(11) }) },
|
||||
}, {});
|
||||
describe("regexp rules", [&]() {
|
||||
it("replaces regexps with the equivalent rule tree", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
pattern("x*"),
|
||||
i_sym(11) }) },
|
||||
}, {});
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, Equals((const GrammarError *)nullptr));
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
|
||||
i_sym(11) }) },
|
||||
})));
|
||||
});
|
||||
AssertThat(result.second, Equals((const GrammarError *)nullptr));
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", seq({
|
||||
i_sym(10),
|
||||
repeat(character({ 'x' })),
|
||||
i_sym(11) }) },
|
||||
})));
|
||||
});
|
||||
|
||||
it("returns an error when the grammar contains an invalid regex", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("[") }) },
|
||||
}, {});
|
||||
it("handles regexps containing non-ASCII UTF8 characters", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
// [^α-δ]
|
||||
{ "rule_A", pattern("[^\u03B1-\u03B4]*") },
|
||||
}, {});
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
|
||||
AssertThat(result.first.rules, Equals(rule_list({
|
||||
{ "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
|
||||
})));
|
||||
});
|
||||
|
||||
it("returns an error when the grammar contains an invalid regex", [&]() {
|
||||
LexicalGrammar grammar({
|
||||
{ "rule_A", seq({
|
||||
pattern("("),
|
||||
str("xyz"),
|
||||
pattern("[") }) },
|
||||
}, {});
|
||||
|
||||
auto result = expand_tokens(grammar);
|
||||
|
||||
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include "compiler/rules/seq.h"
|
||||
#include "compiler/rules/character_set.h"
|
||||
#include "compiler/prepare_grammar/parse_regex.h"
|
||||
#include "utf8proc.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace prepare_grammar {
|
||||
|
|
@ -27,8 +28,19 @@ class ExpandTokens : public rules::IdentityRuleFn {
|
|||
|
||||
// Expands a string rule into a sequence of single-code-point character sets.
//
// The rule's value is decoded as UTF-8 with utf8proc_iterate, so a multi-byte
// character yields one CharacterSet holding its code point instead of one
// set per byte. Decoding stops at the first position utf8proc cannot decode;
// the elements collected up to that point are still passed to Seq::Build.
rule_ptr apply_to(const String *rule) {
  vector<rule_ptr> elements;
  const uint8_t *iter = reinterpret_cast<const uint8_t *>(rule->value.data());
  const uint8_t *const end = iter + rule->value.size();

  while (iter < end) {
    int32_t el;

    // utf8proc_iterate returns the number of bytes consumed, or a negative
    // error code for malformed input. The result must stay signed: stored in
    // a size_t, an error code would pass a `!size` check and then move
    // `iter` far outside the buffer.
    auto size = utf8proc_iterate(iter, end - iter, &el);
    if (size <= 0)
      break;
    iter += size;

    elements.push_back(rules::CharacterSet().include(el).copy());
  }

  return rules::Seq::Build(elements);
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include "compiler/rules/character_set.h"
|
||||
#include "compiler/rules/blank.h"
|
||||
#include "compiler/util/string_helpers.h"
|
||||
#include "utf8proc.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace prepare_grammar {
|
||||
|
|
@ -27,7 +28,10 @@ using rules::blank;
|
|||
class PatternParser {
|
||||
public:
|
||||
explicit PatternParser(const string &input)
|
||||
: input(input), length(input.length()), position(0) {}
|
||||
: input(input),
|
||||
iter((const uint8_t *)input.data()),
|
||||
end(iter + input.size())
|
||||
{ next(); }
|
||||
|
||||
pair<rule_ptr, const GrammarError *> rule(bool nested) {
|
||||
vector<rule_ptr> choices = {};
|
||||
|
|
@ -156,7 +160,7 @@ class PatternParser {
|
|||
next();
|
||||
break;
|
||||
default:
|
||||
char first_char = peek();
|
||||
uint32_t first_char = peek();
|
||||
next();
|
||||
if (peek() == '-') {
|
||||
next();
|
||||
|
|
@ -169,7 +173,7 @@ class PatternParser {
|
|||
return { value, nullptr };
|
||||
}
|
||||
|
||||
CharacterSet escaped_char(char value) {
|
||||
CharacterSet escaped_char(uint32_t value) {
|
||||
switch (value) {
|
||||
case 'a':
|
||||
return CharacterSet().include('a', 'z').include('A', 'Z');
|
||||
|
|
@ -195,23 +199,33 @@ class PatternParser {
|
|||
}
|
||||
}
|
||||
|
||||
void next() { position++; }
|
||||
void next() {
|
||||
size_t lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
|
||||
if (!lookahead_size)
|
||||
lookahead = 0;
|
||||
iter += lookahead_size;
|
||||
}
|
||||
|
||||
char peek() { return input[position]; }
|
||||
uint32_t peek() {
|
||||
return lookahead;
|
||||
}
|
||||
|
||||
bool has_more_input() { return position < length; }
|
||||
bool has_more_input() {
|
||||
return lookahead && iter <= end;
|
||||
}
|
||||
|
||||
// Builds the failure result for a malformed pattern: a blank rule paired with
// a newly allocated regex GrammarError carrying `msg`.
pair<rule_ptr, const GrammarError *> error(string msg) {
  rule_ptr placeholder = blank();
  const GrammarError *failure = new GrammarError(GrammarErrorTypeRegex, msg);
  return { placeholder, failure };
}
|
||||
|
||||
const string input;
|
||||
const size_t length;
|
||||
size_t position;
|
||||
string input;
|
||||
const uint8_t *iter;
|
||||
const uint8_t *end;
|
||||
int32_t lookahead;
|
||||
};
|
||||
|
||||
// Parses `input` as a regex pattern and returns the resulting rule together
// with a GrammarError pointer (the second element of PatternParser::rule's
// result).
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
  const char *pattern_text = input.c_str();

  // The parser is built and used within one expression so that the temporary
  // string created from `pattern_text` stays alive for the whole rule() call.
  return PatternParser(pattern_text).rule(false);
}
|
||||
|
||||
} // namespace prepare_grammar
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue