From cb5ecbd49138124af463f648fbf7b1c66a60bb06 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 28 Sep 2014 18:21:22 -0700 Subject: [PATCH] Handle string and regex rules w/ non-ascii chars --- .ycm_extra_conf.py | 1 + project.gyp | 2 + .../prepare_grammar/expand_tokens_spec.cc | 113 +++++++++++------- src/compiler/prepare_grammar/expand_tokens.cc | 16 ++- src/compiler/prepare_grammar/parse_regex.cc | 34 ++++-- 5 files changed, 114 insertions(+), 52 deletions(-) diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py index acf03757..34a724a5 100644 --- a/.ycm_extra_conf.py +++ b/.ycm_extra_conf.py @@ -13,6 +13,7 @@ cxx_flags = [ '-I', 'spec', '-I', 'include', '-I', 'externals/bandit', + '-I', 'externals/utf8proc', '-isystem', '/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/c++/v1', ] diff --git a/project.gyp b/project.gyp index 603ef0d9..65d2694c 100644 --- a/project.gyp +++ b/project.gyp @@ -7,6 +7,7 @@ 'include_dirs': [ 'include', 'src', + 'externals/utf8proc', ], 'sources': [ 'src/compiler/build_tables/build_lex_table.cc', @@ -54,6 +55,7 @@ 'src/compiler/rules/symbol.cc', 'src/compiler/rules/visitor.cc', 'src/compiler/util/string_helpers.cc', + 'externals/utf8proc/utf8proc.c', ], 'cflags_cc': [ '-std=c++0x', diff --git a/spec/compiler/prepare_grammar/expand_tokens_spec.cc b/spec/compiler/prepare_grammar/expand_tokens_spec.cc index d876cce3..5d2bc2f8 100644 --- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc @@ -9,55 +9,88 @@ using namespace rules; using prepare_grammar::expand_tokens; describe("expand_tokens", []() { - it("replaces regex patterns with their expansion", [&]() { - LexicalGrammar grammar({ - { "rule_A", seq({ - i_sym(10), - pattern("x*"), - i_sym(11) }) }, - }, {}); + describe("string rules", [&]() { + it("replaces strings with sequences of character sets", [&]() { + LexicalGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + str("xyz"), + i_sym(11) }) }, + }, {}); - auto result = expand_tokens(grammar); + auto result = expand_tokens(grammar); - AssertThat(result.second, Equals((const GrammarError *)nullptr)); - AssertThat(result.first.rules, Equals(rule_list({ - { "rule_A", seq({ - i_sym(10), - repeat(character({ 'x' })), - i_sym(11) }) }, - }))); + AssertThat(result.second, Equals((const GrammarError *)nullptr)); + AssertThat(result.first.rules, Equals(rule_list({ + { "rule_A", seq({ + i_sym(10), + seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }), + i_sym(11) }) }, + }))); + }); + + it("handles strings containing non-ASCII UTF8 characters", [&]() { + LexicalGrammar grammar({ + // α β + { "rule_A", str("\u03B1 \u03B2") }, + }, {}); + + auto result = expand_tokens(grammar); + + AssertThat(result.first.rules, Equals(rule_list({ + { "rule_A", seq({ + character({ 945 }), + character({ ' ' }), + character({ 946 }) }) } + }))); + }); }); - it("replaces string rules with a sequence of characters", [&]() { - LexicalGrammar grammar({ - { "rule_A", seq({ - i_sym(10), - str("xyz"), - i_sym(11) }) }, - }, {}); + describe("regexp rules", [&]() { + it("replaces regexps with the equivalent rule tree", [&]() { + LexicalGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + pattern("x*"), + i_sym(11) }) }, + }, {}); - auto result = expand_tokens(grammar); + auto result = expand_tokens(grammar); - AssertThat(result.second, Equals((const GrammarError *)nullptr)); - AssertThat(result.first.rules, Equals(rule_list({ - { "rule_A", seq({ - i_sym(10), - seq({ character({ 'x' 
}), character({ 'y' }), character({ 'z' }) }),
-        i_sym(11) }) },
-    })));
-  });
+      AssertThat(result.second, Equals((const GrammarError *)nullptr));
+      AssertThat(result.first.rules, Equals(rule_list({
+        { "rule_A", seq({
+          i_sym(10),
+          repeat(character({ 'x' })),
+          i_sym(11) }) },
+      })));
+    });
 
-  it("returns an error when the grammar contains an invalid regex", [&]() {
-    LexicalGrammar grammar({
-      { "rule_A", seq({
-        pattern("("),
-        str("xyz"),
-        pattern("[") }) },
-    }, {});
+    it("handles regexps containing non-ASCII UTF8 characters", [&]() {
+      LexicalGrammar grammar({
+        // [^α-δ]
+        { "rule_A", pattern("[^\u03B1-\u03B4]*") },
+      }, {});
 
-    auto result = expand_tokens(grammar);
+      auto result = expand_tokens(grammar);
 
-    AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+      AssertThat(result.first.rules, Equals(rule_list({
+        { "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
+      })));
+    });
+
+    it("returns an error when the grammar contains an invalid regex", [&]() {
+      LexicalGrammar grammar({
+        { "rule_A", seq({
+          pattern("("),
+          str("xyz"),
+          pattern("[") }) },
+      }, {});
+
+      auto result = expand_tokens(grammar);
+
+      AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+    });
   });
 });
diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc
index 674339d2..fa5cbc7d 100644
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@@ -10,6 +10,7 @@
 #include "compiler/rules/seq.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/prepare_grammar/parse_regex.h"
+#include "utf8proc.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -27,8 +28,19 @@ class ExpandTokens : public rules::IdentityRuleFn {
 
   rule_ptr apply_to(const String *rule) {
     vector<rule_ptr> elements;
-    for (char val : rule->value)
-      elements.push_back(rules::CharacterSet().include(val).copy());
+    uint8_t *iter = (uint8_t *)rule->value.data();
+    uint8_t *end = iter + rule->value.size();
+
+    while (iter < end) {
+      int32_t el;
+      size_t size = utf8proc_iterate(iter, (end - iter), &el);
+      if (!size)
+        break;
+      iter += size;
+
+      elements.push_back(rules::CharacterSet().include(el).copy());
+    }
+
     return rules::Seq::Build(elements);
   }
 
diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc
index 6912cbaf..a13aee95 100644
--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@@ -8,6 +8,7 @@
 #include "compiler/rules/character_set.h"
 #include "compiler/rules/blank.h"
 #include "compiler/util/string_helpers.h"
+#include "utf8proc.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -27,7 +28,10 @@ using rules::blank;
 class PatternParser {
  public:
   explicit PatternParser(const string &input)
-      : input(input), length(input.length()), position(0) {}
+      : input(input),
+        iter((const uint8_t *)input.data()),
+        end(iter + input.size())
+  { next(); }
 
   pair<rule_ptr, const GrammarError *> rule(bool nested) {
     vector<rule_ptr> choices = {};
@@ -156,7 +160,7 @@ class PatternParser {
       next();
       break;
     default:
-      char first_char = peek();
+      uint32_t first_char = peek();
       next();
       if (peek() == '-') {
         next();
@@ -169,7 +173,7 @@ class PatternParser {
     return { value, nullptr };
   }
 
-  CharacterSet escaped_char(char value) {
+  CharacterSet escaped_char(uint32_t value) {
     switch (value) {
       case 'a':
         return CharacterSet().include('a', 'z').include('A', 'Z');
@@ -195,23 +199,33 @@ class PatternParser {
     }
   }
 
-  void next() { position++; }
+  void next() {
+    size_t lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
+    if (!lookahead_size)
+      lookahead = 0;
+    iter += lookahead_size;
+  }
 
-  char peek() { return input[position]; }
+  uint32_t peek() {
+    return lookahead;
+  }
 
-  bool has_more_input() { return position < length; }
+  bool has_more_input() {
+    return lookahead && iter <= end;
+  }
 
   pair<rule_ptr, const GrammarError *> error(string msg) {
     return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
   }
 
-  const string input;
-  const size_t length;
-  size_t position;
+  string input;
+  const uint8_t *iter;
+  const uint8_t *end;
+  int32_t lookahead;
 };
 
 pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
-  return PatternParser(input).rule(false);
+  return PatternParser(input.c_str()).rule(false);
 }
 
 }  // namespace prepare_grammar
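
Notes on the approach, for reviewers. utf8proc_iterate consumes one UTF-8
sequence at a time and returns the number of bytes it read, or a negative
error code for invalid input. The sketch below is illustrative only, not part
of the patch: the helper name decode_code_points is made up, and a
non-positive return is treated as end of input. It mirrors the decoding loop
that ExpandTokens::apply_to now runs over a string rule:

    // Decode a UTF-8 string into a sequence of Unicode code points,
    // the same way expand_tokens now walks a string rule.
    #include <cstdint>
    #include <string>
    #include <vector>
    #include <sys/types.h>
    #include "utf8proc.h"

    static std::vector<int32_t> decode_code_points(const std::string &value) {
      std::vector<int32_t> result;
      const uint8_t *iter = (const uint8_t *)value.data();
      const uint8_t *end = iter + value.size();

      while (iter < end) {
        int32_t code_point;
        // Number of bytes consumed, or a negative code on invalid UTF-8.
        ssize_t size = utf8proc_iterate(iter, end - iter, &code_point);
        if (size <= 0)
          break;  // stop instead of spinning on a bad byte sequence
        iter += size;
        result.push_back(code_point);  // e.g. "αβ" yields { 945, 946 }
      }

      return result;
    }

This is why the new spec expects str("\u03B1 \u03B2") to expand to character
sets for the code points 945, ' ', and 946 rather than one set per byte.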
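
The parse_regex change follows the same idea, but as a one-code-point
lookahead: PatternParser no longer keeps a byte index into the input; the
constructor primes `lookahead` by calling next(), and next() advances a byte
cursor with utf8proc_iterate. A minimal standalone sketch of that scheme (the
class name Utf8Cursor is hypothetical, and invalid UTF-8 is folded into
end-of-input here, whereas the patch only checks for a zero-length read):

    #include <cstdint>
    #include <string>
    #include <sys/types.h>
    #include "utf8proc.h"

    class Utf8Cursor {
     public:
      explicit Utf8Cursor(const std::string &text)
          : input(text),
            iter((const uint8_t *)input.data()),
            end(iter + input.size()) {
        next();  // prime the first lookahead so peek() is valid immediately
      }

      // Decode the next code point into `lookahead`; 0 marks end of input.
      void next() {
        ssize_t size = utf8proc_iterate(iter, end - iter, &lookahead);
        if (size <= 0) {
          lookahead = 0;
          iter = end;
        } else {
          iter += size;
        }
      }

      int32_t peek() const { return lookahead; }
      bool at_end() const { return lookahead == 0; }

     private:
      std::string input;   // own a copy so `iter` cannot dangle
      const uint8_t *iter;
      const uint8_t *end;
      int32_t lookahead;
    };

has_more_input() in the patch is built on the same convention: a decoded
lookahead of 0 means the input is exhausted, so the parser no longer needs
the old `length` and `position` members.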