From cb5ecbd49138124af463f648fbf7b1c66a60bb06 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 28 Sep 2014 18:21:22 -0700 Subject: [PATCH] Handle string and regex rules w/ non-ascii chars --- .ycm_extra_conf.py | 1 + project.gyp | 2 + .../prepare_grammar/expand_tokens_spec.cc | 113 +++++++++++------- src/compiler/prepare_grammar/expand_tokens.cc | 16 ++- src/compiler/prepare_grammar/parse_regex.cc | 34 ++++-- 5 files changed, 114 insertions(+), 52 deletions(-) diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py index acf03757..34a724a5 100644 --- a/.ycm_extra_conf.py +++ b/.ycm_extra_conf.py @@ -13,6 +13,7 @@ cxx_flags = [ '-I', 'spec', '-I', 'include', '-I', 'externals/bandit', + '-I', 'externals/utf8proc', '-isystem', '/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/c++/v1', ] diff --git a/project.gyp b/project.gyp index 603ef0d9..65d2694c 100644 --- a/project.gyp +++ b/project.gyp @@ -7,6 +7,7 @@ 'include_dirs': [ 'include', 'src', + 'externals/utf8proc', ], 'sources': [ 'src/compiler/build_tables/build_lex_table.cc', @@ -54,6 +55,7 @@ 'src/compiler/rules/symbol.cc', 'src/compiler/rules/visitor.cc', 'src/compiler/util/string_helpers.cc', + 'externals/utf8proc/utf8proc.c', ], 'cflags_cc': [ '-std=c++0x', diff --git a/spec/compiler/prepare_grammar/expand_tokens_spec.cc b/spec/compiler/prepare_grammar/expand_tokens_spec.cc index d876cce3..5d2bc2f8 100644 --- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc @@ -9,55 +9,88 @@ using namespace rules; using prepare_grammar::expand_tokens; describe("expand_tokens", []() { - it("replaces regex patterns with their expansion", [&]() { - LexicalGrammar grammar({ - { "rule_A", seq({ - i_sym(10), - pattern("x*"), - i_sym(11) }) }, - }, {}); + describe("string rules", [&]() { + it("replaces strings with sequences of character sets", [&]() { + LexicalGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + str("xyz"), + i_sym(11) }) }, + }, {}); - auto result = expand_tokens(grammar); + auto result = expand_tokens(grammar); - AssertThat(result.second, Equals((const GrammarError *)nullptr)); - AssertThat(result.first.rules, Equals(rule_list({ - { "rule_A", seq({ - i_sym(10), - repeat(character({ 'x' })), - i_sym(11) }) }, - }))); + AssertThat(result.second, Equals((const GrammarError *)nullptr)); + AssertThat(result.first.rules, Equals(rule_list({ + { "rule_A", seq({ + i_sym(10), + seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }), + i_sym(11) }) }, + }))); + }); + + it("handles strings containing non-ASCII UTF8 characters", [&]() { + LexicalGrammar grammar({ + // α β + { "rule_A", str("\u03B1 \u03B2") }, + }, {}); + + auto result = expand_tokens(grammar); + + AssertThat(result.first.rules, Equals(rule_list({ + { "rule_A", seq({ + character({ 945 }), + character({ ' ' }), + character({ 946 }) }) } + }))); + }); }); - it("replaces string rules with a sequence of characters", [&]() { - LexicalGrammar grammar({ - { "rule_A", seq({ - i_sym(10), - str("xyz"), - i_sym(11) }) }, - }, {}); + describe("regexp rules", [&]() { + it("replaces regexps with the equivalent rule tree", [&]() { + LexicalGrammar grammar({ + { "rule_A", seq({ + i_sym(10), + pattern("x*"), + i_sym(11) }) }, + }, {}); - auto result = expand_tokens(grammar); + auto result = expand_tokens(grammar); - AssertThat(result.second, Equals((const GrammarError *)nullptr)); - AssertThat(result.first.rules, Equals(rule_list({ - { "rule_A", seq({ - i_sym(10), - seq({ character({ 'x' 
}), character({ 'y' }), character({ 'z' }) }),
-        i_sym(11) }) },
-    })));
-  });
+      AssertThat(result.second, Equals((const GrammarError *)nullptr));
+      AssertThat(result.first.rules, Equals(rule_list({
+        { "rule_A", seq({
+          i_sym(10),
+          repeat(character({ 'x' })),
+          i_sym(11) }) },
+      })));
+    });
 
-  it("returns an error when the grammar contains an invalid regex", [&]() {
-    LexicalGrammar grammar({
-      { "rule_A", seq({
-        pattern("("),
-        str("xyz"),
-        pattern("[") }) },
-    }, {});
+    it("handles regexps containing non-ASCII UTF8 characters", [&]() {
+      LexicalGrammar grammar({
+        // [^α-δ]
+        { "rule_A", pattern("[^\u03B1-\u03B4]*") },
+      }, {});
 
-    auto result = expand_tokens(grammar);
+      auto result = expand_tokens(grammar);
 
-    AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+      AssertThat(result.first.rules, Equals(rule_list({
+        { "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
+      })));
+    });
+
+    it("returns an error when the grammar contains an invalid regex", [&]() {
+      LexicalGrammar grammar({
+        { "rule_A", seq({
+          pattern("("),
+          str("xyz"),
+          pattern("[") }) },
+      }, {});
+
+      auto result = expand_tokens(grammar);
+
+      AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+    });
   });
 });
diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc
index 674339d2..fa5cbc7d 100644
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@@ -10,6 +10,7 @@
 #include "compiler/rules/seq.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/prepare_grammar/parse_regex.h"
+#include "utf8proc.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -27,8 +28,19 @@ class ExpandTokens : public rules::IdentityRuleFn {
 
   rule_ptr apply_to(const String *rule) {
     vector<rule_ptr> elements;
-    for (char val : rule->value)
-      elements.push_back(rules::CharacterSet().include(val).copy());
+    uint8_t *iter = (uint8_t *)rule->value.data();
+    uint8_t *end = iter + rule->value.size();
+
+    while (iter < end) {
+      int32_t el;
+      size_t size = utf8proc_iterate(iter, (end - iter), &el);
+      if (!size)
+        break;
+      iter += size;
+
+      elements.push_back(rules::CharacterSet().include(el).copy());
+    }
+
     return rules::Seq::Build(elements);
   }
 
diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc
index 6912cbaf..a13aee95 100644
--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@@ -8,6 +8,7 @@
 #include "compiler/rules/character_set.h"
 #include "compiler/rules/blank.h"
 #include "compiler/util/string_helpers.h"
+#include "utf8proc.h"
 
 namespace tree_sitter {
 namespace prepare_grammar {
@@ -27,7 +28,10 @@ using rules::blank;
 class PatternParser {
  public:
   explicit PatternParser(const string &input)
-      : input(input), length(input.length()), position(0) {}
+      : input(input),
+        iter((const uint8_t *)input.data()),
+        end(iter + input.size())
+  { next(); }
 
   pair<rule_ptr, const GrammarError *> rule(bool nested) {
     vector<rule_ptr> choices = {};
@@ -156,7 +160,7 @@ class PatternParser {
       next();
       break;
     default:
-      char first_char = peek();
+      uint32_t first_char = peek();
       next();
       if (peek() == '-') {
         next();
@@ -169,7 +173,7 @@ class PatternParser {
     return { value, nullptr };
   }
 
-  CharacterSet escaped_char(char value) {
+  CharacterSet escaped_char(uint32_t value) {
     switch (value) {
       case 'a':
         return CharacterSet().include('a', 'z').include('A', 'Z');
@@ -195,23 +199,33 @@ class PatternParser {
     }
   }
 
-  void next() { position++; }
+  void next() {
+    size_t lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
+    if (!lookahead_size)
+      lookahead = 0;
+    iter += lookahead_size;
+  }
 
-  char peek() { return input[position]; }
+  uint32_t peek() {
+    return lookahead;
+  }
 
-  bool has_more_input() { return position < length; }
+  bool has_more_input() {
+    return lookahead && iter <= end;
+  }
 
   pair<rule_ptr, const GrammarError *> error(string msg) {
     return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
   }
 
-  const string input;
-  const size_t length;
-  size_t position;
+  string input;
+  const uint8_t *iter;
+  const uint8_t *end;
+  int32_t lookahead;
 };
 
 pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
-  return PatternParser(input).rule(false);
+  return PatternParser(input.c_str()).rule(false);
 }
 
 }  // namespace prepare_grammar
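
Notes on the approach, for reviewers. utf8proc_iterate consumes one UTF-8
sequence at a time and returns the number of bytes it read, or a negative
error code for invalid input. The sketch below is illustrative only, not part
of the patch: the helper name decode_code_points is made up, and a
non-positive return is treated as end of input. It mirrors the decoding loop
that ExpandTokens::apply_to now runs over a string rule:

    // Decode a UTF-8 string into a sequence of Unicode code points,
    // the same way expand_tokens now walks a string rule.
    #include <cstdint>
    #include <string>
    #include <vector>
    #include <sys/types.h>
    #include "utf8proc.h"

    static std::vector<int32_t> decode_code_points(const std::string &value) {
      std::vector<int32_t> result;
      const uint8_t *iter = (const uint8_t *)value.data();
      const uint8_t *end = iter + value.size();

      while (iter < end) {
        int32_t code_point;
        // Number of bytes consumed, or a negative code on invalid UTF-8.
        ssize_t size = utf8proc_iterate(iter, end - iter, &code_point);
        if (size <= 0)
          break;  // stop instead of spinning on a bad byte sequence
        iter += size;
        result.push_back(code_point);  // e.g. "αβ" yields { 945, 946 }
      }

      return result;
    }

This is why the new spec expects str("\u03B1 \u03B2") to expand to character
sets for the code points 945, ' ', and 946 rather than one set per byte.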
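
The parse_regex change follows the same idea, but as a one-code-point
lookahead: PatternParser no longer keeps a byte index into the input; the
constructor primes `lookahead` by calling next(), and next() advances a byte
cursor with utf8proc_iterate. A minimal standalone sketch of that scheme (the
class name Utf8Cursor is hypothetical, and invalid UTF-8 is folded into
end-of-input here, whereas the patch only checks for a zero-length read):

    #include <cstdint>
    #include <string>
    #include <sys/types.h>
    #include "utf8proc.h"

    class Utf8Cursor {
     public:
      explicit Utf8Cursor(const std::string &text)
          : input(text),
            iter((const uint8_t *)input.data()),
            end(iter + input.size()) {
        next();  // prime the first lookahead so peek() is valid immediately
      }

      // Decode the next code point into `lookahead`; 0 marks end of input.
      void next() {
        ssize_t size = utf8proc_iterate(iter, end - iter, &lookahead);
        if (size <= 0) {
          lookahead = 0;
          iter = end;
        } else {
          iter += size;
        }
      }

      int32_t peek() const { return lookahead; }
      bool at_end() const { return lookahead == 0; }

     private:
      std::string input;   // own a copy so `iter` cannot dangle
      const uint8_t *iter;
      const uint8_t *end;
      int32_t lookahead;
    };

has_more_input() in the patch is built on the same convention: a decoded
lookahead of 0 means the input is exhausted, so the parser no longer needs
the old `length` and `position` members.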