Handle string and regex rules with non-ASCII characters

2014-09-28 18:21:22 -07:00 · 2014-09-28 18:21:22 -07:00 · cb5ecbd491
commit cb5ecbd491
parent e0185f84fc
5 changed files with 114 additions and 52 deletions
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@ -13,6 +13,7 @@ cxx_flags = [
    '-I', 'spec',
    '-I', 'include',
    '-I', 'externals/bandit',
+    '-I', 'externals/utf8proc',
    '-isystem', '/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/c++/v1',
 ]

--- a/project.gyp
+++ b/project.gyp
@ -7,6 +7,7 @@
      'include_dirs': [
        'include',
        'src',
+        'externals/utf8proc',
      ],
      'sources': [
        'src/compiler/build_tables/build_lex_table.cc',
@ -54,6 +55,7 @@
        'src/compiler/rules/symbol.cc',
        'src/compiler/rules/visitor.cc',
        'src/compiler/util/string_helpers.cc',
+        'externals/utf8proc/utf8proc.c',
      ],
      'cflags_cc': [
        '-std=c++0x',
--- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc
@ -9,55 +9,88 @@ using namespace rules;
 using prepare_grammar::expand_tokens;

 describe("expand_tokens", []() {
-  it("replaces regex patterns with their expansion", [&]() {
-    LexicalGrammar grammar({
-        { "rule_A", seq({
-            i_sym(10),
-            pattern("x*"),
-            i_sym(11) }) },
-    }, {});
+  describe("string rules", [&]() {
+    it("replaces strings with sequences of character sets", [&]() {
+      LexicalGrammar grammar({
+          { "rule_A", seq({
+              i_sym(10),
+              str("xyz"),
+              i_sym(11) }) },
+      }, {});

-    auto result = expand_tokens(grammar);
+      auto result = expand_tokens(grammar);

-    AssertThat(result.second, Equals((const GrammarError *)nullptr));
-    AssertThat(result.first.rules, Equals(rule_list({
-        { "rule_A", seq({
-            i_sym(10),
-            repeat(character({ 'x' })),
-            i_sym(11) }) },
-    })));
+      AssertThat(result.second, Equals((const GrammarError *)nullptr));
+      AssertThat(result.first.rules, Equals(rule_list({
+          { "rule_A", seq({
+              i_sym(10),
+              seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
+              i_sym(11) }) },
+      })));
+    });
+
+    it("handles strings containing non-ASCII UTF8 characters", [&]() {
+      LexicalGrammar grammar({
+          // α β
+          { "rule_A", str("\u03B1 \u03B2") },
+      }, {});
+
+      auto result = expand_tokens(grammar);
+
+      AssertThat(result.first.rules, Equals(rule_list({
+          { "rule_A", seq({
+              character({ 945 }),
+              character({ ' ' }),
+              character({ 946 }) }) }
+      })));
+    });
  });

-  it("replaces string rules with a sequence of characters", [&]() {
-    LexicalGrammar grammar({
-        { "rule_A", seq({
-            i_sym(10),
-            str("xyz"),
-            i_sym(11) }) },
-    }, {});
+  describe("regexp rules", [&]() {
+    it("replaces regexps with the equivalent rule tree", [&]() {
+      LexicalGrammar grammar({
+          { "rule_A", seq({
+              i_sym(10),
+              pattern("x*"),
+              i_sym(11) }) },
+      }, {});

-    auto result = expand_tokens(grammar);
+      auto result = expand_tokens(grammar);

-    AssertThat(result.second, Equals((const GrammarError *)nullptr));
-    AssertThat(result.first.rules, Equals(rule_list({
-        { "rule_A", seq({
-            i_sym(10),
-            seq({ character({ 'x' }), character({ 'y' }), character({ 'z' }) }),
-            i_sym(11) }) },
-    })));
-  });
+      AssertThat(result.second, Equals((const GrammarError *)nullptr));
+      AssertThat(result.first.rules, Equals(rule_list({
+          { "rule_A", seq({
+              i_sym(10),
+              repeat(character({ 'x' })),
+              i_sym(11) }) },
+      })));
+    });

-  it("returns an error when the grammar contains an invalid regex", [&]() {
-    LexicalGrammar grammar({
-        { "rule_A", seq({
-            pattern("("),
-            str("xyz"),
-            pattern("[") }) },
-    }, {});
+    it("handles regexps containing non-ASCII UTF8 characters", [&]() {
+      LexicalGrammar grammar({
+          // [^α-δ]
+          { "rule_A", pattern("[^\u03B1-\u03B4]*") },
+      }, {});

-    auto result = expand_tokens(grammar);
+      auto result = expand_tokens(grammar);

-    AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+      AssertThat(result.first.rules, Equals(rule_list({
+          { "rule_A", repeat(character({ 945, 946, 947, 948 }, false)) }
+      })));
+    });
+
+    it("returns an error when the grammar contains an invalid regex", [&]() {
+      LexicalGrammar grammar({
+          { "rule_A", seq({
+              pattern("("),
+              str("xyz"),
+              pattern("[") }) },
+      }, {});
+
+      auto result = expand_tokens(grammar);
+
+      AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
+    });
  });
 });

--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@ -10,6 +10,7 @@
 #include "compiler/rules/seq.h"
 #include "compiler/rules/character_set.h"
 #include "compiler/prepare_grammar/parse_regex.h"
+#include "utf8proc.h"

 namespace tree_sitter {
 namespace prepare_grammar {
@ -27,8 +28,19 @@ class ExpandTokens : public rules::IdentityRuleFn {

  rule_ptr apply_to(const String *rule) {
    vector<rule_ptr> elements;
-    for (char val : rule->value)
-      elements.push_back(rules::CharacterSet().include(val).copy());
+    // Decode the string as UTF-8 so that each element of the resulting
+    // sequence matches one whole code point rather than one byte.
+    const uint8_t *iter = reinterpret_cast<const uint8_t *>(rule->value.data());
+    const uint8_t *end = iter + rule->value.size();
+
+    while (iter < end) {
+      int32_t el;
+
+      // utf8proc_iterate returns the number of bytes consumed, or a negative
+      // error code for malformed UTF-8. Keep the result signed: stored in a
+      // size_t, an error would pass the zero check and be added to `iter` as
+      // a huge offset. Decoding stops at the first malformed sequence.
+      auto size = utf8proc_iterate(iter, end - iter, &el);
+      if (size <= 0)
+        break;
+      iter += size;
+
+      elements.push_back(rules::CharacterSet().include(el).copy());
+    }
+
    return rules::Seq::Build(elements);
  }

--- a/src/compiler/prepare_grammar/parse_regex.cc
+++ b/src/compiler/prepare_grammar/parse_regex.cc
@ -8,6 +8,7 @@
 #include "compiler/rules/character_set.h"
 #include "compiler/rules/blank.h"
 #include "compiler/util/string_helpers.h"
+#include "utf8proc.h"

 namespace tree_sitter {
 namespace prepare_grammar {
@ -27,7 +28,10 @@ using rules::blank;
 class PatternParser {
 public:
  explicit PatternParser(const string &input)
-      : input(input), length(input.length()), position(0) {}
+      // `iter` and `end` must point into this parser's own copy of the
+      // pattern (the `input` member, which is declared before them), not into
+      // the constructor argument, which may be a temporary that is destroyed
+      // while the parser is still in use.
+      : input(input),
+        iter(reinterpret_cast<const uint8_t *>(this->input.data())),
+        end(iter + this->input.size()),
+        lookahead(0)
+  { next(); }  // load the first code point into `lookahead`

  pair<rule_ptr, const GrammarError *> rule(bool nested) {
    vector<rule_ptr> choices = {};
@ -156,7 +160,7 @@ class PatternParser {
        next();
        break;
      default:
-        char first_char = peek();
+        uint32_t first_char = peek();
        next();
        if (peek() == '-') {
          next();
@ -169,7 +173,7 @@ class PatternParser {
    return { value, nullptr };
  }

-  CharacterSet escaped_char(char value) {
+  CharacterSet escaped_char(uint32_t value) {
    switch (value) {
      case 'a':
        return CharacterSet().include('a', 'z').include('A', 'Z');
@ -195,23 +199,33 @@ class PatternParser {
    }
  }

-  void next() { position++; }
+  // Decodes the next UTF-8 code point into `lookahead` and advances `iter`
+  // past it. `lookahead` becomes 0 once the input is exhausted or the
+  // remaining bytes are not valid UTF-8.
+  void next() {
+    // utf8proc_iterate returns the byte length of the decoded code point, or
+    // a negative error code. The result has to stay signed: held in a size_t,
+    // an error would look like a huge length and move `iter` outside the
+    // buffer.
+    auto lookahead_size = utf8proc_iterate(iter, end - iter, &lookahead);
+    if (lookahead_size <= 0) {
+      lookahead = 0;
+      return;
+    }
+    iter += lookahead_size;
+  }

-  char peek() { return input[position]; }
+  // Returns the code point currently held in `lookahead` without consuming
+  // any input.
+  uint32_t peek() { return static_cast<uint32_t>(lookahead); }

-  bool has_more_input() { return position < length; }
+  // True while `lookahead` holds a non-zero code point and `iter` has not
+  // moved past `end`.
+  bool has_more_input() {
+    if (lookahead == 0)
+      return false;
+    return iter <= end;
+  }

  pair<rule_ptr, const GrammarError *> error(string msg) {
+    // Pairs a blank rule with a newly allocated regex-type GrammarError that
+    // carries `msg`.
+    // NOTE(review): the error is created with a raw `new`; presumably the
+    // caller takes ownership of it — confirm.
    return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
  }

-  const string input;
-  const size_t length;
-  size_t position;
+  string input;
+  const uint8_t *iter;
+  const uint8_t *end;
+  int32_t lookahead;
 };

 pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
-  return PatternParser(input).rule(false);
+  // Builds a PatternParser over the pattern text and returns the result of
+  // its top-level rule(false) call.
+  // NOTE(review): passing c_str() makes PatternParser receive a temporary
+  // std::string that ends at the first NUL byte of `input` — confirm that
+  // this is intended rather than passing `input` directly.
+  return PatternParser(input.c_str()).rule(false);
 }

 }  // namespace prepare_grammar