Read unicode characters correctly in Lexer advance

2014-10-03 15:44:49 -07:00 · 2014-10-03 15:44:49 -07:00 · 808b003f1a
commit 808b003f1a
parent 78c5fe8e02
2 changed files with 31 additions and 34 deletions
--- a/spec/runtime/parser_spec.cc
+++ b/spec/runtime/parser_spec.cc
@@ -287,8 +287,6 @@ describe("Parser", [&]() {

      describe("with non-ascii characters", [&]() {
        before_each([&]() {
-          chunk_size = 50;
-
          // αβδ + 1
          set_text("\u03b1\u03b2\u03b4 + 1");

@@ -370,17 +368,14 @@ describe("Parser", [&]() {
    });

    it("recognizes UTF8 characters as single characters", [&]() {
-      // Inputs that return partial UTF8 characters are not yet supported
-      chunk_size = 50;
-
-      // x # Ω — Δ
-      set_text("x # \u03A9 \u2014 \u0394");
+      // x # ΩΩΩ — ΔΔ
+      set_text("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394");

      AssertThat(ts_node_string(root), Equals("(DOCUMENT "
        "(expression (variable) (comment)))"));

-      AssertThat(ts_node_size(root).chars, Equals(strlen("x # O - D")));
-      AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9 \u2014 \u0394")));
+      AssertThat(ts_node_size(root).chars, Equals(strlen("x # OOO - DD")));
+      AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394")));
    });
  });
 });
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@@ -4,40 +4,42 @@
 #include "runtime/length.h"
 #include "utf8proc.h"

+static const char *empty_chunk = "";
+
 static int advance(TSLexer *lexer) {
-  static const char *empty_chunk = "";

-  lexer->lookahead = 0;
-
-  if (lexer->chunk == empty_chunk) {
-    lexer->lookahead_size = 0;
+  /*
+   *  Return false if the Lexer has already reached the end of the input.
+   */
+  if (lexer->chunk == empty_chunk)
    return 0;
+
+  /*
+   *  Increment the Lexer's position.
+   */
+  if (lexer->lookahead_size) {
+    lexer->current_position.bytes += lexer->lookahead_size;
+    lexer->current_position.chars += 1;
  }

-  if (lexer->chunk_start + lexer->chunk_size <= lexer->current_position.bytes + 1) {
-    if (lexer->lookahead_size) {
-      lexer->current_position.bytes += lexer->lookahead_size;
-      lexer->current_position.chars += 1;
-    }
-    lexer->lookahead_size = 0;
+  /*
+   *  Request a new chunk of text from the Input if the Lexer has reached
+   *  the end of the current chunk.
+   */
+  if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) {
    lexer->chunk_start += lexer->chunk_size;
    lexer->chunk = lexer->input.read_fn(lexer->input.data, &lexer->chunk_size);
+    if (!lexer->chunk_size)
+      lexer->chunk = empty_chunk;
  }

-  if (lexer->chunk_size == 0) {
-    lexer->lookahead_size = 0;
-    lexer->chunk = empty_chunk;
-  } else {
-    if (lexer->lookahead_size) {
-      lexer->current_position.bytes += lexer->lookahead_size;
-      lexer->current_position.chars += 1;
-    }
-
-    lexer->lookahead_size = utf8proc_iterate(
-        (const uint8_t *)lexer->chunk + (lexer->current_position.bytes - lexer->chunk_start),
-        lexer->chunk_start + lexer->chunk_size - lexer->current_position.bytes + 1,
-        &lexer->lookahead);
-  }
+  /*
+   *  Read the next unicode character from the current chunk of text.
+   */
+  size_t position_in_chunk = lexer->current_position.bytes - lexer->chunk_start;
+  lexer->lookahead_size = utf8proc_iterate(
+      (const uint8_t *)lexer->chunk + position_in_chunk,
+      lexer->chunk_size - position_in_chunk + 1, &lexer->lookahead);

  return 1;
 }