From 035abc1e15e4f574eb6f794755f500d6f22f630a Mon Sep 17 00:00:00 2001
From: Phil Turnbull <philipturnbull@github.com>
Date: Mon, 17 Jul 2017 12:31:42 -0700
Subject: [PATCH 1/3] Add test for UTF16 out-of-bound read

utf16_iterate does not check that 'length' is a multiple of two which leads to
an out-of-bound read:

==105293== Conditional jump or move depends on uninitialised value(s)
==105293==    at 0x54F014: utf16_iterate (utf16.c:7)
==105293==    by 0x539251: string_iterate(TSInputEncoding, unsigned char const*, unsigned long, int*) (encoding_helpers.cc:15)
==105293==    by 0x53939D: string_byte_for_character(TSInputEncoding, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long, unsigned long) (encoding_helpers.cc:43)
==105293==    by 0x507BAD: SpyInput::read(void*, unsigned int*) (spy_input.cc:47)
==105293==    by 0x551049: ts_lexer__get_chunk (lexer.c:29)
==105293==    by 0x5515C2: ts_lexer_start (lexer.c:152)
==105293==    by 0x5469AB: parser(long,...)(long long) (parser.c:297)
==105293==    by 0x547896: parser__get_lookahead (parser.c:439)
==105293==    by 0x54B2DF: parser__advance (parser.c:1150)
==105293==    by 0x54C2B6: parser_parse (parser.c:1348)
==105293==    by 0x53F06F: ts_document_parse_with_options (document.c:136)
==105293==    by 0x53EF4F: ts_document_parse (document.c:107)
---
 test/runtime/document_test.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc
index 99d04a51..7bfc8304 100644
--- a/test/runtime/document_test.cc
+++ b/test/runtime/document_test.cc
@@ -72,6 +72,19 @@ describe("Document", [&]() {
         "(array (true) (false))");
     });
 
+    it("handles truncated UTF16 data", [&]() {
+      char *content = reinterpret_cast<char*>(malloc(1));
+
+      spy_input->content = string((const char *)content, 1);
+      spy_input->encoding = TSInputEncodingUTF16;
+
+      ts_document_set_input(document, spy_input->input());
+      ts_document_invalidate(document);
+      ts_document_parse(document);
+
+      free(content);
+    });
+
     it("allows columns to be measured in either bytes or characters", [&]() {
       const char16_t content[] = u"[true, false]";
       spy_input->content = string((const char *)content, sizeof(content));

From e7662c2213eff6243d72bb42bf8b88bb1db9452d Mon Sep 17 00:00:00 2001
From: Phil Turnbull <philipturnbull@github.com>
Date: Mon, 17 Jul 2017 13:57:10 -0700
Subject: [PATCH 2/3] Handle out-of-bound read in utf16_iterate

Also simplify the test so we call `utf16_iterate` directly. Calling
`utf16_iterate` via `SpyInput` and `ts_document_parse` doesn't seem to reliably
trigger the problem using valgrind.

valgrind also doesn't detect the problem if we use a string literal like:
  `utf16_iterate("", 1, &code_point);`
---
 src/runtime/utf16.c           |  5 +++++
 test/runtime/document_test.cc | 13 -------------
 test/runtime/lexer_test.cc    | 18 ++++++++++++++++++
 3 files changed, 23 insertions(+), 13 deletions(-)
 create mode 100644 test/runtime/lexer_test.cc

diff --git a/src/runtime/utf16.c b/src/runtime/utf16.c
index a8ae6bdd..050caf19 100644
--- a/src/runtime/utf16.c
+++ b/src/runtime/utf16.c
@@ -1,6 +1,11 @@
 #include "runtime/utf16.h"
 
 int utf16_iterate(const uint8_t *string, size_t length, int32_t *code_point) {
+  if (length < 2) {
+    *code_point = -1;
+    return 0;
+  }
+
   uint16_t *units = (uint16_t *)string;
   uint16_t unit = units[0];
 
diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc
index 7bfc8304..99d04a51 100644
--- a/test/runtime/document_test.cc
+++ b/test/runtime/document_test.cc
@@ -72,19 +72,6 @@ describe("Document", [&]() {
         "(array (true) (false))");
     });
 
-    it("handles truncated UTF16 data", [&]() {
-      char *content = reinterpret_cast<char*>(malloc(1));
-
-      spy_input->content = string((const char *)content, 1);
-      spy_input->encoding = TSInputEncodingUTF16;
-
-      ts_document_set_input(document, spy_input->input());
-      ts_document_invalidate(document);
-      ts_document_parse(document);
-
-      free(content);
-    });
-
     it("allows columns to be measured in either bytes or characters", [&]() {
       const char16_t content[] = u"[true, false]";
       spy_input->content = string((const char *)content, sizeof(content));
diff --git a/test/runtime/lexer_test.cc b/test/runtime/lexer_test.cc
new file mode 100644
index 00000000..b4f370f2
--- /dev/null
+++ b/test/runtime/lexer_test.cc
@@ -0,0 +1,18 @@
+#include "test_helper.h"
+#include "runtime/utf16.h"
+
+START_TEST
+
+describe("Lexer", [&]() {
+    it("handles truncated UTF16 data", [&]() {
+      uint8_t *content = new uint8_t[1];
+      *content = 'A';
+
+      int32_t code_point = 0;
+      utf16_iterate(content, 1, &code_point);
+
+      delete[] content;
+    });
+});
+
+END_TEST

From 52cec9ed39a56f226d47499db6362a192138fd5a Mon Sep 17 00:00:00 2001
From: Phil Turnbull <philipturnbull@github.com>
Date: Tue, 18 Jul 2017 11:54:42 -0700
Subject: [PATCH 3/3] Rework SpyInput buffer handling

SpyInput uses a fixed-size buffer and explicitly zeros memory which is good for
catching logic errors but defeats valgrind's memory tracking. Use a separate
buffer of exactly the correct size for each request. This correctly catches the
problem under valgrind:

```
==8694== Invalid read of size 2
==8694==    at 0x54EFFB: utf16_iterate (utf16.c:10)
==8694==    by 0x551126: ts_lexer__get_lookahead (lexer.c:54)
==8694==    by 0x5515CD: ts_lexer_start (lexer.c:154)
==8694==    by 0x54699F: parser(long,...)(long long) (parser.c:297)
==8694==    by 0x54788A: parser__get_lookahead (parser.c:439)
==8694==    by 0x54B2D3: parser__advance (parser.c:1150)
==8694==    by 0x54C2AA: parser_parse (parser.c:1348)
==8694==    by 0x53F063: ts_document_parse_with_options (document.c:136)
==8694==    by 0x53EF43: ts_document_parse (document.c:107)
==8694==    by 0x4AED11: {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}::operator()() const (document_test.cc:82)
==8694==    by 0x4B56B6: std::_Function_handler<void (), {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}>::_M_invoke(std::_Any_data const&) (functional:1871)
==8694==    by 0x40F8C5: std::function<void ()>::operator()() const (functional:2267)
==8694==  Address 0x5d08be0 is 0 bytes inside a block of size 1 alloc'd
==8694==    at 0x4C2E80F: operator new[](unsigned long) (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==8694==    by 0x507C3E: SpyInput::read(void*, unsigned int*) (spy_input.cc:66)
==8694==    by 0x55103D: ts_lexer__get_chunk (lexer.c:29)
==8694==    by 0x5515B6: ts_lexer_start (lexer.c:152)
==8694==    by 0x54699F: parser(long,...)(long long) (parser.c:297)
==8694==    by 0x54788A: parser__get_lookahead (parser.c:439)
==8694==    by 0x54B2D3: parser__advance (parser.c:1150)
==8694==    by 0x54C2AA: parser_parse (parser.c:1348)
==8694==    by 0x53F063: ts_document_parse_with_options (document.c:136)
==8694==    by 0x53EF43: ts_document_parse (document.c:107)
==8694==    by 0x4AED11: {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}::operator()() const (document_test.cc:82)
==8694==    by 0x4B56B6: std::_Function_handler<void (), {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}>::_M_invoke(std::_Any_data const&) (functional:1871)
```
---
 test/helpers/spy_input.cc     | 18 ++++++++++++------
 test/helpers/spy_input.h      |  1 -
 test/runtime/document_test.cc | 10 ++++++++++
 test/runtime/lexer_test.cc    | 18 ------------------
 4 files changed, 22 insertions(+), 25 deletions(-)
 delete mode 100644 test/runtime/lexer_test.cc

diff --git a/test/helpers/spy_input.cc b/test/helpers/spy_input.cc
index 9edaf554..ec370b80 100644
--- a/test/helpers/spy_input.cc
+++ b/test/helpers/spy_input.cc
@@ -12,8 +12,7 @@ static const size_t UTF8_MAX_CHAR_SIZE = 4;
 
 SpyInput::SpyInput(string content, size_t chars_per_chunk) :
   chars_per_chunk(chars_per_chunk),
-  buffer_size(UTF8_MAX_CHAR_SIZE * chars_per_chunk),
-  buffer(new char[buffer_size]),
+  buffer(nullptr),
   byte_offset(0),
   content(content),
   encoding(TSInputEncodingUTF8),
@@ -57,12 +56,19 @@ const char * SpyInput::read(void *payload, uint32_t *bytes_read) {
    * This class stores its entire `content` in a contiguous buffer, but we want
    * to ensure that the code under test cannot accidentally read more than
    * `*bytes_read` bytes past the returned pointer. To make sure that this type
-   * of error does not fly, we copy the chunk into a zeroed-out buffer and
+   * of error does not fly, we allocate a separate buffer for each request and
    * return a reference to that buffer, rather than a pointer into the main
-   * content.
+   * content. The temporary buffer only fits `*bytes_read` bytes so valgrind
+   * can detect code reading too many bytes from the buffer.
    */
-  memset(spy->buffer, 0, spy->buffer_size);
-  memcpy(spy->buffer, result.data(), byte_count);
+  delete[] spy->buffer;
+  if (byte_count) {
+    spy->buffer = new char[byte_count];
+    memcpy(spy->buffer, result.data(), byte_count);
+  } else {
+    spy->buffer = nullptr;
+  }
+
   return spy->buffer;
 }
 
diff --git a/test/helpers/spy_input.h b/test/helpers/spy_input.h
index 9e0ee8d1..2a15ad9b 100644
--- a/test/helpers/spy_input.h
+++ b/test/helpers/spy_input.h
@@ -13,7 +13,6 @@ struct SpyInputEdit {
 
 class SpyInput {
   uint32_t chars_per_chunk;
-  uint32_t buffer_size;
   char *buffer;
   uint32_t byte_offset;
   std::vector<SpyInputEdit> undo_stack;
diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc
index 99d04a51..f3837836 100644
--- a/test/runtime/document_test.cc
+++ b/test/runtime/document_test.cc
@@ -72,6 +72,16 @@ describe("Document", [&]() {
         "(array (true) (false))");
     });
 
+    it("handles truncated UTF16 data", [&]() {
+      const char content[1] = { '\0' };
+      spy_input->content = string(content, sizeof(content));
+      spy_input->encoding = TSInputEncodingUTF16;
+
+      ts_document_set_input(document, spy_input->input());
+      ts_document_invalidate(document);
+      ts_document_parse(document);
+    });
+
     it("allows columns to be measured in either bytes or characters", [&]() {
       const char16_t content[] = u"[true, false]";
       spy_input->content = string((const char *)content, sizeof(content));
diff --git a/test/runtime/lexer_test.cc b/test/runtime/lexer_test.cc
deleted file mode 100644
index b4f370f2..00000000
--- a/test/runtime/lexer_test.cc
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "test_helper.h"
-#include "runtime/utf16.h"
-
-START_TEST
-
-describe("Lexer", [&]() {
-    it("handles truncated UTF16 data", [&]() {
-      uint8_t *content = new uint8_t[1];
-      *content = 'A';
-
-      int32_t code_point = 0;
-      utf16_iterate(content, 1, &code_point);
-
-      delete[] content;
-    });
-});
-
-END_TEST