Merge pull request #92 from tree-sitter/utf16-oob

Add test for UTF16 out-of-bound read
2017-07-18 17:24:31 -07:00 · 2017-07-18 17:24:31 -07:00 · 10d28d4b56
commit 10d28d4b56
parent 4649c3a37f 52cec9ed39
4 changed files with 27 additions and 7 deletions
--- a/src/runtime/utf16.c
+++ b/src/runtime/utf16.c
@@ -1,6 +1,11 @@
 #include "runtime/utf16.h"

 int utf16_iterate(const uint8_t *string, size_t length, int32_t *code_point) {
+  if (length < 2) {
+    *code_point = -1;
+    return 0;
+  }
+
  uint16_t *units = (uint16_t *)string;
  uint16_t unit = units[0];

--- a/test/helpers/spy_input.cc
+++ b/test/helpers/spy_input.cc
@@ -12,8 +12,7 @@ static const size_t UTF8_MAX_CHAR_SIZE = 4;

 SpyInput::SpyInput(string content, size_t chars_per_chunk) :
  chars_per_chunk(chars_per_chunk),
-  buffer_size(UTF8_MAX_CHAR_SIZE * chars_per_chunk),
-  buffer(new char[buffer_size]),
+  buffer(nullptr),
  byte_offset(0),
  content(content),
  encoding(TSInputEncodingUTF8),
@@ -57,12 +56,19 @@ const char * SpyInput::read(void *payload, uint32_t *bytes_read) {
   * This class stores its entire `content` in a contiguous buffer, but we want
   * to ensure that the code under test cannot accidentally read more than
   * `*bytes_read` bytes past the returned pointer. To make sure that this type
-   * of error does not fly, we copy the chunk into a zeroed-out buffer and
+   * of error does not fly, we allocate a separate buffer for each request and
   * return a reference to that buffer, rather than a pointer into the main
-   * content.
+   * content. The temporary buffer only fits `*bytes_read` bytes so valgrind
+   * can detect code reading too many bytes from the buffer.
   */
-  memset(spy->buffer, 0, spy->buffer_size);
-  memcpy(spy->buffer, result.data(), byte_count);
+  delete[] spy->buffer;
+  if (byte_count) {
+    spy->buffer = new char[byte_count];
+    memcpy(spy->buffer, result.data(), byte_count);
+  } else {
+    spy->buffer = nullptr;
+  }
+
  return spy->buffer;
 }

--- a/test/helpers/spy_input.h
+++ b/test/helpers/spy_input.h
@@ -13,7 +13,6 @@ struct SpyInputEdit {

 class SpyInput {
  uint32_t chars_per_chunk;
-  uint32_t buffer_size;
  char *buffer;
  uint32_t byte_offset;
  std::vector<SpyInputEdit> undo_stack;
--- a/test/runtime/document_test.cc
+++ b/test/runtime/document_test.cc
@@ -74,6 +74,16 @@ describe("Document", [&]() {
        "(array (true) (false))");
    });

+    it("handles truncated UTF16 data", [&]() {
+      const char content[1] = { '\0' };
+      spy_input->content = string(content, sizeof(content));
+      spy_input->encoding = TSInputEncodingUTF16;
+
+      ts_document_set_input(document, spy_input->input());
+      ts_document_invalidate(document);
+      ts_document_parse(document);
+    });
+
    it("allows columns to be measured in either bytes or characters", [&]() {
      const char16_t content[] = u"[true, false]";
      spy_input->content = string((const char *)content, sizeof(content));