From 035abc1e15e4f574eb6f794755f500d6f22f630a Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Mon, 17 Jul 2017 12:31:42 -0700 Subject: [PATCH 1/3] Add test for UTF16 out-of-bound read utf16_iterate does not check that 'length' is a multiple of two which leads to an out-of-bound read: ==105293== Conditional jump or move depends on uninitialised value(s) ==105293== at 0x54F014: utf16_iterate (utf16.c:7) ==105293== by 0x539251: string_iterate(TSInputEncoding, unsigned char const*, unsigned long, int*) (encoding_helpers.cc:15) ==105293== by 0x53939D: string_byte_for_character(TSInputEncoding, std::__cxx11::basic_string, std::allocator > const&, unsigned long, unsigned long) (encoding_helpers.cc:43) ==105293== by 0x507BAD: SpyInput::read(void*, unsigned int*) (spy_input.cc:47) ==105293== by 0x551049: ts_lexer__get_chunk (lexer.c:29) ==105293== by 0x5515C2: ts_lexer_start (lexer.c:152) ==105293== by 0x5469AB: parser(long,...)(long long) (parser.c:297) ==105293== by 0x547896: parser__get_lookahead (parser.c:439) ==105293== by 0x54B2DF: parser__advance (parser.c:1150) ==105293== by 0x54C2B6: parser_parse (parser.c:1348) ==105293== by 0x53F06F: ts_document_parse_with_options (document.c:136) ==105293== by 0x53EF4F: ts_document_parse (document.c:107) --- test/runtime/document_test.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc index 99d04a51..7bfc8304 100644 --- a/test/runtime/document_test.cc +++ b/test/runtime/document_test.cc @@ -72,6 +72,19 @@ describe("Document", [&]() { "(array (true) (false))"); }); + it("handles truncated UTF16 data", [&]() { + char *content = reinterpret_cast(malloc(1)); + + spy_input->content = string((const char *)content, 1); + spy_input->encoding = TSInputEncodingUTF16; + + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); + + free(content); + }); + it("allows columns to be measured in either bytes or characters", [&]() { const char16_t content[] = u"[true, false]"; spy_input->content = string((const char *)content, sizeof(content)); From e7662c2213eff6243d72bb42bf8b88bb1db9452d Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Mon, 17 Jul 2017 13:57:10 -0700 Subject: [PATCH 2/3] Handle out-of-bound read in utf16_iterate Also simplify the test so we call `utf16_iterate` directly. Calling `utf16_iterate` via `SpyInput` and `ts_document_parse` doesn't seem to reliably trigger the problem using valgrind. valgrind also doesn't detect the problem if we use a string literal like: `utf16_iterate("", 1, &code_point);` --- src/runtime/utf16.c | 5 +++++ test/runtime/document_test.cc | 13 ------------- test/runtime/lexer_test.cc | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) create mode 100644 test/runtime/lexer_test.cc diff --git a/src/runtime/utf16.c b/src/runtime/utf16.c index a8ae6bdd..050caf19 100644 --- a/src/runtime/utf16.c +++ b/src/runtime/utf16.c @@ -1,6 +1,11 @@ #include "runtime/utf16.h" int utf16_iterate(const uint8_t *string, size_t length, int32_t *code_point) { + if (length < 2) { + *code_point = -1; + return 0; + } + uint16_t *units = (uint16_t *)string; uint16_t unit = units[0]; diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc index 7bfc8304..99d04a51 100644 --- a/test/runtime/document_test.cc +++ b/test/runtime/document_test.cc @@ -72,19 +72,6 @@ describe("Document", [&]() { "(array (true) (false))"); }); - it("handles truncated UTF16 data", [&]() { - char *content = reinterpret_cast(malloc(1)); - - spy_input->content = string((const char *)content, 1); - spy_input->encoding = TSInputEncodingUTF16; - - ts_document_set_input(document, spy_input->input()); - ts_document_invalidate(document); - ts_document_parse(document); - - free(content); - }); - it("allows columns to be measured in either bytes or characters", [&]() { const char16_t content[] = u"[true, false]"; spy_input->content = string((const char *)content, sizeof(content)); diff --git a/test/runtime/lexer_test.cc b/test/runtime/lexer_test.cc new file mode 100644 index 00000000..b4f370f2 --- /dev/null +++ b/test/runtime/lexer_test.cc @@ -0,0 +1,18 @@ +#include "test_helper.h" +#include "runtime/utf16.h" + +START_TEST + +describe("Lexer", [&]() { + it("handles truncated UTF16 data", [&]() { + uint8_t *content = new uint8_t[1]; + *content = 'A'; + + int32_t code_point = 0; + utf16_iterate(content, 1, &code_point); + + delete[] content; + }); +}); + +END_TEST From 52cec9ed39a56f226d47499db6362a192138fd5a Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 18 Jul 2017 11:54:42 -0700 Subject: [PATCH 3/3] Rework SpyInput buffer handling SpyInput uses a fixed-size buffer and explicitly zeros memory which is good for catching logic errors but defeats valgrind's memory tracking. Use a separate buffer of exactly the correct size for each request. This correctly catches the problem under valgrind: ``` ==8694== Invalid read of size 2 ==8694== at 0x54EFFB: utf16_iterate (utf16.c:10) ==8694== by 0x551126: ts_lexer__get_lookahead (lexer.c:54) ==8694== by 0x5515CD: ts_lexer_start (lexer.c:154) ==8694== by 0x54699F: parser(long,...)(long long) (parser.c:297) ==8694== by 0x54788A: parser__get_lookahead (parser.c:439) ==8694== by 0x54B2D3: parser__advance (parser.c:1150) ==8694== by 0x54C2AA: parser_parse (parser.c:1348) ==8694== by 0x53F063: ts_document_parse_with_options (document.c:136) ==8694== by 0x53EF43: ts_document_parse (document.c:107) ==8694== by 0x4AED11: {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}::operator()() const (document_test.cc:82) ==8694== by 0x4B56B6: std::_Function_handler::_M_invoke(std::_Any_data const&) (functional:1871) ==8694== by 0x40F8C5: std::function::operator()() const (functional:2267) ==8694== Address 0x5d08be0 is 0 bytes inside a block of size 1 alloc'd ==8694== at 0x4C2E80F: operator new[](unsigned long) (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==8694== by 0x507C3E: SpyInput::read(void*, unsigned int*) (spy_input.cc:66) ==8694== by 0x55103D: ts_lexer__get_chunk (lexer.c:29) ==8694== by 0x5515B6: ts_lexer_start (lexer.c:152) ==8694== by 0x54699F: parser(long,...)(long long) (parser.c:297) ==8694== by 0x54788A: parser__get_lookahead (parser.c:439) ==8694== by 0x54B2D3: parser__advance (parser.c:1150) ==8694== by 0x54C2AA: parser_parse (parser.c:1348) ==8694== by 0x53F063: ts_document_parse_with_options (document.c:136) ==8694== by 0x53EF43: ts_document_parse (document.c:107) ==8694== by 0x4AED11: {lambda()#1}::operator()() const::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda()#4}::operator()() const (document_test.cc:82) ==8694== by 0x4B56B6: std::_Function_handler::_M_invoke(std::_Any_data const&) (functional:1871) ``` --- test/helpers/spy_input.cc | 18 ++++++++++++------ test/helpers/spy_input.h | 1 - test/runtime/document_test.cc | 10 ++++++++++ test/runtime/lexer_test.cc | 18 ------------------ 4 files changed, 22 insertions(+), 25 deletions(-) delete mode 100644 test/runtime/lexer_test.cc diff --git a/test/helpers/spy_input.cc b/test/helpers/spy_input.cc index 9edaf554..ec370b80 100644 --- a/test/helpers/spy_input.cc +++ b/test/helpers/spy_input.cc @@ -12,8 +12,7 @@ static const size_t UTF8_MAX_CHAR_SIZE = 4; SpyInput::SpyInput(string content, size_t chars_per_chunk) : chars_per_chunk(chars_per_chunk), - buffer_size(UTF8_MAX_CHAR_SIZE * chars_per_chunk), - buffer(new char[buffer_size]), + buffer(nullptr), byte_offset(0), content(content), encoding(TSInputEncodingUTF8), @@ -57,12 +56,19 @@ const char * SpyInput::read(void *payload, uint32_t *bytes_read) { * This class stores its entire `content` in a contiguous buffer, but we want * to ensure that the code under test cannot accidentally read more than * `*bytes_read` bytes past the returned pointer. To make sure that this type - * of error does not fly, we copy the chunk into a zeroed-out buffer and + * of error does not fly, we allocate a separate buffer for each request and * return a reference to that buffer, rather than a pointer into the main - * content. + * content. The temporary buffer only fits `*bytes_read` bytes so valgrind + * can detect code reading too many bytes from the buffer. */ - memset(spy->buffer, 0, spy->buffer_size); - memcpy(spy->buffer, result.data(), byte_count); + delete[] spy->buffer; + if (byte_count) { + spy->buffer = new char[byte_count]; + memcpy(spy->buffer, result.data(), byte_count); + } else { + spy->buffer = nullptr; + } + return spy->buffer; } diff --git a/test/helpers/spy_input.h b/test/helpers/spy_input.h index 9e0ee8d1..2a15ad9b 100644 --- a/test/helpers/spy_input.h +++ b/test/helpers/spy_input.h @@ -13,7 +13,6 @@ struct SpyInputEdit { class SpyInput { uint32_t chars_per_chunk; - uint32_t buffer_size; char *buffer; uint32_t byte_offset; std::vector undo_stack; diff --git a/test/runtime/document_test.cc b/test/runtime/document_test.cc index 99d04a51..f3837836 100644 --- a/test/runtime/document_test.cc +++ b/test/runtime/document_test.cc @@ -72,6 +72,16 @@ describe("Document", [&]() { "(array (true) (false))"); }); + it("handles truncated UTF16 data", [&]() { + const char content[1] = { '\0' }; + spy_input->content = string(content, sizeof(content)); + spy_input->encoding = TSInputEncodingUTF16; + + ts_document_set_input(document, spy_input->input()); + ts_document_invalidate(document); + ts_document_parse(document); + }); + it("allows columns to be measured in either bytes or characters", [&]() { const char16_t content[] = u"[true, false]"; spy_input->content = string((const char *)content, sizeof(content)); diff --git a/test/runtime/lexer_test.cc b/test/runtime/lexer_test.cc deleted file mode 100644 index b4f370f2..00000000 --- a/test/runtime/lexer_test.cc +++ /dev/null @@ -1,18 +0,0 @@ -#include "test_helper.h" -#include "runtime/utf16.h" - -START_TEST - -describe("Lexer", [&]() { - it("handles truncated UTF16 data", [&]() { - uint8_t *content = new uint8_t[1]; - *content = 'A'; - - int32_t code_point = 0; - utf16_iterate(content, 1, &code_point); - - delete[] content; - }); -}); - -END_TEST