Fix OOB reads at ends of chunks

Signed-off-by: Philip Turnbull <philipturnbull@github.com>
2017-06-23 12:09:16 -07:00 · 2017-06-23 12:09:16 -07:00 · f62ee5a0f3
commit f62ee5a0f3
parent 8ee3f96960
2 changed files with 20 additions and 1 deletions
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@@ -34,7 +34,13 @@ static void ts_lexer__get_chunk(Lexer *self) {
 static void ts_lexer__get_lookahead(Lexer *self) {
  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
-  uint32_t size = self->chunk_size - position_in_chunk + 1;
+  uint32_t size = self->chunk_size - position_in_chunk;
+
+  if (size == 0) {
+    self->lookahead_size = 1;
+    self->data.lookahead = '\0';
+    return;
+  }

  if (self->input.encoding == TSInputEncodingUTF8) {
    int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead);
--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@@ -187,6 +187,19 @@ describe("Parser", [&]() {
        AssertThat(ts_node_end_point(error), Equals<TSPoint>({2, 2}));
      });
    });
+
+    it("handles invalid UTF8 characters at EOF", [&]() {
+      char *string = (char *)malloc(1);
+      string[0] = '\xdf';
+
+      ts_document_set_language(document, load_real_language("javascript"));
+      ts_document_set_input_string_with_length(document, string, 1);
+      ts_document_parse(document);
+
+      free(string);
+
+      assert_root_node("(ERROR (UNEXPECTED INVALID))");
+    });
  });

  describe("handling extra tokens", [&]() {