diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c
index 123a29fd..5646e101 100644
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@@ -36,11 +36,14 @@ static void ts_lexer__get_lookahead(Lexer *self) {
   const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
   uint32_t size = self->chunk_size - position_in_chunk + 1;
 
-  if (self->input.encoding == TSInputEncodingUTF8)
-    self->lookahead_size =
-      utf8proc_iterate(chunk, size, &self->data.lookahead);
-  else
+  if (self->input.encoding == TSInputEncodingUTF8) {
+    // utf8proc_iterate reports invalid input with a negative return value.
+    // Fall back to a lookahead size of 1 instead of storing that negative value.
+    int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead);
+    self->lookahead_size = lookahead_size < 0 ? 1 : lookahead_size;
+  } else {
     self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead);
+  }
 }
 
 static void ts_lexer__advance(void *payload, bool skip) {
diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc
index 7dfcf26b..7e409d45 100644
--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@@ -473,6 +473,18 @@ describe("Parser", [&]() {
       AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';")));
       AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';")));
     });
+
+    it("handles non-UTF8 characters", [&]() {
+      const char *input = "cons\xeb\x00e=ls\x83l6hi');\x0a";
+
+      ts_document_set_language(document, load_real_language("javascript"));
+      ts_document_set_input_string(document, input);
+      ts_document_parse(document);
+
+      // Refresh `root` after parsing; nothing else in this test assigns it.
+      root = ts_document_root_node(document);
+      AssertThat(ts_node_end_byte(root), Equals(strlen(input)));
+    });
   });
 });
 