From f394a48c0b87fb05988480d1c526486651492949 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Mon, 20 Mar 2017 16:54:19 -0700 Subject: [PATCH 1/2] utf8proc_iterate can set codepoint_ref to -1 and returns negative error --- src/runtime/lexer.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 123a29fd..5646e101 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -36,11 +36,17 @@ static void ts_lexer__get_lookahead(Lexer *self) { const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; uint32_t size = self->chunk_size - position_in_chunk + 1; - if (self->input.encoding == TSInputEncodingUTF8) - self->lookahead_size = - utf8proc_iterate(chunk, size, &self->data.lookahead); - else + if (self->input.encoding == TSInputEncodingUTF8) { + int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead); + if (lookahead_size < 0) { + self->lookahead_size = 1; + } else { + self->lookahead_size = lookahead_size; + } + } + else { self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead); + } } static void ts_lexer__advance(void *payload, bool skip) { From 7092d4522a8d8928d5540c1d33f1d7bcbc036a04 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 21 Mar 2017 09:58:35 -0700 Subject: [PATCH 2/2] Test demonstrating non-UTF8 input failure --- test/runtime/parser_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 7dfcf26b..7e409d45 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -473,6 +473,16 @@ describe("Parser", [&]() { AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';"))); AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"))); }); + + it("handles non-UTF8 characters", [&]() { + // ts_document_set_logger(document, stderr_logger_new(true)); + 
ts_document_print_debugging_graphs(document, true); + ts_document_set_language(document, load_real_language("javascript")); + ts_document_set_input_string(document, "cons\xeb\x00e=ls\x83l6hi');\x0a"); + ts_document_parse(document); + + AssertThat(ts_node_end_byte(root), Equals(strlen("cons\xeb\x00e=ls\x83l6hi');\x0a"))); + }); }); });