/*
 * Patch summary: tolerate byte sequences that are not valid UTF-8.
 *
 * src/runtime/lexer.c
 *   In ts_lexer__get_lookahead, the UTF-8 branch now stores the result of
 *   utf8proc_iterate in a local first. A negative result (the value the
 *   code treats as a decoding failure) is replaced by a lookahead size of
 *   1; any other result is stored unchanged. The UTF-16 branch only gains
 *   braces.
 *
 * src/runtime/tree.c, src/runtime/tree.h
 *   The lookahead_char parameter of ts_tree_make_error changes from char
 *   to int32_t, in both the definition and the prototype.
 *
 * test/helpers/encoding_helpers.cc
 *   string_iterate gets the same treatment as the lexer: a negative
 *   result from utf8proc_iterate is reported as a size of 1.
 *
 * test/runtime/parser_test.cc
 *   Adds the test "handles non-UTF8 characters": it parses a string
 *   containing the bytes \xeb and \x83 with the javascript language and
 *   asserts that the root node's end byte equals strlen(string).
 *   NOTE(review): hex escapes consume every following hex digit, so
 *   "\x00e" in that literal is the single byte 0x0E, not a NUL followed
 *   by 'e' -- confirm that this is the intended input.
 *
 * NOTE(review): this patch was recovered from a whitespace-collapsed
 * copy. Line breaks, indentation and the position of blank context lines
 * were reconstructed to match the hunk line counts; verify them against
 * the target files before applying.
 */
diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c
index 123a29fd..5646e101 100644
--- a/src/runtime/lexer.c
+++ b/src/runtime/lexer.c
@@ -36,11 +36,17 @@ static void ts_lexer__get_lookahead(Lexer *self) {
   const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
   uint32_t size = self->chunk_size - position_in_chunk + 1;
 
-  if (self->input.encoding == TSInputEncodingUTF8)
-    self->lookahead_size =
-      utf8proc_iterate(chunk, size, &self->data.lookahead);
-  else
+  if (self->input.encoding == TSInputEncodingUTF8) {
+    int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead);
+    if (lookahead_size < 0) {
+      self->lookahead_size = 1;
+    } else {
+      self->lookahead_size = lookahead_size;
+    }
+  }
+  else {
     self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead);
+  }
 }
 
 static void ts_lexer__advance(void *payload, bool skip) {
diff --git a/src/runtime/tree.c b/src/runtime/tree.c
index 49f81e9c..195b6260 100644
--- a/src/runtime/tree.c
+++ b/src/runtime/tree.c
@@ -99,7 +99,7 @@ TreeArray ts_tree_array_remove_trailing_extras(TreeArray *self) {
   return result;
 }
 
-Tree *ts_tree_make_error(Length size, Length padding, char lookahead_char) {
+Tree *ts_tree_make_error(Length size, Length padding, int32_t lookahead_char) {
   Tree *result = ts_tree_make_leaf(ts_builtin_sym_error, padding, size,
                                    (TSSymbolMetadata){
                                      .visible = true, .named = true,
diff --git a/src/runtime/tree.h b/src/runtime/tree.h
index c08ba24b..f205af97 100644
--- a/src/runtime/tree.h
+++ b/src/runtime/tree.h
@@ -76,7 +76,7 @@ Tree *ts_tree_make_leaf(TSSymbol, Length, Length, TSSymbolMetadata);
 Tree *ts_tree_make_node(TSSymbol, uint32_t, Tree **, TSSymbolMetadata);
 Tree *ts_tree_make_copy(Tree *child);
 Tree *ts_tree_make_error_node(TreeArray *);
-Tree *ts_tree_make_error(Length, Length, char);
+Tree *ts_tree_make_error(Length, Length, int32_t);
 void ts_tree_retain(Tree *tree);
 void ts_tree_release(Tree *tree);
 bool ts_tree_eq(const Tree *tree1, const Tree *tree2);
diff --git a/test/helpers/encoding_helpers.cc b/test/helpers/encoding_helpers.cc
index 8ef9fec1..1169bb2d 100644
--- a/test/helpers/encoding_helpers.cc
+++ b/test/helpers/encoding_helpers.cc
@@ -4,10 +4,16 @@
 #include "utf8proc.h"
 
 static inline int string_iterate(TSInputEncoding encoding, const uint8_t *string, size_t length, int32_t *code_point) {
-  if (encoding == TSInputEncodingUTF8)
-    return utf8proc_iterate(string, length, code_point);
-  else
+  if (encoding == TSInputEncodingUTF8) {
+    int32_t character_size = utf8proc_iterate(string, length, code_point);
+    if (character_size < 0) {
+      return 1;
+    } else {
+      return character_size;
+    }
+  } else {
     return utf16_iterate(string, length, code_point);
+  }
 }
 
 size_t string_char_count(TSInputEncoding encoding, const std::string &input) {
diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc
index 7dfcf26b..e390b164 100644
--- a/test/runtime/parser_test.cc
+++ b/test/runtime/parser_test.cc
@@ -473,6 +473,17 @@ describe("Parser", [&]() {
       AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';")));
       AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';")));
     });
+
+    it("handles non-UTF8 characters", [&]() {
+      const char *string = "cons\xeb\x00e=ls\x83l6hi');\x0a";
+
+      ts_document_set_language(document, load_real_language("javascript"));
+      ts_document_set_input_string(document, string);
+      ts_document_parse(document);
+
+      TSNode root = ts_document_root_node(document);
+      AssertThat(ts_node_end_byte(root), Equals(strlen(string)));
+    });
   });
 });
 