From 91558f0a0e282aac314d75e2dd5d28176499fe07 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Mon, 20 Mar 2017 16:54:19 -0700 Subject: [PATCH 1/5] utf8proc_iterate can set codepoint_ref to -1 and returns negative error --- src/runtime/lexer.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 123a29fd..5646e101 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -36,11 +36,17 @@ static void ts_lexer__get_lookahead(Lexer *self) { const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; uint32_t size = self->chunk_size - position_in_chunk + 1; - if (self->input.encoding == TSInputEncodingUTF8) - self->lookahead_size = - utf8proc_iterate(chunk, size, &self->data.lookahead); - else + if (self->input.encoding == TSInputEncodingUTF8) { + int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead); + if (lookahead_size < 0) { + self->lookahead_size = 1; + } else { + self->lookahead_size = lookahead_size; + } + } + else { self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead); + } } static void ts_lexer__advance(void *payload, bool skip) { From 37f2a4745ff17c09c8b1dc316e8c0599c8190219 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 21 Mar 2017 09:58:35 -0700 Subject: [PATCH 2/5] Test demonstrating non-UTF8 input failure --- test/runtime/parser_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 7dfcf26b..7e409d45 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -473,6 +473,16 @@ describe("Parser", [&]() { AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';"))); AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"))); }); + + it("handles non-UTF8 characters", [&]() { + // ts_document_set_logger(document, stderr_logger_new(true)); + 
ts_document_print_debugging_graphs(document, true); + ts_document_set_language(document, load_real_language("javascript")); + ts_document_set_input_string(document, "cons\xeb\x00e=ls\x83l6hi');\x0a"); + ts_document_parse(document); + + AssertThat(ts_node_end_byte(root), Equals(strlen("cons\xeb\x00e=ls\x83l6hi');\x0a"))); + }); }); }); From 03a555a86ed79f102ba3a71005b5ad6e6b8c8c8d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 21 Mar 2017 11:05:32 -0700 Subject: [PATCH 3/5] Finish test for invalid UTF8 handling Signed-off-by: Tim Clem --- test/runtime/parser_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 7e409d45..e390b164 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -475,13 +475,14 @@ describe("Parser", [&]() { }); it("handles non-UTF8 characters", [&]() { - // ts_document_set_logger(document, stderr_logger_new(true)); - ts_document_print_debugging_graphs(document, true); + const char *string = "cons\xeb\x00e=ls\x83l6hi');\x0a"; + ts_document_set_language(document, load_real_language("javascript")); - ts_document_set_input_string(document, "cons\xeb\x00e=ls\x83l6hi');\x0a"); + ts_document_set_input_string(document, string); ts_document_parse(document); - AssertThat(ts_node_end_byte(root), Equals(strlen("cons\xeb\x00e=ls\x83l6hi');\x0a"))); + TSNode root = ts_document_root_node(document); + AssertThat(ts_node_end_byte(root), Equals(strlen(string))); }); }); }); From 704c2d5907fd56c0b85eb5602d722556a278dbc1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 21 Mar 2017 11:05:48 -0700 Subject: [PATCH 4/5] Fix lookahead_char type in ts_tree_make_error function --- src/runtime/tree.c | 2 +- src/runtime/tree.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/tree.c b/src/runtime/tree.c index 49f81e9c..195b6260 100644 --- a/src/runtime/tree.c +++ b/src/runtime/tree.c @@ -99,7 +99,7 @@ TreeArray 
ts_tree_array_remove_trailing_extras(TreeArray *self) { return result; } -Tree *ts_tree_make_error(Length size, Length padding, char lookahead_char) { +Tree *ts_tree_make_error(Length size, Length padding, int32_t lookahead_char) { Tree *result = ts_tree_make_leaf(ts_builtin_sym_error, padding, size, (TSSymbolMetadata){ .visible = true, .named = true, diff --git a/src/runtime/tree.h b/src/runtime/tree.h index c08ba24b..f205af97 100644 --- a/src/runtime/tree.h +++ b/src/runtime/tree.h @@ -76,7 +76,7 @@ Tree *ts_tree_make_leaf(TSSymbol, Length, Length, TSSymbolMetadata); Tree *ts_tree_make_node(TSSymbol, uint32_t, Tree **, TSSymbolMetadata); Tree *ts_tree_make_copy(Tree *child); Tree *ts_tree_make_error_node(TreeArray *); -Tree *ts_tree_make_error(Length, Length, char); +Tree *ts_tree_make_error(Length, Length, int32_t); void ts_tree_retain(Tree *tree); void ts_tree_release(Tree *tree); bool ts_tree_eq(const Tree *tree1, const Tree *tree2); From 82cb1c9806cee57cebe361abed5bc870e3d07b65 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 21 Mar 2017 11:12:08 -0700 Subject: [PATCH 5/5] Handle invalid UTF8 in encoding test helpers Signed-off-by: Tim Clem --- test/helpers/encoding_helpers.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/helpers/encoding_helpers.cc b/test/helpers/encoding_helpers.cc index 8ef9fec1..1169bb2d 100644 --- a/test/helpers/encoding_helpers.cc +++ b/test/helpers/encoding_helpers.cc @@ -4,10 +4,16 @@ #include "utf8proc.h" static inline int string_iterate(TSInputEncoding encoding, const uint8_t *string, size_t length, int32_t *code_point) { - if (encoding == TSInputEncodingUTF8) - return utf8proc_iterate(string, length, code_point); - else + if (encoding == TSInputEncodingUTF8) { + int32_t character_size = utf8proc_iterate(string, length, code_point); + if (character_size < 0) { + return 1; + } else { + return character_size; + } + } else { return utf16_iterate(string, length, code_point); + } } size_t 
string_char_count(TSInputEncoding encoding, const std::string &input) {