Merge pull request #74 from tree-sitter/check-utf8proc_iterate-return

Check utf8proc_iterate return
This commit is contained in:
Timothy Clem 2017-05-01 10:46:24 -07:00 committed by GitHub
commit f594ed2519
5 changed files with 32 additions and 9 deletions

View file

@ -36,11 +36,17 @@ static void ts_lexer__get_lookahead(Lexer *self) {
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
uint32_t size = self->chunk_size - position_in_chunk + 1;
if (self->input.encoding == TSInputEncodingUTF8)
self->lookahead_size =
utf8proc_iterate(chunk, size, &self->data.lookahead);
else
if (self->input.encoding == TSInputEncodingUTF8) {
int64_t lookahead_size = utf8proc_iterate(chunk, size, &self->data.lookahead);
if (lookahead_size < 0) {
self->lookahead_size = 1;
} else {
self->lookahead_size = lookahead_size;
}
}
else {
self->lookahead_size = utf16_iterate(chunk, size, &self->data.lookahead);
}
}
static void ts_lexer__advance(void *payload, bool skip) {

View file

@ -99,7 +99,7 @@ TreeArray ts_tree_array_remove_trailing_extras(TreeArray *self) {
return result;
}
Tree *ts_tree_make_error(Length size, Length padding, char lookahead_char) {
Tree *ts_tree_make_error(Length size, Length padding, int32_t lookahead_char) {
Tree *result = ts_tree_make_leaf(ts_builtin_sym_error, padding, size,
(TSSymbolMetadata){
.visible = true, .named = true,

View file

@ -76,7 +76,7 @@ Tree *ts_tree_make_leaf(TSSymbol, Length, Length, TSSymbolMetadata);
Tree *ts_tree_make_node(TSSymbol, uint32_t, Tree **, TSSymbolMetadata);
Tree *ts_tree_make_copy(Tree *child);
Tree *ts_tree_make_error_node(TreeArray *);
Tree *ts_tree_make_error(Length, Length, char);
Tree *ts_tree_make_error(Length, Length, int32_t);
void ts_tree_retain(Tree *tree);
void ts_tree_release(Tree *tree);
bool ts_tree_eq(const Tree *tree1, const Tree *tree2);

View file

@ -4,10 +4,16 @@
#include "utf8proc.h"
// Decodes one code point from the start of `string` into `*code_point` and
// returns the size of that character as reported by the encoding's iterator.
//
// encoding   - TSInputEncodingUTF8 selects utf8proc_iterate; any other value
//              is handled by utf16_iterate.
// string     - bytes to decode.
// length     - number of bytes available at `string`.
// code_point - receives the decoded code point.
//
// utf8proc_iterate reports a malformed sequence with a negative error code
// instead of a size. That value must never reach the caller as a size, so an
// undecodable byte is reported as a one-byte character. The UTF-8 result is
// checked on exactly one path; there is no unchecked early return.
static inline int string_iterate(TSInputEncoding encoding, const uint8_t *string, size_t length, int32_t *code_point) {
  if (encoding != TSInputEncodingUTF8) {
    return utf16_iterate(string, length, code_point);
  }

  int32_t character_size = utf8proc_iterate(string, length, code_point);
  if (character_size < 0) {
    // Invalid UTF-8: consume a single byte rather than returning the error code.
    return 1;
  }
  return character_size;
}
size_t string_char_count(TSInputEncoding encoding, const std::string &input) {

View file

@ -473,6 +473,17 @@ describe("Parser", [&]() {
AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';")));
AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';")));
});
// Regression case for input that is not valid UTF-8: 0xEB is a multi-byte
// lead byte that is not followed by a continuation byte, and 0x83 is a stray
// continuation byte.
it("handles non-UTF8 characters", [&]() {
// Note: "\x00e" is a single hex escape (value 0x0E), not a NUL followed by
// 'e', so the literal has no embedded terminator and strlen() spans all of it.
const char *string = "cons\xeb\x00e=ls\x83l6hi');\x0a";
ts_document_set_language(document, load_real_language("javascript"));
ts_document_set_input_string(document, string);
ts_document_parse(document);
TSNode root = ts_document_root_node(document);
// The root node's end byte must equal the byte length of the input.
AssertThat(ts_node_end_byte(root), Equals(strlen(string)));
});
});
});