From 4c083252ecef53765a06790756fd1489fde142fd Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Tue, 30 Apr 2024 19:48:04 -0400 Subject: [PATCH] fix(lib): advance the lookahead end byte by 4 when there's an invalid code point This helps in the case where an edit was made in the middle of a code point but bytes 1-3 are still valid: up to 4 bytes may be read before the code point is identified as invalid, so the lookahead end byte is advanced by 4 bytes --- cli/src/tests/tree_test.rs | 23 +++++++++++++++++++++++ lib/src/lexer.c | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/tree_test.rs b/cli/src/tests/tree_test.rs index fb7297a9..793b24af 100644 --- a/cli/src/tests/tree_test.rs +++ b/cli/src/tests/tree_test.rs @@ -679,6 +679,29 @@ fn test_get_changed_ranges() { } } +#[test] +fn test_consistency_with_mid_codepoint_edit() { + let mut parser = Parser::new(); + parser.set_language(&get_language("php/php")).unwrap(); + let mut source_code = + b"\n usize { str::from_utf8(text).unwrap().find(substring).unwrap() } diff --git a/lib/src/lexer.c b/lib/src/lexer.c index d108c04e..b32a9201 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -365,7 +365,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { // Therefore, the next byte *after* the current (invalid) character // affects the interpretation of the current character. if (self->data.lookahead == TS_DECODE_ERROR) { - current_lookahead_end_byte++; + current_lookahead_end_byte += 4; // the maximum number of bytes read to identify an invalid code point } if (current_lookahead_end_byte > *lookahead_end_byte) {