Read unicode characters correctly in Lexer advance
This commit is contained in:
parent
78c5fe8e02
commit
808b003f1a
2 changed files with 31 additions and 34 deletions
|
|
@ -287,8 +287,6 @@ describe("Parser", [&]() {
|
|||
|
||||
describe("with non-ascii characters", [&]() {
|
||||
before_each([&]() {
|
||||
chunk_size = 50;
|
||||
|
||||
// αβδ + 1
|
||||
set_text("\u03b1\u03b2\u03b4 + 1");
|
||||
|
||||
|
|
@ -370,17 +368,14 @@ describe("Parser", [&]() {
|
|||
});
|
||||
|
||||
it("recognizes UTF8 characters as single characters", [&]() {
|
||||
// Inputs that return partial UTF8 characters are not yet supported
|
||||
chunk_size = 50;
|
||||
|
||||
// x # Ω — Δ
|
||||
set_text("x # \u03A9 \u2014 \u0394");
|
||||
// x # ΩΩΩ — ΔΔ
|
||||
set_text("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394");
|
||||
|
||||
AssertThat(ts_node_string(root), Equals("(DOCUMENT "
|
||||
"(expression (variable) (comment)))"));
|
||||
|
||||
AssertThat(ts_node_size(root).chars, Equals(strlen("x # O - D")));
|
||||
AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9 \u2014 \u0394")));
|
||||
AssertThat(ts_node_size(root).chars, Equals(strlen("x # OOO - DD")));
|
||||
AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394")));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -4,40 +4,42 @@
|
|||
#include "runtime/length.h"
|
||||
#include "utf8proc.h"
|
||||
|
||||
static const char *empty_chunk = "";
|
||||
|
||||
static int advance(TSLexer *lexer) {
|
||||
static const char *empty_chunk = "";
|
||||
|
||||
lexer->lookahead = 0;
|
||||
|
||||
if (lexer->chunk == empty_chunk) {
|
||||
lexer->lookahead_size = 0;
|
||||
/*
|
||||
* Return false if the Lexer has already reached the end of the input.
|
||||
*/
|
||||
if (lexer->chunk == empty_chunk)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Increment the Lexer's position.
|
||||
*/
|
||||
if (lexer->lookahead_size) {
|
||||
lexer->current_position.bytes += lexer->lookahead_size;
|
||||
lexer->current_position.chars += 1;
|
||||
}
|
||||
|
||||
if (lexer->chunk_start + lexer->chunk_size <= lexer->current_position.bytes + 1) {
|
||||
if (lexer->lookahead_size) {
|
||||
lexer->current_position.bytes += lexer->lookahead_size;
|
||||
lexer->current_position.chars += 1;
|
||||
}
|
||||
lexer->lookahead_size = 0;
|
||||
/*
|
||||
* Request a new chunk of text from the Input if the Lexer has reached
|
||||
* the end of the current chunk.
|
||||
*/
|
||||
if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) {
|
||||
lexer->chunk_start += lexer->chunk_size;
|
||||
lexer->chunk = lexer->input.read_fn(lexer->input.data, &lexer->chunk_size);
|
||||
if (!lexer->chunk_size)
|
||||
lexer->chunk = empty_chunk;
|
||||
}
|
||||
|
||||
if (lexer->chunk_size == 0) {
|
||||
lexer->lookahead_size = 0;
|
||||
lexer->chunk = empty_chunk;
|
||||
} else {
|
||||
if (lexer->lookahead_size) {
|
||||
lexer->current_position.bytes += lexer->lookahead_size;
|
||||
lexer->current_position.chars += 1;
|
||||
}
|
||||
|
||||
lexer->lookahead_size = utf8proc_iterate(
|
||||
(const uint8_t *)lexer->chunk + (lexer->current_position.bytes - lexer->chunk_start),
|
||||
lexer->chunk_start + lexer->chunk_size - lexer->current_position.bytes + 1,
|
||||
&lexer->lookahead);
|
||||
}
|
||||
/*
|
||||
* Read the next unicode character from the current chunk of text.
|
||||
*/
|
||||
size_t position_in_chunk = lexer->current_position.bytes - lexer->chunk_start;
|
||||
lexer->lookahead_size = utf8proc_iterate(
|
||||
(const uint8_t *)lexer->chunk + position_in_chunk,
|
||||
lexer->chunk_size - position_in_chunk + 1, &lexer->lookahead);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue