diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 4bb0195c..064f8465 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -287,8 +287,6 @@ describe("Parser", [&]() { describe("with non-ascii characters", [&]() { before_each([&]() { - chunk_size = 50; - // αβδ + 1 set_text("\u03b1\u03b2\u03b4 + 1"); @@ -370,17 +368,14 @@ describe("Parser", [&]() { }); it("recognizes UTF8 characters as single characters", [&]() { - // Inputs that return partial UTF8 characters are not yet supported - chunk_size = 50; - - // x # Ω — Δ - set_text("x # \u03A9 \u2014 \u0394"); + // x # ΩΩΩ — ΔΔ + set_text("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394"); AssertThat(ts_node_string(root), Equals("(DOCUMENT " "(expression (variable) (comment)))")); - AssertThat(ts_node_size(root).chars, Equals(strlen("x # O - D"))); - AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9 \u2014 \u0394"))); + AssertThat(ts_node_size(root).chars, Equals(strlen("x # OOO - DD"))); + AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394"))); }); }); }); diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index 9a63e11e..5e6650ef 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -4,40 +4,42 @@ #include "runtime/length.h" #include "utf8proc.h" +static const char *empty_chunk = ""; + static int advance(TSLexer *lexer) { - static const char *empty_chunk = ""; - lexer->lookahead = 0; - - if (lexer->chunk == empty_chunk) { - lexer->lookahead_size = 0; + /* + * Return false if the Lexer has already reached the end of the input. + */ + if (lexer->chunk == empty_chunk) return 0; + + /* + * Increment the Lexer's position. + */ + if (lexer->lookahead_size) { + lexer->current_position.bytes += lexer->lookahead_size; + lexer->current_position.chars += 1; } - if (lexer->chunk_start + lexer->chunk_size <= lexer->current_position.bytes + 1) { - if (lexer->lookahead_size) { - lexer->current_position.bytes += lexer->lookahead_size; - lexer->current_position.chars += 1; - } - lexer->lookahead_size = 0; + /* + * Request a new chunk of text from the Input if the Lexer has reached + * the end of the current chunk. + */ + if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) { lexer->chunk_start += lexer->chunk_size; lexer->chunk = lexer->input.read_fn(lexer->input.data, &lexer->chunk_size); + if (!lexer->chunk_size) + lexer->chunk = empty_chunk; } - if (lexer->chunk_size == 0) { - lexer->lookahead_size = 0; - lexer->chunk = empty_chunk; - } else { - if (lexer->lookahead_size) { - lexer->current_position.bytes += lexer->lookahead_size; - lexer->current_position.chars += 1; - } - - lexer->lookahead_size = utf8proc_iterate( - (const uint8_t *)lexer->chunk + (lexer->current_position.bytes - lexer->chunk_start), - lexer->chunk_start + lexer->chunk_size - lexer->current_position.bytes + 1, - &lexer->lookahead); - } + /* + * Read the next unicode character from the current chunk of text. + */ + size_t position_in_chunk = lexer->current_position.bytes - lexer->chunk_start; + lexer->lookahead_size = utf8proc_iterate( + (const uint8_t *)lexer->chunk + position_in_chunk, + lexer->chunk_size - position_in_chunk + 1, &lexer->lookahead); return 1; }