Read unicode characters correctly in Lexer advance

This commit is contained in:
Max Brunsfeld 2014-10-03 15:44:49 -07:00
parent 78c5fe8e02
commit 808b003f1a
2 changed files with 31 additions and 34 deletions

View file

@ -287,8 +287,6 @@ describe("Parser", [&]() {
describe("with non-ascii characters", [&]() {
before_each([&]() {
chunk_size = 50;
// αβδ + 1
set_text("\u03b1\u03b2\u03b4 + 1");
@ -370,17 +368,14 @@ describe("Parser", [&]() {
});
it("recognizes UTF8 characters as single characters", [&]() {
// Inputs that return partial UTF8 characters are not yet supported
chunk_size = 50;
// x # Ω — Δ
set_text("x # \u03A9 \u2014 \u0394");
// x # ΩΩΩ — ΔΔ
set_text("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394");
AssertThat(ts_node_string(root), Equals("(DOCUMENT "
"(expression (variable) (comment)))"));
AssertThat(ts_node_size(root).chars, Equals(strlen("x # O - D")));
AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9 \u2014 \u0394")));
AssertThat(ts_node_size(root).chars, Equals(strlen("x # OOO - DD")));
AssertThat(ts_node_size(root).bytes, Equals(strlen("x # \u03A9\u03A9\u03A9 \u2014 \u0394\u0394")));
});
});
});

View file

@ -4,40 +4,42 @@
#include "runtime/length.h"
#include "utf8proc.h"
static const char *empty_chunk = "";
/*
 * Advance the lexer by one character: step the position past the previous
 * lookahead, fetch a new chunk through input.read_fn once the current chunk
 * is used up, and decode the next code point into lexer->lookahead.
 * Returns 0 if lexer->chunk is already the empty_chunk sentinel, 1 otherwise.
 *
 * NOTE(review): this text comes from a commit view whose +/- markers were
 * stripped, so removed and added lines are interleaved. As shown, the braces
 * do not balance and several steps appear more than once. Presumably the
 * statements introduced by block comments are the post-commit body and the
 * remaining ones are the pre-commit body; confirm against the repository
 * before treating this listing as compilable code.
 */
static int advance(TSLexer *lexer) {
/*
 * Sentinel chunk, compared by pointer identity below to detect end of input.
 * NOTE(review): the same declaration also appears at file scope just above
 * this function; presumably one of the two is the removed side of the diff.
 */
static const char *empty_chunk = "";
lexer->lookahead = 0;
if (lexer->chunk == empty_chunk) {
lexer->lookahead_size = 0;
/*
* Return false if the Lexer has already reached the end of the input.
*/
if (lexer->chunk == empty_chunk)
return 0;
/*
* Increment the Lexer's position.
*/
/* A lookahead_size of 0 means no character has been consumed, so the position is left alone. */
if (lexer->lookahead_size) {
lexer->current_position.bytes += lexer->lookahead_size;
lexer->current_position.chars += 1;
}
/*
 * NOTE(review): this end-of-chunk test (`<=` against bytes + 1) differs from
 * the `>=` test on bytes a few lines below; presumably it is the pre-commit
 * form. The nested position increment repeats the one above.
 */
if (lexer->chunk_start + lexer->chunk_size <= lexer->current_position.bytes + 1) {
if (lexer->lookahead_size) {
lexer->current_position.bytes += lexer->lookahead_size;
lexer->current_position.chars += 1;
}
lexer->lookahead_size = 0;
/*
* Request a new chunk of text from the Input if the Lexer has reached
* the end of the current chunk.
*/
if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) {
/* chunk_start moves forward by the size of the chunk being left; read_fn then overwrites chunk_size. */
lexer->chunk_start += lexer->chunk_size;
lexer->chunk = lexer->input.read_fn(lexer->input.data, &lexer->chunk_size);
/* A zero-sized chunk marks the end of input: switch to the sentinel so the next call returns 0. */
if (!lexer->chunk_size)
lexer->chunk = empty_chunk;
}
/*
 * NOTE(review): this if/else repeats the end-of-input handling, the position
 * increment and the decode step found elsewhere in the listing; presumably
 * it is the pre-commit body.
 */
if (lexer->chunk_size == 0) {
lexer->lookahead_size = 0;
lexer->chunk = empty_chunk;
} else {
if (lexer->lookahead_size) {
lexer->current_position.bytes += lexer->lookahead_size;
lexer->current_position.chars += 1;
}
lexer->lookahead_size = utf8proc_iterate(
(const uint8_t *)lexer->chunk + (lexer->current_position.bytes - lexer->chunk_start),
lexer->chunk_start + lexer->chunk_size - lexer->current_position.bytes + 1,
&lexer->lookahead);
}
/*
* Read the next unicode character from the current chunk of text.
*/
/* Byte offset of the current position within the current chunk. */
size_t position_in_chunk = lexer->current_position.bytes - lexer->chunk_start;
/*
 * The length passed is the number of bytes left in the chunk plus one.
 * NOTE(review): confirm that the buffer returned by read_fn is readable one
 * byte past chunk_size (e.g. NUL-terminated). utf8proc_iterate reports
 * invalid input with a negative return value, which is stored in
 * lookahead_size unchecked here.
 */
lexer->lookahead_size = utf8proc_iterate(
(const uint8_t *)lexer->chunk + position_in_chunk,
lexer->chunk_size - position_in_chunk + 1, &lexer->lookahead);
return 1;
}