* Remove dependency on utf8proc
This removes the only external dependency on utf8proc for UTF-8 decoding. It does so by implementing its own UTF-8 decoder. This decoder is both faster and has a simpler API.
* .gitmodules: remove utf8proc submodule
* docs/section-2-using-parsers.md: remove requirement for utf8proc submodule
* docs/section-6-contributing.md: likewise
* lib/Cargo.toml: remove utf8proc subdirectory package include
* lib/README.md: remove utf8proc subdirectory description
* lib/binding_rust/build.rs: remove utf8proc compiler include directory
* lib/src/lexer.c: remove utf8proc dependencies and types
* lib/src/lib.c: remove utf8proc dependency
* lib/src/unicode.h: define types for Unicode decoders
* lib/src/utf16.{c,h}: implement more readable UTF-16 decoder
* lib/src/utf8.{c,h}: implement fast UTF-8 decoder
* lib/utf8proc: remove utf8proc submodule directory
* script/build-lib: remove utf8proc compiler include directory
* script/build-wasm: likewise
* Optimize ts_lexer__get_lookahead.
Try to favor the non-failure code path and assign lookahead values directly to the lexer.
* lib/src/lexer.c: optimize for non-failure code path
* Fix some compiler errors
* lib/src/lexer.c: cast from signed to unsigned for decode_next result
* lib/src/utf16.c: fix non-constant initializers for older compilers
* Remove some missed remnants of utf8proc
* docs/section-2-using-parsers.md: only two include paths necessary now
* lib/src/lib.c: no need to define UTF8PROC_STATIC
* Use ICU's utf8 and utf16 decoding routines
* Remove unnecessary casts when calling icu macros
* Check buffer length before attempting to decode a unicode character
* Use new unicode function when parsing Queries
Co-Authored-By: Matthew Krupcale <mkrupcale@matthewkrupcale.com>
* Mark libicu files as vendored for GitHub's stats
50 lines
964 B
C
50 lines
964 B
C
#ifndef TREE_SITTER_UNICODE_H_
|
|
#define TREE_SITTER_UNICODE_H_
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
#include <limits.h>
|
|
#include <stdint.h>
|
|
|
|
#define U_EXPORT
|
|
#define U_EXPORT2
|
|
#include "unicode/utf8.h"
|
|
#include "unicode/utf16.h"
|
|
|
|
// Error value for decoded code points; an alias for U_SENTINEL from the
// included unicode headers.
// NOTE(review): presumably the decoders store this in *code_point when the
// input is malformed or truncated - confirm against U8_NEXT / U16_NEXT.
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
|
|
|
|
// Common signature for the decoders in this header. Each one reads a single
// Unicode code point from `string` (bounded by `length`), stores it through
// `code_point`, and returns the number of bytes it consumed.
typedef uint32_t (*UnicodeDecodeFunction)(const uint8_t *string, uint32_t length, int32_t *code_point);
|
|
|
|
// Decodes one UTF-8 code point from `string`, bounded by `length` bytes, using
// the U8_NEXT macro from the included unicode/utf8.h. The decoded value is
// written to `*code_point`; the return value is the byte offset the macro
// advanced to, i.e. the number of bytes consumed.
static inline uint32_t ts_decode_utf8(const uint8_t *string, uint32_t length, int32_t *code_point) {
  uint32_t consumed = 0;
  U8_NEXT(string, consumed, length, *code_point);
  return consumed;
}
|
|
|
|
// Decodes one UTF-16 code point from `string` using the U16_NEXT macro from
// the included unicode/utf16.h. The decoded value is written to `*code_point`;
// the return value is the number of bytes consumed (two per 16-bit unit the
// macro advanced past).
//
// `length` is treated as a byte count, as in ts_decode_utf8, because both
// decoders share the UnicodeDecodeFunction signature. U16_NEXT indexes and
// bounds the string in 16-bit code units, so the byte count is converted
// before use. Passing the raw byte count would let the macro look ahead for a
// trailing surrogate beyond the end of a buffer that holds only a lead
// surrogate.
//
// NOTE(review): the cast assumes `string` is suitably aligned for 16-bit
// reads - confirm against callers.
static inline uint32_t ts_decode_utf16(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  // U16_NEXT always reads the first unit, so at least one must be available.
  // Clamping to 1 keeps a too-short buffer from enabling the look-ahead read.
  uint32_t unit_count = length < 2 ? 1 : length / 2;
  uint32_t i = 0;
  U16_NEXT(((const uint16_t *)string), i, unit_count, *code_point);
  return i * 2;
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // TREE_SITTER_UNICODE_H_
|