tree-sitter/lib/src/unicode.h

#ifndef TREE_SITTER_UNICODE_H_
#define TREE_SITTER_UNICODE_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>
#include <stdint.h>

#define U_EXPORT
#define U_EXPORT2
#include "unicode/utf8.h"
#include "unicode/utf16.h"

// Sentinel code point value, aliased from ICU's U_SENTINEL.
// NOTE(review): presumably the value a decoder leaves in *code_point when the
// input cannot be decoded -- confirm against the ICU macro docs and callers.
static const int32_t TS_DECODE_ERROR = U_SENTINEL;

// These functions read one unicode code point from the given string,
// returning the number of bytes consumed.
//
//   string:     start of the encoded input.
//   length:     limit on how much input may be examined.
//   code_point: output location that receives the decoded code point.
typedef uint32_t (*UnicodeDecodeFunction)(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
);

// Decodes the code point at the start of a UTF-8 buffer with ICU's U8_NEXT.
//
//   string:     start of the UTF-8 encoded bytes.
//   length:     limit handed to U8_NEXT for its byte index.
//   code_point: receives whatever value U8_NEXT produces.
//
// Returns the byte index reached by U8_NEXT, i.e. the number of bytes consumed.
// NOTE(review): the macro is invoked unconditionally, so callers are presumably
// expected to pass a non-empty buffer -- confirm against callers.
static inline uint32_t ts_decode_utf8(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  uint32_t consumed = 0;
  int32_t decoded = 0;
  // U8_NEXT advances `consumed` past the sequence and stores the result in `decoded`.
  U8_NEXT(string, consumed, length, decoded);
  *code_point = decoded;
  return consumed;
}

// Decodes the code point at the start of a UTF-16 buffer with ICU's U16_NEXT.
//
//   string:     start of the input; the bytes are read as 16-bit code units in
//               the host's byte order.
//               NOTE(review): assumes `string` is suitably aligned for
//               uint16_t access -- confirm against callers.
//   length:     forwarded unchanged to U16_NEXT as the limit for its code-unit
//               index.
//               NOTE(review): U16_NEXT indexes 16-bit units; if callers pass a
//               byte count here the limit is twice too large -- confirm.
//   code_point: receives whatever value U16_NEXT produces.
//
// Returns the number of bytes consumed (code units read, times two).
static inline uint32_t ts_decode_utf16(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  uint32_t i = 0;
  // Keep the const qualifier when viewing the bytes as code units: the input
  // is only read, never written.
  U16_NEXT(((const uint16_t *)string), i, length, *code_point);
  return i * 2;
}

#ifdef __cplusplus
}
#endif

#endif  // TREE_SITTER_UNICODE_H_
lib: remove utf8proc dependency (#436) * Remove dependency on utf8proc This removes the only external dependency on utf8proc for UTF-8 decoding. It does so by implementing its own UTF-8 decoder. This decoder is both faster and has a simpler API. * .gitmodules: remove utf8proc submodule * docs/section-2-using-parsers.md: remove requirement for utf8proc submodule * docs/section-6-contributing.md: likewise * lib/Cargo.toml: remove utf8proc subdirectory package include * lib/README.md: remove utf8proc subdirectory description * lib/binding_rust/build.rs: remove utf8proc compiler include directory * lib/src/lexer.c: remove utf8proc dependencies and types * lib/src/lib.c: remove utf8proc dependency * lib/src/unicode.h: define types for Unicode decoders * lib/src/utf16.{c,h}: implement more readable UTF-16 decoder * lib/src/utf8.{c,h}: implement fast UTF-8 decoder * lib/utf8proc: remove utf8proc submodule directory * script/build-lib: remove utf8proc compiler include directory * script/build-wasm: likewise * Optimize ts_lexer__get_lookahead. Try to favor non-failure code path and assign lookahead values directly to lexer * lib/src/lexer.c: optimize for non-failure code path * Fix some compiler errors * lib/src/lexer.c: cast from signed to unsigned for decode_next result * lib/src/utf16.c: fix non-constant initializers for older compilers * Remove some missed remnants of utf8proc * docs/section-2-using-parsers.md: only two include paths necessary now * lib/src/lib.c: no need to define UTF8PROC_STATIC * Use ICU's utf8 and utf16 decoding routines * Remove unnecessary casts when calling icu macros * Check buffer length before attempting to decode a unicode character * Use new unicode function when parsing Queries Co-Authored-By: Matthew Krupcale <mkrupcale@matthewkrupcale.com> * Mark libicu files as vendored for GitHub's stats 2019-10-14 14:18:39 -04:00			`#ifndef TREE_SITTER_UNICODE_H_`
			`#define TREE_SITTER_UNICODE_H_`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`#include <limits.h>`
			`#include <stdint.h>`

			`#define U_EXPORT`
			`#define U_EXPORT2`
			`#include "unicode/utf8.h"`
			`#include "unicode/utf16.h"`

			`static const int32_t TS_DECODE_ERROR = U_SENTINEL;`

			`// These functions read one unicode code point from the given string,`
			`// returning the number of bytes consumed.`
			`typedef uint32_t (*UnicodeDecodeFunction)(`
			`const uint8_t *string,`
			`uint32_t length,`
			`int32_t *code_point`
			`);`

			`static inline uint32_t ts_decode_utf8(`
			`const uint8_t *string,`
			`uint32_t length,`
			`int32_t *code_point`
			`) {`
			`uint32_t i = 0;`
			`U8_NEXT(string, i, length, *code_point);`
			`return i;`
			`}`

			`static inline uint32_t ts_decode_utf16(`
			`const uint8_t *string,`
			`uint32_t length,`
			`int32_t *code_point`
			`) {`
			`uint32_t i = 0;`
			`U16_NEXT(((uint16_t *)string), i, length, *code_point);`
			`return i * 2;`
			`}`

			`#ifdef __cplusplus`
			`}`
			`#endif`

			`#endif // TREE_SITTER_UNICODE_H_`