feat!: properly handle UTF-16 endianness encoding

This commit is contained in:
Amaan Qureshi 2024-10-04 23:15:17 -04:00
parent cf8ed78a9a
commit 8943983df6
20 changed files with 485 additions and 50 deletions

View file

@ -27,6 +27,7 @@ OBJ := $(SRC:.c=.o)
ARFLAGS := rcs
CFLAGS ?= -O3 -Wall -Wextra -Wshadow -pedantic
override CFLAGS += -std=c11 -fPIC -fvisibility=hidden
override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE
override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include
# ABI versioning

View file

@ -15,7 +15,11 @@ let package = Package(
.target(name: "TreeSitter",
path: "lib",
sources: ["src/lib.c"],
cSettings: [.headerSearchPath("src")]),
cSettings: [
.headerSearchPath("src"),
.define("_POSIX_C_SOURCE", to: "200112L"),
.define("_DEFAULT_SOURCE"),
]),
],
cLanguageStandard: .c11
)

View file

@ -11,6 +11,8 @@ pub fn build(b: *std.Build) void {
lib.addCSourceFile(.{ .file = b.path("lib/src/lib.c"), .flags = &.{"-std=c11"} });
lib.addIncludePath(b.path("lib/include"));
lib.addIncludePath(b.path("lib/src"));
lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L");
lib.root_module.addCMacro("_DEFAULT_SOURCE", "");
lib.installHeadersDirectory(b.path("lib/include"), ".", .{});

View file

@ -6,7 +6,7 @@ use std::{
use anstyle::{AnsiColor, Color, Style};
use anyhow::{anyhow, Context, Result};
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand};
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum};
use clap_complete::{generate, Shell};
use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input};
use glob::glob;
@ -191,7 +191,7 @@ struct Parse {
)]
pub edits: Option<Vec<String>>,
#[arg(long, help = "The encoding of the input files")]
pub encoding: Option<String>,
pub encoding: Option<Encoding>,
#[arg(
long,
help = "Open `log.html` in the default browser, if `--debug-graph` is supplied"
@ -208,6 +208,13 @@ struct Parse {
pub no_ranges: bool,
}
// Text encodings accepted by the `parse` subcommand's `--encoding` option. Each variant is
// later translated to the matching `ffi::TSInputEncoding*` constant before parsing.
// NOTE(review): plain `//` comments are used deliberately; clap's derive macros may surface
// `///` doc comments on variants as help text, which would alter CLI output.
#[derive(ValueEnum, Clone)]
pub enum Encoding {
// UTF-8 input (maps to `ffi::TSInputEncodingUTF8`).
Utf8,
// UTF-16 input, little-endian byte order (maps to `ffi::TSInputEncodingUTF16LE`).
Utf16LE,
// UTF-16 input, big-endian byte order (maps to `ffi::TSInputEncodingUTF16BE`).
Utf16BE,
}
#[derive(Args)]
#[command(about = "Run a parser's tests", alias = "t")]
struct Test {
@ -773,15 +780,11 @@ impl Parse {
ParseOutput::Normal
};
let encoding = if let Some(encoding) = self.encoding {
match encoding.as_str() {
"utf16" => Some(ffi::TSInputEncodingUTF16),
"utf8" => Some(ffi::TSInputEncodingUTF8),
_ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")),
}
} else {
None
};
let encoding = self.encoding.map(|e| match e {
Encoding::Utf8 => ffi::TSInputEncodingUTF8,
Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE,
Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE,
});
let time = self.time;
let edits = self.edits.unwrap_or_default();

View file

@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul
let time = Instant::now();
#[inline(always)]
fn is_utf16_bom(bom_bytes: &[u8]) -> bool {
bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF]
fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool {
bom_bytes == [0xFF, 0xFE]
}
let tree = match opts.encoding {
Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16(&source_code_utf16, None)
#[inline(always)]
fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool {
bom_bytes == [0xFE, 0xFF]
}
let encoding = match opts.encoding {
None if source_code.len() >= 2 => {
if is_utf16_le_bom(&source_code[0..2]) {
Some(ffi::TSInputEncodingUTF16LE)
} else if is_utf16_be_bom(&source_code[0..2]) {
Some(ffi::TSInputEncodingUTF16BE)
} else {
None
}
}
None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => {
_ => opts.encoding,
};
let tree = match encoding {
Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16(&source_code_utf16, None)
parser.parse_utf16_le(&source_code_utf16, None)
}
Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16_be(&source_code_utf16, None)
}
_ => parser.parse(&source_code, None),
};

View file

@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
}
#[test]
fn test_parsing_with_custom_utf16_input() {
fn test_parsing_with_custom_utf16le_input() {
let mut parser = Parser::new();
parser.set_language(&get_language("rust")).unwrap();
let lines = ["pub fn foo() {", " 1", "}"]
.iter()
.map(|s| s.encode_utf16().collect::<Vec<_>>())
.map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
.collect::<Vec<_>>();
let newline = [('\n' as u16).to_le()];
let tree = parser
.parse_utf16_with(
.parse_utf16_le_with(
&mut |_, position| {
let row = position.row;
let column = position.column;
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
if column < lines[row].len() {
&lines[row][column..]
} else {
&[10]
&newline
}
} else {
&[]
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
#[test]
fn test_parsing_with_custom_utf16_be_input() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // Every source line as UTF-16 code units, each unit passed through `u16::to_be` so that
    // its in-memory byte order is big-endian.
    let lines: Vec<Vec<u16>> = ["pub fn foo() {", " 1", "}"]
        .iter()
        .map(|line| line.encode_utf16().map(u16::to_be).collect())
        .collect();
    let newline = [('\n' as u16).to_be()];

    let tree = parser
        .parse_utf16_be_with(
            &mut |_, position| match lines.get(position.row) {
                // Inside a line: hand back the remainder of that line.
                Some(line) if position.column < line.len() => &line[position.column..],
                // At (or past) the end of a line: supply the line terminator.
                Some(_) => &newline,
                // Beyond the last line: an empty chunk ends the input.
                None => &[],
            },
            None,
        )
        .unwrap();

    let root = tree.root_node();
    assert_eq!(
        root.to_sexp(),
        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))"
    );
    assert_eq!(root.kind(), "source_file");
    assert!(!root.has_error());
    assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
#[test]
fn test_parsing_with_callback_returning_owned_strings() {
let mut parser = Parser::new();
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {
// Parse UTF16 text with a BOM
let tree = parser
.parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
.parse_utf16_le(
"\u{FEFF}fn a() {}"
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>(),
None,
)
.unwrap();
assert_eq!(
tree.root_node().to_sexp(),
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
let source_code = "<script>a.</script>";
let utf16_source_code = source_code
.as_bytes()
.iter()
.map(|c| u16::from(*c))
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>();
let start_byte = 2 * source_code.find("a.").unwrap();
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
end_point: Point::new(0, end_byte),
}])
.unwrap();
let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
}

View file

@ -76,6 +76,8 @@ set_target_properties(tree-sitter
SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
DEFINE_SYMBOL "")
target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE)
configure_file(tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY)
include(GNUInstallDirs)

View file

@ -20,6 +20,7 @@ include = [
"/Cargo.toml",
"/src/*.h",
"/src/*.c",
"/src/portable/*",
"/src/unicode/*",
"/src/wasm/*",
"/include/tree_sitter/api.h",

View file

@ -36,7 +36,8 @@ pub struct TSLookaheadIterator {
_unused: [u8; 0],
}
pub const TSInputEncodingUTF8: TSInputEncoding = 0;
pub const TSInputEncodingUTF16: TSInputEncoding = 1;
pub const TSInputEncodingUTF16LE: TSInputEncoding = 1;
pub const TSInputEncodingUTF16BE: TSInputEncoding = 2;
pub type TSInputEncoding = ::core::ffi::c_uint;
pub const TSSymbolTypeRegular: TSSymbolType = 0;
pub const TSSymbolTypeAnonymous: TSSymbolType = 1;

View file

@ -41,6 +41,8 @@ fn main() {
.include(&src_path)
.include(&wasm_path)
.include(&include_path)
.define("_POSIX_C_SOURCE", "200112L")
.define("_DEFAULT_SOURCE", None)
.warnings(false)
.file(src_path.join("lib.c"))
.compile("tree-sitter");

View file

@ -610,6 +610,7 @@ impl Parser {
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
/// the new text using [`Tree::edit`].
#[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le instead")]
pub fn parse_utf16(
&mut self,
input: impl AsRef<[u16]>,
@ -617,7 +618,7 @@ impl Parser {
) -> Option<Tree> {
let code_points = input.as_ref();
let len = code_points.len();
self.parse_utf16_with(
self.parse_utf16_le_with(
&mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(),
old_tree,
)
@ -672,6 +673,45 @@ impl Parser {
}
}
/// Parse UTF8 text provided in chunks by a callback.
///
/// # Arguments:
/// * `callback` A function that is called with a byte offset and a position and returns a
///   chunk of text (anything that is `AsRef<[u8]>`). Each returned chunk is kept alive until
///   the next call, so the callback may return owned values such as vectors.
/// * `old_tree` An optional previously parsed tree that is forwarded to the parser.
///
/// Returns `None` when the underlying `ts_parser_parse` call yields a null tree.
pub fn parse_with_<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
    &mut self,
    callback: &mut F,
    old_tree: Option<&Tree>,
) -> Option<Tree> {
    // Trampoline handed to Tree-sitter as the input callback. `payload` points at the
    // `(callback, last_chunk)` pair built below.
    unsafe extern "C" fn read<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
        payload: *mut c_void,
        byte_offset: u32,
        position: ffi::TSPoint,
        bytes_read: *mut u32,
    ) -> *const c_char {
        let (callback, text) = payload.cast::<(&mut F, Option<T>)>().as_mut().unwrap();
        // Store the fresh chunk in the payload so the pointer returned below stays valid
        // after this function returns; the previous chunk is dropped here.
        *text = Some(callback(byte_offset as usize, position.into()));
        let chunk: &[u8] = text.as_ref().unwrap().as_ref();
        *bytes_read = chunk.len() as u32;
        chunk.as_ptr().cast::<c_char>()
    }

    // The payload bundles two things:
    // 1. A reference to the rust `callback`.
    // 2. The text returned by the most recent call to `callback`.
    let mut payload: (&mut F, Option<T>) = (callback, None);

    let c_input = ffi::TSInput {
        payload: core::ptr::addr_of_mut!(payload).cast::<c_void>(),
        read: Some(read::<T, F>),
        encoding: ffi::TSInputEncodingUTF8,
    };
    let c_old_tree = match old_tree {
        Some(tree) => tree.0.as_ptr(),
        None => ptr::null_mut(),
    };

    // SAFETY (NOTE(review): presumably `read` is only invoked while `ts_parser_parse` is
    // running, during which `payload` is still alive on this stack frame — confirm).
    let c_new_tree = unsafe { ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input) };
    NonNull::new(c_new_tree).map(Tree)
}
/// Parse UTF16 text provided in chunks by a callback.
///
/// # Arguments:
@ -682,10 +722,49 @@ impl Parser {
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
/// the new text using [`Tree::edit`].
#[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le_with instead")]
pub fn parse_utf16_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
&mut self,
callback: &mut F,
old_tree: Option<&Tree>,
) -> Option<Tree> {
// Compatibility shim: forwards both arguments unchanged to the little-endian variant, so
// input supplied through this deprecated entry point is always treated as UTF-16 LE.
self.parse_utf16_le_with(callback, old_tree)
}
/// Parse a slice of UTF16 little-endian text.
///
/// # Arguments:
/// * `input` The UTF16-encoded text to parse, given as a sequence of `u16` code units.
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
///   the new text using [`Tree::edit`].
pub fn parse_utf16_le(
    &mut self,
    input: impl AsRef<[u16]>,
    old_tree: Option<&Tree>,
) -> Option<Tree> {
    let units = input.as_ref();
    let unit_count = units.len();
    self.parse_utf16_le_with(
        // Serve everything from the requested offset onward; an empty slice marks the end.
        &mut |offset, _| if offset < unit_count { &units[offset..] } else { &[] },
        old_tree,
    )
}
/// Parse UTF16 little-endian text provided in chunks by a callback.
///
/// # Arguments:
/// * `callback` A function that takes a UTF16 code unit offset and position and returns a slice of
/// UTF16-encoded text starting at that offset and position. The slices can be of any
/// length. If the given position is at the end of the text, the callback should return an
/// empty slice.
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
/// the new text using [Tree::edit].
pub fn parse_utf16_le_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
&mut self,
callback: &mut F,
old_tree: Option<&Tree>,
) -> Option<Tree> {
// A pointer to this payload is passed on every call to the `read` C function.
// The payload contains two things:
@ -701,7 +780,7 @@ impl Parser {
position: ffi::TSPoint,
bytes_read: *mut u32,
) -> *const c_char {
let (callback, text) = payload.cast::<(&mut F, Option<T>)>().as_mut().unwrap();
let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
*text = Some(callback(
(byte_offset / 2) as usize,
Point {
@ -715,9 +794,83 @@ impl Parser {
}
let c_input = ffi::TSInput {
payload: core::ptr::addr_of_mut!(payload).cast::<c_void>(),
payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
read: Some(read::<T, F>),
encoding: ffi::TSInputEncodingUTF16,
encoding: ffi::TSInputEncodingUTF16LE,
};
let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
unsafe {
let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input);
NonNull::new(c_new_tree).map(Tree)
}
}
/// Parse a slice of UTF16 big-endian text.
///
/// # Arguments:
/// * `input` The UTF16-encoded text to parse, given as a sequence of `u16` code units.
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
///   the new text using [`Tree::edit`].
pub fn parse_utf16_be(
    &mut self,
    input: impl AsRef<[u16]>,
    old_tree: Option<&Tree>,
) -> Option<Tree> {
    let units = input.as_ref();
    let unit_count = units.len();
    self.parse_utf16_be_with(
        // Hand out the tail starting at `offset`; once the offset reaches the end of the
        // input, fall back to the default (empty) slice.
        &mut |offset, _| {
            (offset < unit_count)
                .then(|| &units[offset..])
                .unwrap_or_default()
        },
        old_tree,
    )
}
/// Parse UTF16 big-endian text provided in chunks by a callback.
///
/// # Arguments:
/// * `callback` A function that takes a UTF16 code unit offset and position and returns a slice of
/// UTF16-encoded text starting at that offset and position. The slices can be of any
/// length. If the given position is at the end of the text, the callback should return an
/// empty slice.
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
/// the new text using [Tree::edit].
pub fn parse_utf16_be_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
&mut self,
callback: &mut F,
old_tree: Option<&Tree>,
) -> Option<Tree> {
// A pointer to this payload is passed on every call to the `read` C function.
// The payload contains two things:
// 1. A reference to the rust `callback`.
// 2. The text that was returned from the previous call to `callback`. This allows the
// callback to return owned values like vectors.
let mut payload: (&mut F, Option<T>) = (callback, None);
// This C function is passed to Tree-sitter as the input callback.
unsafe extern "C" fn read<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
payload: *mut c_void,
byte_offset: u32,
position: ffi::TSPoint,
bytes_read: *mut u32,
) -> *const c_char {
let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
*text = Some(callback(
(byte_offset / 2) as usize,
Point {
row: position.row as usize,
column: position.column as usize / 2,
},
));
let slice = text.as_ref().unwrap().as_ref();
*bytes_read = slice.len() as u32 * 2;
slice.as_ptr() as *const c_char
}
let c_input = ffi::TSInput {
payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
read: Some(read::<T, F>),
encoding: ffi::TSInputEncodingUTF16BE,
};
let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());

View file

@ -172,7 +172,7 @@ TSTree *ts_parser_parse_wasm(
TSInput input = {
input_buffer,
call_parse_callback,
TSInputEncodingUTF16
TSInputEncodingUTF16LE
};
if (range_count) {
for (unsigned i = 0; i < range_count; i++) {

View file

@ -50,7 +50,8 @@ typedef struct TSLookaheadIterator TSLookaheadIterator;
typedef enum TSInputEncoding {
TSInputEncodingUTF8,
TSInputEncodingUTF16,
TSInputEncodingUTF16LE,
TSInputEncodingUTF16BE,
} TSInputEncoding;
typedef enum TSSymbolType {

View file

@ -83,9 +83,9 @@ static void ts_lexer__get_lookahead(Lexer *self) {
}
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
? ts_decode_utf8
: ts_decode_utf16;
UnicodeDecodeFunction decode =
self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 :
self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le : ts_decode_utf16_be;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);

View file

@ -1,5 +1,3 @@
#define _POSIX_C_SOURCE 200112L
#include "./alloc.c"
#include "./get_changed_ranges.c"
#include "./language.c"

View file

@ -1,5 +1,3 @@
#define _POSIX_C_SOURCE 200112L
#include <time.h>
#include <stdio.h>
#include <limits.h>

170
lib/src/portable/endian.h Normal file
View file

@ -0,0 +1,170 @@
// "License": Public Domain
// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like.
// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to
// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it
// an example on how to get the endian conversion functions on different platforms.
#ifndef PORTABLE_ENDIAN_H__
#define PORTABLE_ENDIAN_H__
#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__)
# define __WINDOWS__
#endif
#if defined(__linux__) || defined(__CYGWIN__) || defined(__GNU__) || defined(__EMSCRIPTEN__)
# include <endian.h>
#elif defined(__APPLE__)
# include <libkern/OSByteOrder.h>
# define htobe16(x) OSSwapHostToBigInt16(x)
# define htole16(x) OSSwapHostToLittleInt16(x)
# define be16toh(x) OSSwapBigToHostInt16(x)
# define le16toh(x) OSSwapLittleToHostInt16(x)
# define htobe32(x) OSSwapHostToBigInt32(x)
# define htole32(x) OSSwapHostToLittleInt32(x)
# define be32toh(x) OSSwapBigToHostInt32(x)
# define le32toh(x) OSSwapLittleToHostInt32(x)
# define htobe64(x) OSSwapHostToBigInt64(x)
# define htole64(x) OSSwapHostToLittleInt64(x)
# define be64toh(x) OSSwapBigToHostInt64(x)
# define le64toh(x) OSSwapLittleToHostInt64(x)
# define __BYTE_ORDER BYTE_ORDER
# define __BIG_ENDIAN BIG_ENDIAN
# define __LITTLE_ENDIAN LITTLE_ENDIAN
# define __PDP_ENDIAN PDP_ENDIAN
#elif defined(__OpenBSD__)
# include <endian.h>
# define __BYTE_ORDER BYTE_ORDER
# define __BIG_ENDIAN BIG_ENDIAN
# define __LITTLE_ENDIAN LITTLE_ENDIAN
# define __PDP_ENDIAN PDP_ENDIAN
#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)
# include <sys/endian.h>
/* <sys/endian.h> on these systems is expected to provide the be/leNNtoh names itself.
   Only fall back to the legacy betohNN/letohNN spellings when a name is missing, so the
   system's own definitions are never overridden with identifiers that may not exist. */
# ifndef be16toh
# define be16toh(x) betoh16(x)
# endif
# ifndef le16toh
# define le16toh(x) letoh16(x)
# endif
# ifndef be32toh
# define be32toh(x) betoh32(x)
# endif
# ifndef le32toh
# define le32toh(x) letoh32(x)
# endif
# ifndef be64toh
# define be64toh(x) betoh64(x)
# endif
# ifndef le64toh
# define le64toh(x) letoh64(x)
# endif
#elif defined(__WINDOWS__)
# include <winsock2.h>
# ifdef __GNUC__
# include <sys/param.h>
# endif
# if BYTE_ORDER == LITTLE_ENDIAN
# define htobe16(x) htons(x)
# define htole16(x) (x)
# define be16toh(x) ntohs(x)
# define le16toh(x) (x)
# define htobe32(x) htonl(x)
# define htole32(x) (x)
# define be32toh(x) ntohl(x)
# define le32toh(x) (x)
# define htobe64(x) htonll(x)
# define htole64(x) (x)
# define be64toh(x) ntohll(x)
# define le64toh(x) (x)
# elif BYTE_ORDER == BIG_ENDIAN
/* that would be xbox 360 */
# define htobe16(x) (x)
# define htole16(x) __builtin_bswap16(x)
# define be16toh(x) (x)
# define le16toh(x) __builtin_bswap16(x)
# define htobe32(x) (x)
# define htole32(x) __builtin_bswap32(x)
# define be32toh(x) (x)
# define le32toh(x) __builtin_bswap32(x)
# define htobe64(x) (x)
# define htole64(x) __builtin_bswap64(x)
# define be64toh(x) (x)
# define le64toh(x) __builtin_bswap64(x)
# else
# error byte order not supported
# endif
# define __BYTE_ORDER BYTE_ORDER
# define __BIG_ENDIAN BIG_ENDIAN
# define __LITTLE_ENDIAN LITTLE_ENDIAN
# define __PDP_ENDIAN PDP_ENDIAN
#elif defined(__QNXNTO__)
# include <gulliver.h>
# define __LITTLE_ENDIAN 1234
# define __BIG_ENDIAN 4321
# define __PDP_ENDIAN 3412
# if defined(__BIGENDIAN__)
# define __BYTE_ORDER __BIG_ENDIAN
# define htobe16(x) (x)
# define htobe32(x) (x)
# define htobe64(x) (x)
# define htole16(x) ENDIAN_SWAP16(x)
# define htole32(x) ENDIAN_SWAP32(x)
# define htole64(x) ENDIAN_SWAP64(x)
# elif defined(__LITTLEENDIAN__)
# define __BYTE_ORDER __LITTLE_ENDIAN
# define htole16(x) (x)
# define htole32(x) (x)
# define htole64(x) (x)
# define htobe16(x) ENDIAN_SWAP16(x)
# define htobe32(x) ENDIAN_SWAP32(x)
# define htobe64(x) ENDIAN_SWAP64(x)
# else
# error byte order not supported
# endif
# define be16toh(x) ENDIAN_BE16(x)
# define be32toh(x) ENDIAN_BE32(x)
# define be64toh(x) ENDIAN_BE64(x)
# define le16toh(x) ENDIAN_LE16(x)
# define le32toh(x) ENDIAN_LE32(x)
# define le64toh(x) ENDIAN_LE64(x)
#else
# error platform not supported
#endif
#endif

View file

@ -1,5 +1,3 @@
#define _POSIX_C_SOURCE 200112L
#include "tree_sitter/api.h"
#include "./array.h"
#include "./get_changed_ranges.h"

View file

@ -12,6 +12,29 @@ extern "C" {
#define U_EXPORT2
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "portable/endian.h"
// Little-endian variant of ICU's `U16_NEXT`: reads the code unit at `s[i]`, converts it from
// little-endian to host order, advances `i`, and stores the result in `c`. If that unit is a
// lead surrogate and the following unit (also converted to host order) is a trail surrogate,
// the pair is combined into one supplementary code point and `i` advances once more.
// `length` is compared against `i`, i.e. it bounds the index of the trail unit.
//
// The trail unit must go through `le16toh` as well; testing the raw stored value would
// reject valid surrogate pairs on hosts whose byte order is not little-endian.
#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=le16toh((s)[(i)++]); \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)!=(length) && U16_IS_TRAIL(__c2=le16toh((s)[(i)]))) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
// Big-endian variant of ICU's `U16_NEXT`: reads the code unit at `s[i]`, converts it from
// big-endian to host order, advances `i`, and stores the result in `c`. If that unit is a
// lead surrogate and the following unit (also converted to host order) is a trail surrogate,
// the pair is combined into one supplementary code point and `i` advances once more.
// `length` is compared against `i`, i.e. it bounds the index of the trail unit.
//
// The trail unit must go through `be16toh` as well; testing the raw stored value makes
// every surrogate pair fail the trail check on little-endian hosts, so characters outside
// the BMP would be returned as a lone lead surrogate.
#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=be16toh((s)[(i)++]); \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)!=(length) && U16_IS_TRAIL(__c2=be16toh((s)[(i)]))) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
@ -33,13 +56,23 @@ static inline uint32_t ts_decode_utf8(
return i;
}
static inline uint32_t ts_decode_utf16(
static inline uint32_t ts_decode_utf16_le(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U16_NEXT(((uint16_t *)string), i, length, *code_point);
U16_NEXT_LE(((uint16_t *)string), i, length, *code_point);
return i * 2;
}
// Decodes one code point from big-endian UTF-16 data.
//
// `string` is reinterpreted as an array of 16-bit code units and handed to `U16_NEXT_BE`,
// which writes the decoded value to `*code_point`. The return value is the number of BYTES
// consumed (code units read, multiplied by two).
//
// NOTE(review): `length` is forwarded to `U16_NEXT_BE` unchanged, while the index it is
// compared against counts 16-bit units — confirm which unit callers pass here.
static inline uint32_t ts_decode_utf16_be(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  const uint16_t *code_units = (const uint16_t *)string;
  uint32_t units_read = 0;
  U16_NEXT_BE(code_units, units_read, length, *code_point);
  return units_read * 2;
}

View file

@ -131,6 +131,8 @@ $emcc \
-std=c11 \
-D 'fprintf(...)=' \
-D NDEBUG= \
-D _POSIX_C_SOURCE=200112L \
-D _DEFAULT_SOURCE= \
-I ${SRC_DIR} \
-I lib/include \
--js-library ${WEB_DIR}/imports.js \