diff --git a/Makefile b/Makefile index 3a737ddf..22b878a5 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ OBJ := $(SRC:.c=.o) ARFLAGS := rcs CFLAGS ?= -O3 -Wall -Wextra -Wshadow -pedantic override CFLAGS += -std=c11 -fPIC -fvisibility=hidden +override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include # ABI versioning diff --git a/Package.swift b/Package.swift index 3a4b9744..86e1ef01 100644 --- a/Package.swift +++ b/Package.swift @@ -15,7 +15,11 @@ let package = Package( .target(name: "TreeSitter", path: "lib", sources: ["src/lib.c"], - cSettings: [.headerSearchPath("src")]), + cSettings: [ + .headerSearchPath("src"), + .define("_POSIX_C_SOURCE", to: "200112L"), + .define("_DEFAULT_SOURCE"), + ]), ], cLanguageStandard: .c11 ) diff --git a/build.zig b/build.zig index 8c1273c7..da577bae 100644 --- a/build.zig +++ b/build.zig @@ -11,6 +11,8 @@ pub fn build(b: *std.Build) void { lib.addCSourceFile(.{ .file = b.path("lib/src/lib.c"), .flags = &.{"-std=c11"} }); lib.addIncludePath(b.path("lib/include")); lib.addIncludePath(b.path("lib/src")); + lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L"); + lib.root_module.addCMacro("_DEFAULT_SOURCE", ""); lib.installHeadersDirectory(b.path("lib/include"), ".", .{}); diff --git a/cli/src/main.rs b/cli/src/main.rs index 1758fada..72d70a4e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -6,7 +6,7 @@ use std::{ use anstyle::{AnsiColor, Color, Style}; use anyhow::{anyhow, Context, Result}; -use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand}; +use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum}; use clap_complete::{generate, Shell}; use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input}; use glob::glob; @@ -191,7 +191,7 @@ struct Parse { )] pub edits: Option>, #[arg(long, help = "The encoding of the input files")] - pub encoding: Option, + pub encoding: Option, #[arg( long, help = "Open 
`log.html` in the default browser, if `--debug-graph` is supplied" @@ -208,6 +208,13 @@ struct Parse { pub no_ranges: bool, } +#[derive(ValueEnum, Clone)] +pub enum Encoding { + Utf8, + Utf16LE, + Utf16BE, +} + #[derive(Args)] #[command(about = "Run a parser's tests", alias = "t")] struct Test { @@ -773,15 +780,11 @@ impl Parse { ParseOutput::Normal }; - let encoding = if let Some(encoding) = self.encoding { - match encoding.as_str() { - "utf16" => Some(ffi::TSInputEncodingUTF16), - "utf8" => Some(ffi::TSInputEncodingUTF8), - _ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")), - } - } else { - None - }; + let encoding = self.encoding.map(|e| match e { + Encoding::Utf8 => ffi::TSInputEncodingUTF8, + Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE, + Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE, + }); let time = self.time; let edits = self.edits.unwrap_or_default(); diff --git a/cli/src/parse.rs b/cli/src/parse.rs index 6bc41721..69fa6387 100644 --- a/cli/src/parse.rs +++ b/cli/src/parse.rs @@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul let time = Instant::now(); #[inline(always)] - fn is_utf16_bom(bom_bytes: &[u8]) -> bool { - bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF] + fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool { + bom_bytes == [0xFF, 0xFE] } - let tree = match opts.encoding { - Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => { - let source_code_utf16 = source_code - .chunks_exact(2) - .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]])) - .collect::>(); - parser.parse_utf16(&source_code_utf16, None) + #[inline(always)] + fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool { + bom_bytes == [0xFE, 0xFF] + } + + let encoding = match opts.encoding { + None if source_code.len() >= 2 => { + if is_utf16_le_bom(&source_code[0..2]) { + Some(ffi::TSInputEncodingUTF16LE) + } else if is_utf16_be_bom(&source_code[0..2]) { + Some(ffi::TSInputEncodingUTF16BE) + } else { + 
None + } } - None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => { + _ => opts.encoding, + }; + + let tree = match encoding { + Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => { let source_code_utf16 = source_code .chunks_exact(2) .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]])) .collect::>(); - parser.parse_utf16(&source_code_utf16, None) + parser.parse_utf16_le(&source_code_utf16, None) + } + Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => { + let source_code_utf16 = source_code + .chunks_exact(2) + .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])) + .collect::>(); + parser.parse_utf16_be(&source_code_utf16, None) } _ => parser.parse(&source_code, None), }; diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index e1319395..d2181999 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() { } #[test] -fn test_parsing_with_custom_utf16_input() { +fn test_parsing_with_custom_utf16le_input() { let mut parser = Parser::new(); parser.set_language(&get_language("rust")).unwrap(); let lines = ["pub fn foo() {", " 1", "}"] .iter() - .map(|s| s.encode_utf16().collect::>()) + .map(|s| s.encode_utf16().map(|u| u.to_le()).collect::>()) .collect::>(); + let newline = [('\n' as u16).to_le()]; + let tree = parser - .parse_utf16_with( + .parse_utf16_le_with( &mut |_, position| { let row = position.row; let column = position.column; @@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() { if column < lines[row].len() { &lines[row][column..] 
} else { - &[10] + &newline } } else { &[] @@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() { assert_eq!(root.child(0).unwrap().kind(), "function_item"); } +#[test] +fn test_parsing_with_custom_utf16_be_input() { + let mut parser = Parser::new(); + parser.set_language(&get_language("rust")).unwrap(); + + let lines: Vec> = ["pub fn foo() {", " 1", "}"] + .iter() + .map(|s| s.encode_utf16().collect::>()) + .map(|v| v.iter().map(|u| u.to_be()).collect()) + .collect(); + + let newline = [('\n' as u16).to_be()]; + + let tree = parser + .parse_utf16_be_with( + &mut |_, position| { + let row = position.row; + let column = position.column; + if row < lines.len() { + if column < lines[row].len() { + &lines[row][column..] + } else { + &newline + } + } else { + &[] + } + }, + None, + ) + .unwrap(); + let root = tree.root_node(); + assert_eq!( + root.to_sexp(), + "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))" + ); + assert_eq!(root.kind(), "source_file"); + assert!(!root.has_error()); + assert_eq!(root.child(0).unwrap().kind(), "function_item"); +} + #[test] fn test_parsing_with_callback_returning_owned_strings() { let mut parser = Parser::new(); @@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() { // Parse UTF16 text with a BOM let tree = parser - .parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::>(), None) + .parse_utf16_le( + "\u{FEFF}fn a() {}" + .encode_utf16() + .map(|u| u.to_le()) + .collect::>(), + None, + ) .unwrap(); assert_eq!( tree.root_node().to_sexp(), @@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() { fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() { let source_code = ""; let utf16_source_code = source_code - .as_bytes() - .iter() - .map(|c| u16::from(*c)) + .encode_utf16() + .map(|u| u.to_le()) .collect::>(); let start_byte = 2 * source_code.find("a.").unwrap(); @@ -1102,7 +1150,7 @@ fn 
test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() { end_point: Point::new(0, end_byte), }]) .unwrap(); - let tree = parser.parse_utf16(&utf16_source_code, None).unwrap(); + let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap(); assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))"); } diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 4acca095..a6b20bbd 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -76,6 +76,8 @@ set_target_properties(tree-sitter SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}" DEFINE_SYMBOL "") +target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE) + configure_file(tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY) include(GNUInstallDirs) diff --git a/lib/Cargo.toml b/lib/Cargo.toml index c261e36c..3f3d4fd9 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -20,6 +20,7 @@ include = [ "/Cargo.toml", "/src/*.h", "/src/*.c", + "/src/portable/*", "/src/unicode/*", "/src/wasm/*", "/include/tree_sitter/api.h", diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index fe41b900..e7d17b16 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -36,7 +36,8 @@ pub struct TSLookaheadIterator { _unused: [u8; 0], } pub const TSInputEncodingUTF8: TSInputEncoding = 0; -pub const TSInputEncodingUTF16: TSInputEncoding = 1; +pub const TSInputEncodingUTF16LE: TSInputEncoding = 1; +pub const TSInputEncodingUTF16BE: TSInputEncoding = 2; pub type TSInputEncoding = ::core::ffi::c_uint; pub const TSSymbolTypeRegular: TSSymbolType = 0; pub const TSSymbolTypeAnonymous: TSSymbolType = 1; diff --git a/lib/binding_rust/build.rs b/lib/binding_rust/build.rs index 6f44e83c..a9e553de 100644 --- a/lib/binding_rust/build.rs +++ b/lib/binding_rust/build.rs @@ -41,6 +41,8 @@ fn main() { .include(&src_path) .include(&wasm_path) .include(&include_path) + .define("_POSIX_C_SOURCE", "200112L") + 
.define("_DEFAULT_SOURCE", None) .warnings(false) .file(src_path.join("lib.c")) .compile("tree-sitter"); diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index bba21bea..eaf6df12 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -610,6 +610,7 @@ impl Parser { /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the /// document has changed since `old_tree` was created, then you must edit `old_tree` to match /// the new text using [`Tree::edit`]. + #[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le instead")] pub fn parse_utf16( &mut self, input: impl AsRef<[u16]>, @@ -617,7 +618,7 @@ impl Parser { ) -> Option { let code_points = input.as_ref(); let len = code_points.len(); - self.parse_utf16_with( + self.parse_utf16_le_with( &mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(), old_tree, ) @@ -672,6 +673,45 @@ impl Parser { } } + pub fn parse_with_, F: FnMut(usize, Point) -> T>( + &mut self, + callback: &mut F, + old_tree: Option<&Tree>, + ) -> Option { + // A pointer to this payload is passed on every call to the `read` C function. + // The payload contains two things: + // 1. A reference to the rust `callback`. + // 2. The text that was returned from the previous call to `callback`. This allows the + // callback to return owned values like vectors. + let mut payload: (&mut F, Option) = (callback, None); + + // This C function is passed to Tree-sitter as the input callback. 
+ unsafe extern "C" fn read, F: FnMut(usize, Point) -> T>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let (callback, text) = (payload as *mut (&mut F, Option)).as_mut().unwrap(); + *text = Some(callback(byte_offset as usize, position.into())); + let slice = text.as_ref().unwrap().as_ref(); + *bytes_read = slice.len() as u32; + slice.as_ptr().cast::() + } + + let c_input = ffi::TSInput { + payload: &mut payload as *mut (&mut F, Option) as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncodingUTF8, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr()); + unsafe { + let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input); + NonNull::new(c_new_tree).map(Tree) + } + } + /// Parse UTF16 text provided in chunks by a callback. /// /// # Arguments: @@ -682,10 +722,49 @@ impl Parser { /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the /// document has changed since `old_tree` was created, then you must edit `old_tree` to match /// the new text using [`Tree::edit`]. + #[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le_with instead")] pub fn parse_utf16_with, F: FnMut(usize, Point) -> T>( &mut self, callback: &mut F, old_tree: Option<&Tree>, + ) -> Option { + self.parse_utf16_le_with(callback, old_tree) + } + + /// Parse a slice of UTF16 little-endian text. + /// + /// # Arguments: + /// * `input` The UTF16 little-endian text to parse. + /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the + /// document has changed since `old_tree` was created, then you must edit `old_tree` to match + /// the new text using [`Tree::edit`].
+ pub fn parse_utf16_le( + &mut self, + input: impl AsRef<[u16]>, + old_tree: Option<&Tree>, + ) -> Option { + let code_points = input.as_ref(); + let len = code_points.len(); + self.parse_utf16_le_with( + &mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(), + old_tree, + ) + } + + /// Parse UTF16 little-endian text provided in chunks by a callback. + /// + /// # Arguments: + /// * `callback` A function that takes a code unit offset and position and returns a slice of + /// UTF16-encoded text starting at that offset and position. The slices can be of any + /// length. If the given position is at the end of the text, the callback should return an + /// empty slice. + /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the + /// document has changed since `old_tree` was created, then you must edit `old_tree` to match + /// the new text using [`Tree::edit`]. + pub fn parse_utf16_le_with, F: FnMut(usize, Point) -> T>( + &mut self, + callback: &mut F, + old_tree: Option<&Tree>, ) -> Option { // A pointer to this payload is passed on every call to the `read` C function.
// The payload contains two things: @@ -701,7 +780,7 @@ impl Parser { position: ffi::TSPoint, bytes_read: *mut u32, ) -> *const c_char { - let (callback, text) = payload.cast::<(&mut F, Option)>().as_mut().unwrap(); + let (callback, text) = (payload as *mut (&mut F, Option)).as_mut().unwrap(); *text = Some(callback( (byte_offset / 2) as usize, Point { @@ -715,9 +794,83 @@ impl Parser { } let c_input = ffi::TSInput { - payload: core::ptr::addr_of_mut!(payload).cast::(), + payload: &mut payload as *mut (&mut F, Option) as *mut c_void, read: Some(read::), - encoding: ffi::TSInputEncodingUTF16, + encoding: ffi::TSInputEncodingUTF16LE, + }; + + let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr()); + unsafe { + let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input); + NonNull::new(c_new_tree).map(Tree) + } + } + + /// Parse a slice of UTF16 big-endian text. + /// + /// # Arguments: + /// * `input` The UTF16 big-endian text to parse. + /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the + /// document has changed since `old_tree` was created, then you must edit `old_tree` to match + /// the new text using [`Tree::edit`]. + pub fn parse_utf16_be( + &mut self, + input: impl AsRef<[u16]>, + old_tree: Option<&Tree>, + ) -> Option { + let code_points = input.as_ref(); + let len = code_points.len(); + self.parse_utf16_be_with( + &mut |i, _| if i < len { &code_points[i..] } else { &[] }, + old_tree, + ) + } + + /// Parse UTF16 big-endian text provided in chunks by a callback. + /// + /// # Arguments: + /// * `callback` A function that takes a code unit offset and position and returns a slice of + /// UTF16-encoded text starting at that offset and position. The slices can be of any + /// length. If the given position is at the end of the text, the callback should return an + /// empty slice. + /// * `old_tree` A previous syntax tree parsed from the same document.
If the text of the + /// document has changed since `old_tree` was created, then you must edit `old_tree` to match + /// the new text using [Tree::edit]. + pub fn parse_utf16_be_with, F: FnMut(usize, Point) -> T>( + &mut self, + callback: &mut F, + old_tree: Option<&Tree>, + ) -> Option { + // A pointer to this payload is passed on every call to the `read` C function. + // The payload contains two things: + // 1. A reference to the rust `callback`. + // 2. The text that was returned from the previous call to `callback`. This allows the + // callback to return owned values like vectors. + let mut payload: (&mut F, Option) = (callback, None); + + // This C function is passed to Tree-sitter as the input callback. + unsafe extern "C" fn read, F: FnMut(usize, Point) -> T>( + payload: *mut c_void, + byte_offset: u32, + position: ffi::TSPoint, + bytes_read: *mut u32, + ) -> *const c_char { + let (callback, text) = (payload as *mut (&mut F, Option)).as_mut().unwrap(); + *text = Some(callback( + (byte_offset / 2) as usize, + Point { + row: position.row as usize, + column: position.column as usize / 2, + }, + )); + let slice = text.as_ref().unwrap().as_ref(); + *bytes_read = slice.len() as u32 * 2; + slice.as_ptr() as *const c_char + } + let c_input = ffi::TSInput { + payload: &mut payload as *mut (&mut F, Option) as *mut c_void, + read: Some(read::), + encoding: ffi::TSInputEncodingUTF16BE, }; let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr()); diff --git a/lib/binding_web/binding.c b/lib/binding_web/binding.c index 36efb042..23faeafe 100644 --- a/lib/binding_web/binding.c +++ b/lib/binding_web/binding.c @@ -172,7 +172,7 @@ TSTree *ts_parser_parse_wasm( TSInput input = { input_buffer, call_parse_callback, - TSInputEncodingUTF16 + TSInputEncodingUTF16LE }; if (range_count) { for (unsigned i = 0; i < range_count; i++) { diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index 362c236e..73bcc136 100644 --- 
a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -50,7 +50,8 @@ typedef struct TSLookaheadIterator TSLookaheadIterator; typedef enum TSInputEncoding { TSInputEncodingUTF8, - TSInputEncodingUTF16, + TSInputEncodingUTF16LE, + TSInputEncodingUTF16BE, } TSInputEncoding; typedef enum TSSymbolType { diff --git a/lib/src/lexer.c b/lib/src/lexer.c index e795618d..84af1c65 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -83,9 +83,9 @@ static void ts_lexer__get_lookahead(Lexer *self) { } const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; - UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 - ? ts_decode_utf8 - : ts_decode_utf16; + UnicodeDecodeFunction decode = + self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 : + self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le : ts_decode_utf16_be; self->lookahead_size = decode(chunk, size, &self->data.lookahead); diff --git a/lib/src/lib.c b/lib/src/lib.c index 70671ee6..9bfb69f0 100644 --- a/lib/src/lib.c +++ b/lib/src/lib.c @@ -1,5 +1,3 @@ -#define _POSIX_C_SOURCE 200112L - #include "./alloc.c" #include "./get_changed_ranges.c" #include "./language.c" diff --git a/lib/src/parser.c b/lib/src/parser.c index d38ace38..7d71d374 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -1,5 +1,3 @@ -#define _POSIX_C_SOURCE 200112L - #include #include #include diff --git a/lib/src/portable/endian.h b/lib/src/portable/endian.h new file mode 100644 index 00000000..cc8271c2 --- /dev/null +++ b/lib/src/portable/endian.h @@ -0,0 +1,170 @@ +// "License": Public Domain +// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like. +// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to +// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. 
Consider it +// an example on how to get the endian conversion functions on different platforms. + +#ifndef PORTABLE_ENDIAN_H__ +#define PORTABLE_ENDIAN_H__ + +#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__) + +# define __WINDOWS__ + +#endif + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__GNU__) || defined(__EMSCRIPTEN__) + +# include + +#elif defined(__APPLE__) + +# include + +# define htobe16(x) OSSwapHostToBigInt16(x) +# define htole16(x) OSSwapHostToLittleInt16(x) +# define be16toh(x) OSSwapBigToHostInt16(x) +# define le16toh(x) OSSwapLittleToHostInt16(x) + +# define htobe32(x) OSSwapHostToBigInt32(x) +# define htole32(x) OSSwapHostToLittleInt32(x) +# define be32toh(x) OSSwapBigToHostInt32(x) +# define le32toh(x) OSSwapLittleToHostInt32(x) + +# define htobe64(x) OSSwapHostToBigInt64(x) +# define htole64(x) OSSwapHostToLittleInt64(x) +# define be64toh(x) OSSwapBigToHostInt64(x) +# define le64toh(x) OSSwapLittleToHostInt64(x) + +# define __BYTE_ORDER BYTE_ORDER +# define __BIG_ENDIAN BIG_ENDIAN +# define __LITTLE_ENDIAN LITTLE_ENDIAN +# define __PDP_ENDIAN PDP_ENDIAN + +#elif defined(__OpenBSD__) + +# include + +# define __BYTE_ORDER BYTE_ORDER +# define __BIG_ENDIAN BIG_ENDIAN +# define __LITTLE_ENDIAN LITTLE_ENDIAN +# define __PDP_ENDIAN PDP_ENDIAN + +#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__) + +# include + +# define be16toh(x) betoh16(x) +# define le16toh(x) letoh16(x) + +# define be32toh(x) betoh32(x) +# define le32toh(x) letoh32(x) + +# define be64toh(x) betoh64(x) +# define le64toh(x) letoh64(x) + +#elif defined(__WINDOWS__) + +# include +# ifdef __GNUC__ +# include +# endif + +# if BYTE_ORDER == LITTLE_ENDIAN + +# define htobe16(x) htons(x) +# define htole16(x) (x) +# define be16toh(x) ntohs(x) +# define le16toh(x) (x) + +# define htobe32(x) htonl(x) +# define htole32(x) (x) +# define be32toh(x) ntohl(x) +# define le32toh(x) (x) + +# define htobe64(x) htonll(x) +# define 
htole64(x) (x) +# define be64toh(x) ntohll(x) +# define le64toh(x) (x) + +# elif BYTE_ORDER == BIG_ENDIAN + + /* that would be xbox 360 */ +# define htobe16(x) (x) +# define htole16(x) __builtin_bswap16(x) +# define be16toh(x) (x) +# define le16toh(x) __builtin_bswap16(x) + +# define htobe32(x) (x) +# define htole32(x) __builtin_bswap32(x) +# define be32toh(x) (x) +# define le32toh(x) __builtin_bswap32(x) + +# define htobe64(x) (x) +# define htole64(x) __builtin_bswap64(x) +# define be64toh(x) (x) +# define le64toh(x) __builtin_bswap64(x) + +# else + +# error byte order not supported + +# endif + +# define __BYTE_ORDER BYTE_ORDER +# define __BIG_ENDIAN BIG_ENDIAN +# define __LITTLE_ENDIAN LITTLE_ENDIAN +# define __PDP_ENDIAN PDP_ENDIAN + +#elif defined(__QNXNTO__) + +# include + +# define __LITTLE_ENDIAN 1234 +# define __BIG_ENDIAN 4321 +# define __PDP_ENDIAN 3412 + +# if defined(__BIGENDIAN__) + +# define __BYTE_ORDER __BIG_ENDIAN + +# define htobe16(x) (x) +# define htobe32(x) (x) +# define htobe64(x) (x) + +# define htole16(x) ENDIAN_SWAP16(x) +# define htole32(x) ENDIAN_SWAP32(x) +# define htole64(x) ENDIAN_SWAP64(x) + +# elif defined(__LITTLEENDIAN__) + +# define __BYTE_ORDER __LITTLE_ENDIAN + +# define htole16(x) (x) +# define htole32(x) (x) +# define htole64(x) (x) + +# define htobe16(x) ENDIAN_SWAP16(x) +# define htobe32(x) ENDIAN_SWAP32(x) +# define htobe64(x) ENDIAN_SWAP64(x) + +# else + +# error byte order not supported + +# endif + +# define be16toh(x) ENDIAN_BE16(x) +# define be32toh(x) ENDIAN_BE32(x) +# define be64toh(x) ENDIAN_BE64(x) +# define le16toh(x) ENDIAN_LE16(x) +# define le32toh(x) ENDIAN_LE32(x) +# define le64toh(x) ENDIAN_LE64(x) + +#else + +# error platform not supported + +#endif + +#endif diff --git a/lib/src/tree.c b/lib/src/tree.c index 55e79a7e..bb451180 100644 --- a/lib/src/tree.c +++ b/lib/src/tree.c @@ -1,5 +1,3 @@ -#define _POSIX_C_SOURCE 200112L - #include "tree_sitter/api.h" #include "./array.h" #include
"./get_changed_ranges.h" diff --git a/lib/src/unicode.h b/lib/src/unicode.h index 0fba56a6..efeee1fd 100644 --- a/lib/src/unicode.h +++ b/lib/src/unicode.h @@ -12,6 +12,29 @@ extern "C" { #define U_EXPORT2 #include "unicode/utf8.h" #include "unicode/utf16.h" +#include "portable/endian.h" + +#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=le16toh((s)[(i)++]); \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && U16_IS_TRAIL(__c2=le16toh((s)[(i)]))) { /* trail unit needs the same byte-order conversion as the lead */ \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=be16toh((s)[(i)++]); \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && U16_IS_TRAIL(__c2=be16toh((s)[(i)]))) { /* trail unit needs the same byte-order conversion as the lead */ \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END static const int32_t TS_DECODE_ERROR = U_SENTINEL; @@ -33,13 +56,23 @@ static inline uint32_t ts_decode_utf8( return i; } -static inline uint32_t ts_decode_utf16( +static inline uint32_t ts_decode_utf16_le( const uint8_t *string, uint32_t length, int32_t *code_point ) { uint32_t i = 0; - U16_NEXT(((uint16_t *)string), i, length, *code_point); + U16_NEXT_LE(((uint16_t *)string), i, length, *code_point); return i * 2; } + +static inline uint32_t ts_decode_utf16_be( + const uint8_t *string, + uint32_t length, + int32_t *code_point +) { + uint32_t i = 0; + U16_NEXT_BE(((uint16_t *)string), i, length, *code_point); return i * 2; } diff --git a/script/build-wasm b/script/build-wasm index 19e22412..8cd32331 100755 --- a/script/build-wasm +++ b/script/build-wasm @@ -131,6 +131,8 @@ $emcc \ -std=c11 \ -D 'fprintf(...)=' \ -D NDEBUG= \ + -D _POSIX_C_SOURCE=200112L \ + -D _DEFAULT_SOURCE= \ -I ${SRC_DIR} \ -I lib/include \ --js-library ${WEB_DIR}/imports.js \