feat!: properly handle UTF-16 endianness encoding

2024-10-04 23:15:17 -04:00 · 2024-10-04 23:15:17 -04:00 · 8943983df6
commit 8943983df6
parent cf8ed78a9a
20 changed files with 485 additions and 50 deletions
--- a/1
+++ b/1
@ -27,6 +27,7 @@ OBJ := $(SRC:.c=.o)
 ARFLAGS := rcs
 CFLAGS ?= -O3 -Wall -Wextra -Wshadow -pedantic
 override CFLAGS += -std=c11 -fPIC -fvisibility=hidden
+override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE
 override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include

 # ABI versioning
--- a/Package.swift
+++ b/Package.swift
@ -15,7 +15,11 @@ let package = Package(
        .target(name: "TreeSitter",
                path: "lib",
                sources: ["src/lib.c"],
-                cSettings: [.headerSearchPath("src")]),
+                cSettings: [
+                        .headerSearchPath("src"),
+                        .define("_POSIX_C_SOURCE", to: "200112L"),
+                        .define("_DEFAULT_SOURCE"),
+                ]),
    ],
    cLanguageStandard: .c11
 )
--- a/build.zig
+++ b/build.zig
@ -11,6 +11,8 @@ pub fn build(b: *std.Build) void {
    lib.addCSourceFile(.{ .file = b.path("lib/src/lib.c"), .flags = &.{"-std=c11"} });
    lib.addIncludePath(b.path("lib/include"));
    lib.addIncludePath(b.path("lib/src"));
+    lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L");
+    lib.root_module.addCMacro("_DEFAULT_SOURCE", "");

    lib.installHeadersDirectory(b.path("lib/include"), ".", .{});

--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@ -6,7 +6,7 @@ use std::{

 use anstyle::{AnsiColor, Color, Style};
 use anyhow::{anyhow, Context, Result};
-use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand};
+use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum};
 use clap_complete::{generate, Shell};
 use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input};
 use glob::glob;
@ -191,7 +191,7 @@ struct Parse {
    )]
    pub edits: Option<Vec<String>>,
    #[arg(long, help = "The encoding of the input files")]
-    pub encoding: Option<String>,
+    pub encoding: Option<Encoding>,
    #[arg(
        long,
        help = "Open `log.html` in the default browser, if `--debug-graph` is supplied"
@ -208,6 +208,13 @@ struct Parse {
    pub no_ranges: bool,
 }

+/// Text encodings that can be selected with the `--encoding` option of the
+/// `parse` command.
+///
+/// `ValueEnum` lets clap parse the option value straight into a variant, so an
+/// unknown encoding is rejected during argument parsing.
+#[derive(ValueEnum, Clone, Copy, Debug)]
+pub enum Encoding {
+    /// UTF-8
+    Utf8,
+    /// UTF-16 with little-endian code units
+    Utf16LE,
+    /// UTF-16 with big-endian code units
+    Utf16BE,
+}
+
 #[derive(Args)]
 #[command(about = "Run a parser's tests", alias = "t")]
 struct Test {
@ -773,15 +780,11 @@ impl Parse {
            ParseOutput::Normal
        };

-        let encoding = if let Some(encoding) = self.encoding {
-            match encoding.as_str() {
-                "utf16" => Some(ffi::TSInputEncodingUTF16),
-                "utf8" => Some(ffi::TSInputEncodingUTF8),
-                _ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")),
-            }
-        } else {
-            None
-        };
+        let encoding = self.encoding.map(|e| match e {
+            Encoding::Utf8 => ffi::TSInputEncodingUTF8,
+            Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE,
+            Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE,
+        });

        let time = self.time;
        let edits = self.edits.unwrap_or_default();
--- a/cli/src/parse.rs
+++ b/cli/src/parse.rs
@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul
    let time = Instant::now();

    #[inline(always)]
-    fn is_utf16_bom(bom_bytes: &[u8]) -> bool {
-        bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF]
+    fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool {
+        bom_bytes == [0xFF, 0xFE]
    }

-    let tree = match opts.encoding {
-        Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => {
-            let source_code_utf16 = source_code
-                .chunks_exact(2)
-                .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
-                .collect::<Vec<_>>();
-            parser.parse_utf16(&source_code_utf16, None)
+    #[inline(always)]
+    fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool {
+        bom_bytes == [0xFE, 0xFF]
+    }
+
+    let encoding = match opts.encoding {
+        None if source_code.len() >= 2 => {
+            if is_utf16_le_bom(&source_code[0..2]) {
+                Some(ffi::TSInputEncodingUTF16LE)
+            } else if is_utf16_be_bom(&source_code[0..2]) {
+                Some(ffi::TSInputEncodingUTF16BE)
+            } else {
+                None
+            }
        }
-        None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => {
+        _ => opts.encoding,
+    };
+
+    let tree = match encoding {
+        Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => {
            let source_code_utf16 = source_code
                .chunks_exact(2)
                .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
                .collect::<Vec<_>>();
-            parser.parse_utf16(&source_code_utf16, None)
+            parser.parse_utf16_le(&source_code_utf16, None)
+        }
+        // Big-endian input: keep each pair of bytes in file order inside the `u16`
+        // (`from_ne_bytes`). `parse_utf16_be` takes raw big-endian code units and the decoder
+        // converts them to host order itself, so `from_be_bytes` here would swap the bytes a
+        // second time on little-endian hosts and produce garbage code points.
+        Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => {
+            let source_code_utf16 = source_code
+                .chunks_exact(2)
+                .map(|chunk| u16::from_ne_bytes([chunk[0], chunk[1]]))
+                .collect::<Vec<_>>();
+            parser.parse_utf16_be(&source_code_utf16, None)
        }
        _ => parser.parse(&source_code, None),
    };
--- a/cli/src/tests/parser_test.rs
+++ b/cli/src/tests/parser_test.rs
@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
 }

 #[test]
-fn test_parsing_with_custom_utf16_input() {
+fn test_parsing_with_custom_utf16le_input() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    let lines = ["pub fn foo() {", "  1", "}"]
        .iter()
-        .map(|s| s.encode_utf16().collect::<Vec<_>>())
+        .map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
        .collect::<Vec<_>>();

+    let newline = [('\n' as u16).to_le()];
+
    let tree = parser
-        .parse_utf16_with(
+        .parse_utf16_le_with(
            &mut |_, position| {
                let row = position.row;
                let column = position.column;
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
                    if column < lines[row].len() {
                        &lines[row][column..]
                    } else {
-                        &[10]
+                        &newline
                    }
                } else {
                    &[]
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
    assert_eq!(root.child(0).unwrap().kind(), "function_item");
 }

+/// Feeds big-endian UTF-16 text to the parser line by line through a callback
+/// and checks the resulting syntax tree.
+#[test]
+fn test_parsing_with_custom_utf16_be_input() {
+    let mut parser = Parser::new();
+    parser.set_language(&get_language("rust")).unwrap();
+
+    // One vector of code units per source line, each unit stored in big-endian byte order.
+    let lines: Vec<Vec<u16>> = ["pub fn foo() {", "  1", "}"]
+        .iter()
+        .map(|line| line.encode_utf16().map(u16::to_be).collect())
+        .collect();
+
+    let newline = [('\n' as u16).to_be()];
+
+    let tree = parser
+        .parse_utf16_be_with(
+            &mut |_, position| match lines.get(position.row) {
+                // Inside a line: hand back the remainder of that line.
+                Some(line) if position.column < line.len() => &line[position.column..],
+                // At the end of a line: emit the line terminator.
+                Some(_) => &newline,
+                // Past the last line: an empty slice marks the end of the input.
+                None => &[],
+            },
+            None,
+        )
+        .unwrap();
+
+    let root = tree.root_node();
+    let expected_sexp =
+        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))";
+    assert_eq!(root.to_sexp(), expected_sexp);
+    assert_eq!(root.kind(), "source_file");
+    assert!(!root.has_error());
+    assert_eq!(root.child(0).unwrap().kind(), "function_item");
+}
+
 #[test]
 fn test_parsing_with_callback_returning_owned_strings() {
    let mut parser = Parser::new();
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {

    // Parse UTF16 text with a BOM
    let tree = parser
-        .parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
+        .parse_utf16_le(
+            "\u{FEFF}fn a() {}"
+                .encode_utf16()
+                .map(|u| u.to_le())
+                .collect::<Vec<_>>(),
+            None,
+        )
        .unwrap();
    assert_eq!(
        tree.root_node().to_sexp(),
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
 fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
    let source_code = "<script>a.</script>";
    let utf16_source_code = source_code
-        .as_bytes()
-        .iter()
-        .map(|c| u16::from(*c))
+        .encode_utf16()
+        .map(|u| u.to_le())
        .collect::<Vec<_>>();

    let start_byte = 2 * source_code.find("a.").unwrap();
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
            end_point: Point::new(0, end_byte),
        }])
        .unwrap();
-    let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
+    let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
    assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
 }

--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@ -76,6 +76,8 @@ set_target_properties(tree-sitter
                      SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
                      DEFINE_SYMBOL "")

+target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE)
+
 configure_file(tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY)

 include(GNUInstallDirs)
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@ -20,6 +20,7 @@ include = [
  "/Cargo.toml",
  "/src/*.h",
  "/src/*.c",
+  "/src/portable/*",
  "/src/unicode/*",
  "/src/wasm/*",
  "/include/tree_sitter/api.h",
--- a/lib/binding_rust/bindings.rs
+++ b/lib/binding_rust/bindings.rs
@ -36,7 +36,8 @@ pub struct TSLookaheadIterator {
    _unused: [u8; 0],
 }
 pub const TSInputEncodingUTF8: TSInputEncoding = 0;
-pub const TSInputEncodingUTF16: TSInputEncoding = 1;
+pub const TSInputEncodingUTF16LE: TSInputEncoding = 1;
+pub const TSInputEncodingUTF16BE: TSInputEncoding = 2;
 pub type TSInputEncoding = ::core::ffi::c_uint;
 pub const TSSymbolTypeRegular: TSSymbolType = 0;
 pub const TSSymbolTypeAnonymous: TSSymbolType = 1;
--- a/lib/binding_rust/build.rs
+++ b/lib/binding_rust/build.rs
@ -41,6 +41,8 @@ fn main() {
        .include(&src_path)
        .include(&wasm_path)
        .include(&include_path)
+        .define("_POSIX_C_SOURCE", "200112L")
+        .define("_DEFAULT_SOURCE", None)
        .warnings(false)
        .file(src_path.join("lib.c"))
        .compile("tree-sitter");
--- a/lib/binding_rust/lib.rs
+++ b/lib/binding_rust/lib.rs
@ -610,6 +610,7 @@ impl Parser {
    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
    ///   the new text using [`Tree::edit`].
+    #[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le instead")]
    pub fn parse_utf16(
        &mut self,
        input: impl AsRef<[u16]>,
@ -617,7 +618,7 @@ impl Parser {
    ) -> Option<Tree> {
        let code_points = input.as_ref();
        let len = code_points.len();
-        self.parse_utf16_with(
+        self.parse_utf16_le_with(
            &mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(),
            old_tree,
        )
@ -672,6 +673,45 @@ impl Parser {
        }
    }

+    /// Parse UTF8 text provided in chunks by a callback.
+    ///
+    /// # Arguments:
+    /// * `callback` A function that is called with a byte offset and a position and returns a
+    ///   chunk of text as bytes. The returned value is kept alive until the next call, so it
+    ///   may be an owned buffer.
+    /// * `old_tree` An optional previous syntax tree that is forwarded to the parser.
+    ///
+    /// Returns `None` when the underlying `ts_parser_parse` call yields a null tree.
+    ///
+    /// NOTE(review): the trailing underscore and the missing documentation suggest this may be
+    /// an unintended duplicate of an existing `parse_with` method. Confirm before relying on it.
+    pub fn parse_with_<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
+        &mut self,
+        callback: &mut F,
+        old_tree: Option<&Tree>,
+    ) -> Option<Tree> {
+        // A pointer to this payload is passed on every call to the `read` C function.
+        // The payload contains two things:
+        // 1. A reference to the rust `callback`.
+        // 2. The text that was returned from the previous call to `callback`. This allows the
+        //    callback to return owned values like vectors.
+        let mut payload: (&mut F, Option<T>) = (callback, None);
+
+        // This C function is passed to Tree-sitter as the input callback.
+        unsafe extern "C" fn read<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
+            payload: *mut c_void,
+            byte_offset: u32,
+            position: ffi::TSPoint,
+            bytes_read: *mut u32,
+        ) -> *const c_char {
+            // Recover the typed payload that was erased to `*mut c_void` below.
+            let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
+            // Storing the chunk in the payload keeps it alive after this function returns.
+            *text = Some(callback(byte_offset as usize, position.into()));
+            let slice = text.as_ref().unwrap().as_ref();
+            // The chunk length is reported in bytes and narrowed to `u32` by the cast.
+            *bytes_read = slice.len() as u32;
+            slice.as_ptr().cast::<c_char>()
+        }
+
+        let c_input = ffi::TSInput {
+            payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
+            read: Some(read::<T, F>),
+            encoding: ffi::TSInputEncodingUTF8,
+        };
+
+        // A null pointer tells the C API that there is no previous tree.
+        let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
+        unsafe {
+            let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input);
+            NonNull::new(c_new_tree).map(Tree)
+        }
+    }
+
    /// Parse UTF16 text provided in chunks by a callback.
    ///
    /// # Arguments:
@ -682,10 +722,49 @@ impl Parser {
    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
    ///   the new text using [`Tree::edit`].
+    #[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le_with instead")]
    pub fn parse_utf16_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
        &mut self,
        callback: &mut F,
        old_tree: Option<&Tree>,
+    ) -> Option<Tree> {
+        self.parse_utf16_le_with(callback, old_tree)
+    }
+
+    /// Parse a slice of little-endian UTF16 text.
+    ///
+    /// # Arguments:
+    /// * `input` The UTF16-encoded text to parse, given as little-endian code units.
+    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
+    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
+    ///   the new text using [`Tree::edit`].
+    pub fn parse_utf16_le(
+        &mut self,
+        input: impl AsRef<[u16]>,
+        old_tree: Option<&Tree>,
+    ) -> Option<Tree> {
+        let units = input.as_ref();
+        // Hand the parser everything from the requested code-unit offset onward. An offset at
+        // or beyond the end yields an empty slice, which marks the end of the input.
+        let mut read_from = |offset: usize, _: Point| units.get(offset..).unwrap_or_default();
+        self.parse_utf16_le_with(&mut read_from, old_tree)
+    }
+
+    /// Parse UTF16 little-endian text provided in chunks by a callback.
+    ///
+    /// # Arguments:
+    /// * `callback` A function that takes a code point offset and position and returns a slice of
+    ///   UTF16-encoded text starting at that byte offset and position. The slices can be of any
+    ///   length. If the given position is at the end of the text, the callback should return an
+    ///   empty slice.
+    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
+    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
+    ///   the new text using [Tree::edit].
+    pub fn parse_utf16_le_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
+        &mut self,
+        callback: &mut F,
+        old_tree: Option<&Tree>,
    ) -> Option<Tree> {
        // A pointer to this payload is passed on every call to the `read` C function.
        // The payload contains two things:
@ -701,7 +780,7 @@ impl Parser {
            position: ffi::TSPoint,
            bytes_read: *mut u32,
        ) -> *const c_char {
-            let (callback, text) = payload.cast::<(&mut F, Option<T>)>().as_mut().unwrap();
+            let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
            *text = Some(callback(
                (byte_offset / 2) as usize,
                Point {
@ -715,9 +794,83 @@ impl Parser {
        }

        let c_input = ffi::TSInput {
-            payload: core::ptr::addr_of_mut!(payload).cast::<c_void>(),
+            payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
            read: Some(read::<T, F>),
-            encoding: ffi::TSInputEncodingUTF16,
+            encoding: ffi::TSInputEncodingUTF16LE,
+        };
+
+        let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
+        unsafe {
+            let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input);
+            NonNull::new(c_new_tree).map(Tree)
+        }
+    }
+
+    /// Parse a slice of big-endian UTF16 text.
+    ///
+    /// # Arguments:
+    /// * `input` The UTF16-encoded text to parse, given as big-endian code units.
+    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
+    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
+    ///   the new text using [`Tree::edit`].
+    pub fn parse_utf16_be(
+        &mut self,
+        input: impl AsRef<[u16]>,
+        old_tree: Option<&Tree>,
+    ) -> Option<Tree> {
+        let units = input.as_ref();
+        // Serve the tail of the text starting at the requested code-unit offset. Once the
+        // offset reaches the end nothing is left and an empty slice is returned.
+        let mut tail_at = |offset: usize, _: Point| units.get(offset..).unwrap_or(&[]);
+        self.parse_utf16_be_with(&mut tail_at, old_tree)
+    }
+
+    /// Parse UTF16 big-endian text provided in chunks by a callback.
+    ///
+    /// # Arguments:
+    /// * `callback` A function that takes a code point offset and position and returns a slice of
+    ///   UTF16-encoded text starting at that byte offset and position. The slices can be of any
+    ///   length. If the given position is at the end of the text, the callback should return an
+    ///   empty slice.
+    /// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
+    ///   document has changed since `old_tree` was created, then you must edit `old_tree` to match
+    ///   the new text using [Tree::edit].
+    pub fn parse_utf16_be_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
+        &mut self,
+        callback: &mut F,
+        old_tree: Option<&Tree>,
+    ) -> Option<Tree> {
+        // A pointer to this payload is passed on every call to the `read` C function.
+        // The payload contains two things:
+        // 1. A reference to the rust `callback`.
+        // 2. The text that was returned from the previous call to `callback`. This allows the
+        //    callback to return owned values like vectors.
+        let mut payload: (&mut F, Option<T>) = (callback, None);
+
+        // This C function is passed to Tree-sitter as the input callback.
+        unsafe extern "C" fn read<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
+            payload: *mut c_void,
+            byte_offset: u32,
+            position: ffi::TSPoint,
+            bytes_read: *mut u32,
+        ) -> *const c_char {
+            let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
+            *text = Some(callback(
+                (byte_offset / 2) as usize,
+                Point {
+                    row: position.row as usize,
+                    column: position.column as usize / 2,
+                },
+            ));
+            let slice = text.as_ref().unwrap().as_ref();
+            *bytes_read = slice.len() as u32 * 2;
+            slice.as_ptr() as *const c_char
+        }
+        let c_input = ffi::TSInput {
+            payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
+            read: Some(read::<T, F>),
+            encoding: ffi::TSInputEncodingUTF16BE,
        };

        let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
--- a/lib/binding_web/binding.c
+++ b/lib/binding_web/binding.c
@ -172,7 +172,7 @@ TSTree *ts_parser_parse_wasm(
  TSInput input = {
    input_buffer,
    call_parse_callback,
-    TSInputEncodingUTF16
+    TSInputEncodingUTF16LE
  };
  if (range_count) {
    for (unsigned i = 0; i < range_count; i++) {
--- a/lib/include/tree_sitter/api.h
+++ b/lib/include/tree_sitter/api.h
@ -50,7 +50,8 @@ typedef struct TSLookaheadIterator TSLookaheadIterator;

 typedef enum TSInputEncoding {
  TSInputEncodingUTF8,
-  TSInputEncodingUTF16,
+  TSInputEncodingUTF16LE,
+  TSInputEncodingUTF16BE,
 } TSInputEncoding;

 typedef enum TSSymbolType {
--- a/lib/src/lexer.c
+++ b/lib/src/lexer.c
@ -83,9 +83,9 @@ static void ts_lexer__get_lookahead(Lexer *self) {
  }

  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
-  UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
-    ? ts_decode_utf8
-    : ts_decode_utf16;
+  UnicodeDecodeFunction decode =
+    self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 :
+    self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le : ts_decode_utf16_be;

  self->lookahead_size = decode(chunk, size, &self->data.lookahead);

--- a/lib/src/lib.c
+++ b/lib/src/lib.c
@ -1,5 +1,3 @@
-#define _POSIX_C_SOURCE 200112L
-
 #include "./alloc.c"
 #include "./get_changed_ranges.c"
 #include "./language.c"
--- a/lib/src/parser.c
+++ b/lib/src/parser.c
@ -1,5 +1,3 @@
-#define _POSIX_C_SOURCE 200112L
-
 #include <time.h>
 #include <stdio.h>
 #include <limits.h>
--- a/lib/src/portable/endian.h
+++ b/lib/src/portable/endian.h
@ -0,0 +1,170 @@
+// "License": Public Domain
+// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like.
+// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to
+// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it
+// an example on how to get the endian conversion functions on different platforms.
+
+#ifndef PORTABLE_ENDIAN_H__
+#define PORTABLE_ENDIAN_H__
+
+#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__)
+
+#    define __WINDOWS__
+
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__GNU__) || defined(__EMSCRIPTEN__)
+
+#    include <endian.h>
+
+#elif defined(__APPLE__)
+
+#    include <libkern/OSByteOrder.h>
+
+#    define htobe16(x) OSSwapHostToBigInt16(x)
+#    define htole16(x) OSSwapHostToLittleInt16(x)
+#    define be16toh(x) OSSwapBigToHostInt16(x)
+#    define le16toh(x) OSSwapLittleToHostInt16(x)
+ 
+#    define htobe32(x) OSSwapHostToBigInt32(x)
+#    define htole32(x) OSSwapHostToLittleInt32(x)
+#    define be32toh(x) OSSwapBigToHostInt32(x)
+#    define le32toh(x) OSSwapLittleToHostInt32(x)
+ 
+#    define htobe64(x) OSSwapHostToBigInt64(x)
+#    define htole64(x) OSSwapHostToLittleInt64(x)
+#    define be64toh(x) OSSwapBigToHostInt64(x)
+#    define le64toh(x) OSSwapLittleToHostInt64(x)
+
+#    define __BYTE_ORDER    BYTE_ORDER
+#    define __BIG_ENDIAN    BIG_ENDIAN
+#    define __LITTLE_ENDIAN LITTLE_ENDIAN
+#    define __PDP_ENDIAN    PDP_ENDIAN
+
+#elif defined(__OpenBSD__)
+
+#    include <endian.h>
+
+#    define __BYTE_ORDER    BYTE_ORDER
+#    define __BIG_ENDIAN    BIG_ENDIAN
+#    define __LITTLE_ENDIAN LITTLE_ENDIAN
+#    define __PDP_ENDIAN    PDP_ENDIAN
+
+#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)
+
+#    include <sys/endian.h>
+
+// Use the `beNNtoh`/`leNNtoh` names that <sys/endian.h> supplies when they
+// exist. The `betohNN`/`letohNN` spellings are only a fallback for headers
+// that lack them: redefining an existing macro would redirect every use to a
+// name the platform might not declare.
+#    ifndef be16toh
+#        define be16toh(x) betoh16(x)
+#    endif
+#    ifndef le16toh
+#        define le16toh(x) letoh16(x)
+#    endif
+
+#    ifndef be32toh
+#        define be32toh(x) betoh32(x)
+#    endif
+#    ifndef le32toh
+#        define le32toh(x) letoh32(x)
+#    endif
+
+#    ifndef be64toh
+#        define be64toh(x) betoh64(x)
+#    endif
+#    ifndef le64toh
+#        define le64toh(x) letoh64(x)
+#    endif
+
+#elif defined(__WINDOWS__)
+
+#    include <winsock2.h>
+#    ifdef __GNUC__
+#        include <sys/param.h>
+#    endif
+
+#    if BYTE_ORDER == LITTLE_ENDIAN
+
+#        define htobe16(x) htons(x)
+#        define htole16(x) (x)
+#        define be16toh(x) ntohs(x)
+#        define le16toh(x) (x)
+ 
+#        define htobe32(x) htonl(x)
+#        define htole32(x) (x)
+#        define be32toh(x) ntohl(x)
+#        define le32toh(x) (x)
+ 
+#        define htobe64(x) htonll(x)
+#        define htole64(x) (x)
+#        define be64toh(x) ntohll(x)
+#        define le64toh(x) (x)
+
+#    elif BYTE_ORDER == BIG_ENDIAN
+
+        /* that would be xbox 360 */
+#        define htobe16(x) (x)
+#        define htole16(x) __builtin_bswap16(x)
+#        define be16toh(x) (x)
+#        define le16toh(x) __builtin_bswap16(x)
+ 
+#        define htobe32(x) (x)
+#        define htole32(x) __builtin_bswap32(x)
+#        define be32toh(x) (x)
+#        define le32toh(x) __builtin_bswap32(x)
+ 
+#        define htobe64(x) (x)
+#        define htole64(x) __builtin_bswap64(x)
+#        define be64toh(x) (x)
+#        define le64toh(x) __builtin_bswap64(x)
+
+#    else
+
+#        error byte order not supported
+
+#    endif
+
+#    define __BYTE_ORDER    BYTE_ORDER
+#    define __BIG_ENDIAN    BIG_ENDIAN
+#    define __LITTLE_ENDIAN LITTLE_ENDIAN
+#    define __PDP_ENDIAN    PDP_ENDIAN
+
+#elif defined(__QNXNTO__)
+
+#    include <gulliver.h>
+
+#    define __LITTLE_ENDIAN 1234
+#    define __BIG_ENDIAN    4321
+#    define __PDP_ENDIAN    3412
+
+#    if defined(__BIGENDIAN__)
+
+#        define __BYTE_ORDER __BIG_ENDIAN
+
+#        define htobe16(x) (x)
+#        define htobe32(x) (x)
+#        define htobe64(x) (x)
+
+#        define htole16(x) ENDIAN_SWAP16(x)
+#        define htole32(x) ENDIAN_SWAP32(x)
+#        define htole64(x) ENDIAN_SWAP64(x)
+
+#    elif defined(__LITTLEENDIAN__)
+
+#        define __BYTE_ORDER __LITTLE_ENDIAN
+
+#        define htole16(x) (x)
+#        define htole32(x) (x)
+#        define htole64(x) (x)
+
+#        define htobe16(x) ENDIAN_SWAP16(x)
+#        define htobe32(x) ENDIAN_SWAP32(x)
+#        define htobe64(x) ENDIAN_SWAP64(x)
+
+#    else
+
+#        error byte order not supported
+
+#    endif
+
+#    define be16toh(x) ENDIAN_BE16(x)
+#    define be32toh(x) ENDIAN_BE32(x)
+#    define be64toh(x) ENDIAN_BE64(x)
+#    define le16toh(x) ENDIAN_LE16(x)
+#    define le32toh(x) ENDIAN_LE32(x)
+#    define le64toh(x) ENDIAN_LE64(x)
+
+#else
+
+#    error platform not supported
+
+#endif
+
+#endif
--- a/lib/src/tree.c
+++ b/lib/src/tree.c
@ -1,5 +1,3 @@
-#define _POSIX_C_SOURCE 200112L
-
 #include "tree_sitter/api.h"
 #include "./array.h"
 #include "./get_changed_ranges.h"
--- a/lib/src/unicode.h
+++ b/lib/src/unicode.h
@ -12,6 +12,29 @@ extern "C" {
 #define U_EXPORT2
 #include "unicode/utf8.h"
 #include "unicode/utf16.h"
+#include "portable/endian.h"
+
+// Decode the next code point from little-endian UTF-16 code units.
+//
+// Modelled on ICU's `U16_NEXT`, but every code unit read from `s` is converted
+// from little-endian to host byte order before it is inspected. That includes
+// the trail surrogate: testing the raw unit fails to recognise it on hosts
+// whose byte order differs from the input.
+//
+//   s      - the UTF-16 code units
+//   i      - index of the next unit; advanced past the decoded code point
+//   length - index at which the input ends; no trail unit is read once `i`
+//            reaches it
+//   c      - receives the code point; a surrogate without a partner is stored
+//            unchanged
+//
+// The increment of `i` is kept out of the `le16toh` argument so that the macro
+// stays correct even if `le16toh` evaluates its argument more than once.
+#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+    (c)=le16toh((s)[(i)]); \
+    ++(i); \
+    if(U16_IS_LEAD(c)) { \
+        uint16_t __c2; \
+        if((i)!=(length) && U16_IS_TRAIL(__c2=le16toh((s)[(i)]))) { \
+            ++(i); \
+            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+        } \
+    } \
+} UPRV_BLOCK_MACRO_END
+
+// Decode the next code point from big-endian UTF-16 code units.
+//
+// Modelled on ICU's `U16_NEXT`, but every code unit read from `s` is converted
+// from big-endian to host byte order before it is inspected. That includes the
+// trail surrogate: testing the raw unit fails to recognise it on little-endian
+// hosts, so surrogate pairs would never be combined.
+//
+//   s      - the UTF-16 code units
+//   i      - index of the next unit; advanced past the decoded code point
+//   length - index at which the input ends; no trail unit is read once `i`
+//            reaches it
+//   c      - receives the code point; a surrogate without a partner is stored
+//            unchanged
+//
+// The increment of `i` is kept out of the `be16toh` argument so that the macro
+// stays correct even if `be16toh` evaluates its argument more than once.
+#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+    (c)=be16toh((s)[(i)]); \
+    ++(i); \
+    if(U16_IS_LEAD(c)) { \
+        uint16_t __c2; \
+        if((i)!=(length) && U16_IS_TRAIL(__c2=be16toh((s)[(i)]))) { \
+            ++(i); \
+            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+        } \
+    } \
+} UPRV_BLOCK_MACRO_END

 static const int32_t TS_DECODE_ERROR = U_SENTINEL;

@ -33,13 +56,23 @@ static inline uint32_t ts_decode_utf8(
  return i;
 }

-static inline uint32_t ts_decode_utf16(
+static inline uint32_t ts_decode_utf16_le(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
 ) {
  uint32_t i = 0;
-  U16_NEXT(((uint16_t *)string), i, length, *code_point);
+  U16_NEXT_LE(((uint16_t *)string), i, length, *code_point);
+  return i * 2;
+}
+
+static inline uint32_t ts_decode_utf16_be(
+  const uint8_t *string,
+  uint32_t length,
+  int32_t *code_point
+) {
+  uint32_t i = 0;
+  U16_NEXT_BE(((uint16_t *)string), i, length, *code_point);
  return i * 2;
 }

--- a/script/build-wasm
+++ b/script/build-wasm
@ -131,6 +131,8 @@ $emcc                                            \
  -std=c11                                       \
  -D 'fprintf(...)='                             \
  -D NDEBUG=                                     \
+  -D _POSIX_C_SOURCE=200112L                     \
+  -D _DEFAULT_SOURCE=                            \
  -I ${SRC_DIR}                                  \
  -I lib/include                                 \
  --js-library ${WEB_DIR}/imports.js             \