feat!: properly handle UTF-16 endianness encoding
This commit is contained in:
parent
cf8ed78a9a
commit
8943983df6
20 changed files with 485 additions and 50 deletions
1
Makefile
1
Makefile
|
|
@ -27,6 +27,7 @@ OBJ := $(SRC:.c=.o)
|
|||
ARFLAGS := rcs
|
||||
CFLAGS ?= -O3 -Wall -Wextra -Wshadow -pedantic
|
||||
override CFLAGS += -std=c11 -fPIC -fvisibility=hidden
|
||||
override CFLAGS += -D_POSIX_C_SOURCE=200112L -D_DEFAULT_SOURCE
|
||||
override CFLAGS += -Ilib/src -Ilib/src/wasm -Ilib/include
|
||||
|
||||
# ABI versioning
|
||||
|
|
|
|||
|
|
@ -15,7 +15,11 @@ let package = Package(
|
|||
.target(name: "TreeSitter",
|
||||
path: "lib",
|
||||
sources: ["src/lib.c"],
|
||||
cSettings: [.headerSearchPath("src")]),
|
||||
cSettings: [
|
||||
.headerSearchPath("src"),
|
||||
.define("_POSIX_C_SOURCE", to: "200112L"),
|
||||
.define("_DEFAULT_SOURCE"),
|
||||
]),
|
||||
],
|
||||
cLanguageStandard: .c11
|
||||
)
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ pub fn build(b: *std.Build) void {
|
|||
lib.addCSourceFile(.{ .file = b.path("lib/src/lib.c"), .flags = &.{"-std=c11"} });
|
||||
lib.addIncludePath(b.path("lib/include"));
|
||||
lib.addIncludePath(b.path("lib/src"));
|
||||
lib.root_module.addCMacro("_POSIX_C_SOURCE", "200112L");
|
||||
lib.root_module.addCMacro("_DEFAULT_SOURCE", "");
|
||||
|
||||
lib.installHeadersDirectory(b.path("lib/include"), ".", .{});
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ use std::{
|
|||
|
||||
use anstyle::{AnsiColor, Color, Style};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand};
|
||||
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum};
|
||||
use clap_complete::{generate, Shell};
|
||||
use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input};
|
||||
use glob::glob;
|
||||
|
|
@ -191,7 +191,7 @@ struct Parse {
|
|||
)]
|
||||
pub edits: Option<Vec<String>>,
|
||||
#[arg(long, help = "The encoding of the input files")]
|
||||
pub encoding: Option<String>,
|
||||
pub encoding: Option<Encoding>,
|
||||
#[arg(
|
||||
long,
|
||||
help = "Open `log.html` in the default browser, if `--debug-graph` is supplied"
|
||||
|
|
@ -208,6 +208,13 @@ struct Parse {
|
|||
pub no_ranges: bool,
|
||||
}
|
||||
|
||||
#[derive(ValueEnum, Clone)]
|
||||
pub enum Encoding {
|
||||
Utf8,
|
||||
Utf16LE,
|
||||
Utf16BE,
|
||||
}
|
||||
|
||||
#[derive(Args)]
|
||||
#[command(about = "Run a parser's tests", alias = "t")]
|
||||
struct Test {
|
||||
|
|
@ -773,15 +780,11 @@ impl Parse {
|
|||
ParseOutput::Normal
|
||||
};
|
||||
|
||||
let encoding = if let Some(encoding) = self.encoding {
|
||||
match encoding.as_str() {
|
||||
"utf16" => Some(ffi::TSInputEncodingUTF16),
|
||||
"utf8" => Some(ffi::TSInputEncodingUTF8),
|
||||
_ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let encoding = self.encoding.map(|e| match e {
|
||||
Encoding::Utf8 => ffi::TSInputEncodingUTF8,
|
||||
Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE,
|
||||
Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE,
|
||||
});
|
||||
|
||||
let time = self.time;
|
||||
let edits = self.edits.unwrap_or_default();
|
||||
|
|
|
|||
|
|
@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul
|
|||
let time = Instant::now();
|
||||
|
||||
#[inline(always)]
|
||||
fn is_utf16_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF]
|
||||
fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFF, 0xFE]
|
||||
}
|
||||
|
||||
let tree = match opts.encoding {
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16(&source_code_utf16, None)
|
||||
#[inline(always)]
|
||||
fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFE, 0xFF]
|
||||
}
|
||||
|
||||
let encoding = match opts.encoding {
|
||||
None if source_code.len() >= 2 => {
|
||||
if is_utf16_le_bom(&source_code[0..2]) {
|
||||
Some(ffi::TSInputEncodingUTF16LE)
|
||||
} else if is_utf16_be_bom(&source_code[0..2]) {
|
||||
Some(ffi::TSInputEncodingUTF16BE)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => {
|
||||
_ => opts.encoding,
|
||||
};
|
||||
|
||||
let tree = match encoding {
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16(&source_code_utf16, None)
|
||||
parser.parse_utf16_le(&source_code_utf16, None)
|
||||
}
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16_be(&source_code_utf16, None)
|
||||
}
|
||||
_ => parser.parse(&source_code, None),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_with_custom_utf16_input() {
|
||||
fn test_parsing_with_custom_utf16le_input() {
|
||||
let mut parser = Parser::new();
|
||||
parser.set_language(&get_language("rust")).unwrap();
|
||||
|
||||
let lines = ["pub fn foo() {", " 1", "}"]
|
||||
.iter()
|
||||
.map(|s| s.encode_utf16().collect::<Vec<_>>())
|
||||
.map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let newline = [('\n' as u16).to_le()];
|
||||
|
||||
let tree = parser
|
||||
.parse_utf16_with(
|
||||
.parse_utf16_le_with(
|
||||
&mut |_, position| {
|
||||
let row = position.row;
|
||||
let column = position.column;
|
||||
|
|
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
|
|||
if column < lines[row].len() {
|
||||
&lines[row][column..]
|
||||
} else {
|
||||
&[10]
|
||||
&newline
|
||||
}
|
||||
} else {
|
||||
&[]
|
||||
|
|
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
|
|||
assert_eq!(root.child(0).unwrap().kind(), "function_item");
|
||||
}
|
||||
|
||||
#[test]
fn test_parsing_with_custom_utf16_be_input() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // Every line is stored as UTF-16 code units whose in-memory layout is big-endian.
    let lines: Vec<Vec<u16>> = ["pub fn foo() {", " 1", "}"]
        .iter()
        .map(|text| text.encode_utf16().map(u16::to_be).collect())
        .collect();

    // Line terminator handed out once the column runs past the end of a line.
    let newline = [('\n' as u16).to_be()];

    let tree = parser
        .parse_utf16_be_with(
            &mut |_, position| match lines.get(position.row) {
                Some(line) if position.column < line.len() => &line[position.column..],
                Some(_) => &newline[..],
                // Past the last line: signal end of input with an empty slice.
                None => &[],
            },
            None,
        )
        .unwrap();

    let root = tree.root_node();
    assert_eq!(
        root.to_sexp(),
        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))"
    );
    assert_eq!(root.kind(), "source_file");
    assert!(!root.has_error());
    assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_with_callback_returning_owned_strings() {
|
||||
let mut parser = Parser::new();
|
||||
|
|
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {
|
|||
|
||||
// Parse UTF16 text with a BOM
|
||||
let tree = parser
|
||||
.parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
|
||||
.parse_utf16_le(
|
||||
"\u{FEFF}fn a() {}"
|
||||
.encode_utf16()
|
||||
.map(|u| u.to_le())
|
||||
.collect::<Vec<_>>(),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
tree.root_node().to_sexp(),
|
||||
|
|
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
|
|||
fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
|
||||
let source_code = "<script>a.</script>";
|
||||
let utf16_source_code = source_code
|
||||
.as_bytes()
|
||||
.iter()
|
||||
.map(|c| u16::from(*c))
|
||||
.encode_utf16()
|
||||
.map(|u| u.to_le())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let start_byte = 2 * source_code.find("a.").unwrap();
|
||||
|
|
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
|
|||
end_point: Point::new(0, end_byte),
|
||||
}])
|
||||
.unwrap();
|
||||
let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
|
||||
let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
|
||||
assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -76,6 +76,8 @@ set_target_properties(tree-sitter
|
|||
SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
|
||||
DEFINE_SYMBOL "")
|
||||
|
||||
target_compile_definitions(tree-sitter PRIVATE _POSIX_C_SOURCE=200112L _DEFAULT_SOURCE)
|
||||
|
||||
configure_file(tree-sitter.pc.in "${CMAKE_CURRENT_BINARY_DIR}/tree-sitter.pc" @ONLY)
|
||||
|
||||
include(GNUInstallDirs)
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ include = [
|
|||
"/Cargo.toml",
|
||||
"/src/*.h",
|
||||
"/src/*.c",
|
||||
"/src/portable/*",
|
||||
"/src/unicode/*",
|
||||
"/src/wasm/*",
|
||||
"/include/tree_sitter/api.h",
|
||||
|
|
|
|||
|
|
@ -36,7 +36,8 @@ pub struct TSLookaheadIterator {
|
|||
_unused: [u8; 0],
|
||||
}
|
||||
pub const TSInputEncodingUTF8: TSInputEncoding = 0;
|
||||
pub const TSInputEncodingUTF16: TSInputEncoding = 1;
|
||||
pub const TSInputEncodingUTF16LE: TSInputEncoding = 1;
|
||||
pub const TSInputEncodingUTF16BE: TSInputEncoding = 2;
|
||||
pub type TSInputEncoding = ::core::ffi::c_uint;
|
||||
pub const TSSymbolTypeRegular: TSSymbolType = 0;
|
||||
pub const TSSymbolTypeAnonymous: TSSymbolType = 1;
|
||||
|
|
|
|||
|
|
@ -41,6 +41,8 @@ fn main() {
|
|||
.include(&src_path)
|
||||
.include(&wasm_path)
|
||||
.include(&include_path)
|
||||
.define("_POSIX_C_SOURCE", "200112L")
|
||||
.define("_DEFAULT_SOURCE", None)
|
||||
.warnings(false)
|
||||
.file(src_path.join("lib.c"))
|
||||
.compile("tree-sitter");
|
||||
|
|
|
|||
|
|
@ -610,6 +610,7 @@ impl Parser {
|
|||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [`Tree::edit`].
|
||||
#[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le instead")]
|
||||
pub fn parse_utf16(
|
||||
&mut self,
|
||||
input: impl AsRef<[u16]>,
|
||||
|
|
@ -617,7 +618,7 @@ impl Parser {
|
|||
) -> Option<Tree> {
|
||||
let code_points = input.as_ref();
|
||||
let len = code_points.len();
|
||||
self.parse_utf16_with(
|
||||
self.parse_utf16_le_with(
|
||||
&mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(),
|
||||
old_tree,
|
||||
)
|
||||
|
|
@ -672,6 +673,45 @@ impl Parser {
|
|||
}
|
||||
}
|
||||
|
||||
/// Parse UTF-8 text that is supplied in chunks by `callback`.
///
/// # Arguments:
/// * `callback` Invoked with a byte offset and a [`Point`]; whatever it returns is exposed to
///   the parser as a byte slice via `AsRef<[u8]>`.
/// * `old_tree` An optional previously parsed tree that is forwarded to `ts_parser_parse`.
///
/// Returns `None` when `ts_parser_parse` hands back a null tree pointer.
// NOTE(review): the trailing underscore in the name looks unintentional — confirm whether this
// is meant to be a distinct public entry point.
pub fn parse_with_<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
    &mut self,
    callback: &mut F,
    old_tree: Option<&Tree>,
) -> Option<Tree> {
    // A pointer to this payload is passed on every call to the `read` C function.
    // The payload contains two things:
    // 1. A reference to the rust `callback`.
    // 2. The text that was returned from the previous call to `callback`. This allows the
    // callback to return owned values like vectors.
    let mut payload: (&mut F, Option<T>) = (callback, None);

    // This C function is passed to Tree-sitter as the input callback.
    unsafe extern "C" fn read<T: AsRef<[u8]>, F: FnMut(usize, Point) -> T>(
        payload: *mut c_void,
        byte_offset: u32,
        position: ffi::TSPoint,
        bytes_read: *mut u32,
    ) -> *const c_char {
        // Recover the typed (callback, last chunk) pair from the opaque payload pointer.
        let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
        // Storing the new chunk in the payload keeps it alive after this function returns;
        // the previously stored chunk is dropped here.
        *text = Some(callback(byte_offset as usize, position.into()));
        let slice = text.as_ref().unwrap().as_ref();
        // Report the chunk length in bytes and return a pointer to its first byte.
        *bytes_read = slice.len() as u32;
        slice.as_ptr().cast::<c_char>()
    }

    let c_input = ffi::TSInput {
        payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
        read: Some(read::<T, F>),
        encoding: ffi::TSInputEncodingUTF8,
    };

    // A missing old tree is passed to C as a null pointer.
    let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
    unsafe {
        let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input);
        // A null result becomes `None`; otherwise the pointer is wrapped in `Tree`.
        NonNull::new(c_new_tree).map(Tree)
    }
}
|
||||
|
||||
/// Parse UTF16 text provided in chunks by a callback.
|
||||
///
|
||||
/// # Arguments:
|
||||
|
|
@ -682,10 +722,49 @@ impl Parser {
|
|||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [`Tree::edit`].
|
||||
#[deprecated(since = "0.25.0", note = "Prefer parse_utf16_le_with instead")]
pub fn parse_utf16_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
    &mut self,
    callback: &mut F,
    old_tree: Option<&Tree>,
) -> Option<Tree> {
    // Deprecated alias: forwards unchanged to the little-endian variant, so this entry
    // point always treats the input as UTF-16 LE.
    self.parse_utf16_le_with(callback, old_tree)
}
|
||||
|
||||
/// Parse a slice of UTF16 little-endian text.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `text` The UTF16-encoded text to parse.
|
||||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [Tree::edit].
|
||||
pub fn parse_utf16_le(
|
||||
&mut self,
|
||||
input: impl AsRef<[u16]>,
|
||||
old_tree: Option<&Tree>,
|
||||
) -> Option<Tree> {
|
||||
let code_points = input.as_ref();
|
||||
let len = code_points.len();
|
||||
self.parse_utf16_le_with(
|
||||
&mut |i, _| (i < len).then(|| &code_points[i..]).unwrap_or_default(),
|
||||
old_tree,
|
||||
)
|
||||
}
|
||||
|
||||
/// Parse UTF16 little-endian text provided in chunks by a callback.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `callback` A function that takes a code unit offset and position and returns a slice of
/// UTF16-encoded text starting at that offset and position. The slices can be of any
|
||||
/// length. If the given position is at the end of the text, the callback should return an
|
||||
/// empty slice.
|
||||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [Tree::edit].
|
||||
pub fn parse_utf16_le_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
|
||||
&mut self,
|
||||
callback: &mut F,
|
||||
old_tree: Option<&Tree>,
|
||||
) -> Option<Tree> {
|
||||
// A pointer to this payload is passed on every call to the `read` C function.
|
||||
// The payload contains two things:
|
||||
|
|
@ -701,7 +780,7 @@ impl Parser {
|
|||
position: ffi::TSPoint,
|
||||
bytes_read: *mut u32,
|
||||
) -> *const c_char {
|
||||
let (callback, text) = payload.cast::<(&mut F, Option<T>)>().as_mut().unwrap();
|
||||
let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
|
||||
*text = Some(callback(
|
||||
(byte_offset / 2) as usize,
|
||||
Point {
|
||||
|
|
@ -715,9 +794,83 @@ impl Parser {
|
|||
}
|
||||
|
||||
let c_input = ffi::TSInput {
|
||||
payload: core::ptr::addr_of_mut!(payload).cast::<c_void>(),
|
||||
payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
|
||||
read: Some(read::<T, F>),
|
||||
encoding: ffi::TSInputEncodingUTF16,
|
||||
encoding: ffi::TSInputEncodingUTF16LE,
|
||||
};
|
||||
|
||||
let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
|
||||
unsafe {
|
||||
let c_new_tree = ffi::ts_parser_parse(self.0.as_ptr(), c_old_tree, c_input);
|
||||
NonNull::new(c_new_tree).map(Tree)
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a slice of UTF16 big-endian text.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `text` The UTF16-encoded text to parse.
|
||||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [Tree::edit].
|
||||
pub fn parse_utf16_be(
|
||||
&mut self,
|
||||
input: impl AsRef<[u16]>,
|
||||
old_tree: Option<&Tree>,
|
||||
) -> Option<Tree> {
|
||||
let code_points = input.as_ref();
|
||||
let len = code_points.len();
|
||||
self.parse_utf16_be_with(
|
||||
&mut |i, _| if i < len { &code_points[i..] } else { &[] },
|
||||
old_tree,
|
||||
)
|
||||
}
|
||||
|
||||
/// Parse UTF16 big-endian text provided in chunks by a callback.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `callback` A function that takes a code unit offset and position and returns a slice of
/// UTF16-encoded text starting at that offset and position. The slices can be of any
|
||||
/// length. If the given position is at the end of the text, the callback should return an
|
||||
/// empty slice.
|
||||
/// * `old_tree` A previous syntax tree parsed from the same document. If the text of the
|
||||
/// document has changed since `old_tree` was created, then you must edit `old_tree` to match
|
||||
/// the new text using [Tree::edit].
|
||||
pub fn parse_utf16_be_with<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
|
||||
&mut self,
|
||||
callback: &mut F,
|
||||
old_tree: Option<&Tree>,
|
||||
) -> Option<Tree> {
|
||||
// A pointer to this payload is passed on every call to the `read` C function.
|
||||
// The payload contains two things:
|
||||
// 1. A reference to the rust `callback`.
|
||||
// 2. The text that was returned from the previous call to `callback`. This allows the
|
||||
// callback to return owned values like vectors.
|
||||
let mut payload: (&mut F, Option<T>) = (callback, None);
|
||||
|
||||
// This C function is passed to Tree-sitter as the input callback.
|
||||
unsafe extern "C" fn read<T: AsRef<[u16]>, F: FnMut(usize, Point) -> T>(
|
||||
payload: *mut c_void,
|
||||
byte_offset: u32,
|
||||
position: ffi::TSPoint,
|
||||
bytes_read: *mut u32,
|
||||
) -> *const c_char {
|
||||
let (callback, text) = (payload as *mut (&mut F, Option<T>)).as_mut().unwrap();
|
||||
*text = Some(callback(
|
||||
(byte_offset / 2) as usize,
|
||||
Point {
|
||||
row: position.row as usize,
|
||||
column: position.column as usize / 2,
|
||||
},
|
||||
));
|
||||
let slice = text.as_ref().unwrap().as_ref();
|
||||
*bytes_read = slice.len() as u32 * 2;
|
||||
slice.as_ptr() as *const c_char
|
||||
}
|
||||
let c_input = ffi::TSInput {
|
||||
payload: &mut payload as *mut (&mut F, Option<T>) as *mut c_void,
|
||||
read: Some(read::<T, F>),
|
||||
encoding: ffi::TSInputEncodingUTF16BE,
|
||||
};
|
||||
|
||||
let c_old_tree = old_tree.map_or(ptr::null_mut(), |t| t.0.as_ptr());
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ TSTree *ts_parser_parse_wasm(
|
|||
TSInput input = {
|
||||
input_buffer,
|
||||
call_parse_callback,
|
||||
TSInputEncodingUTF16
|
||||
TSInputEncodingUTF16LE
|
||||
};
|
||||
if (range_count) {
|
||||
for (unsigned i = 0; i < range_count; i++) {
|
||||
|
|
|
|||
|
|
@ -50,7 +50,8 @@ typedef struct TSLookaheadIterator TSLookaheadIterator;
|
|||
|
||||
typedef enum TSInputEncoding {
|
||||
TSInputEncodingUTF8,
|
||||
TSInputEncodingUTF16,
|
||||
TSInputEncodingUTF16LE,
|
||||
TSInputEncodingUTF16BE,
|
||||
} TSInputEncoding;
|
||||
|
||||
typedef enum TSSymbolType {
|
||||
|
|
|
|||
|
|
@ -83,9 +83,9 @@ static void ts_lexer__get_lookahead(Lexer *self) {
|
|||
}
|
||||
|
||||
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
|
||||
UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
|
||||
? ts_decode_utf8
|
||||
: ts_decode_utf16;
|
||||
UnicodeDecodeFunction decode =
|
||||
self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 :
|
||||
self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le : ts_decode_utf16_be;
|
||||
|
||||
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
#define _POSIX_C_SOURCE 200112L
|
||||
|
||||
#include "./alloc.c"
|
||||
#include "./get_changed_ranges.c"
|
||||
#include "./language.c"
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
#define _POSIX_C_SOURCE 200112L
|
||||
|
||||
#include <time.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
|
|
|||
170
lib/src/portable/endian.h
Normal file
170
lib/src/portable/endian.h
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
// "License": Public Domain
|
||||
// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like.
|
||||
// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to
|
||||
// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it
|
||||
// an example on how to get the endian conversion functions on different platforms.
|
||||
|
||||
#ifndef PORTABLE_ENDIAN_H__
|
||||
#define PORTABLE_ENDIAN_H__
|
||||
|
||||
#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__)
|
||||
|
||||
# define __WINDOWS__
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__CYGWIN__) || defined(__GNU__) || defined(__EMSCRIPTEN__)
|
||||
|
||||
# include <endian.h>
|
||||
|
||||
#elif defined(__APPLE__)
|
||||
|
||||
# include <libkern/OSByteOrder.h>
|
||||
|
||||
# define htobe16(x) OSSwapHostToBigInt16(x)
|
||||
# define htole16(x) OSSwapHostToLittleInt16(x)
|
||||
# define be16toh(x) OSSwapBigToHostInt16(x)
|
||||
# define le16toh(x) OSSwapLittleToHostInt16(x)
|
||||
|
||||
# define htobe32(x) OSSwapHostToBigInt32(x)
|
||||
# define htole32(x) OSSwapHostToLittleInt32(x)
|
||||
# define be32toh(x) OSSwapBigToHostInt32(x)
|
||||
# define le32toh(x) OSSwapLittleToHostInt32(x)
|
||||
|
||||
# define htobe64(x) OSSwapHostToBigInt64(x)
|
||||
# define htole64(x) OSSwapHostToLittleInt64(x)
|
||||
# define be64toh(x) OSSwapBigToHostInt64(x)
|
||||
# define le64toh(x) OSSwapLittleToHostInt64(x)
|
||||
|
||||
# define __BYTE_ORDER BYTE_ORDER
|
||||
# define __BIG_ENDIAN BIG_ENDIAN
|
||||
# define __LITTLE_ENDIAN LITTLE_ENDIAN
|
||||
# define __PDP_ENDIAN PDP_ENDIAN
|
||||
|
||||
#elif defined(__OpenBSD__)
|
||||
|
||||
# include <endian.h>
|
||||
|
||||
# define __BYTE_ORDER BYTE_ORDER
|
||||
# define __BIG_ENDIAN BIG_ENDIAN
|
||||
# define __LITTLE_ENDIAN LITTLE_ENDIAN
|
||||
# define __PDP_ENDIAN PDP_ENDIAN
|
||||
|
||||
#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)

# include <sys/endian.h>

// The BSD <sys/endian.h> headers generally already provide the POSIX-style
// names (be16toh, le16toh, ...). Redefining them unconditionally in terms of
// the legacy betohNN/letohNN spellings would shadow working macros with names
// that may not exist on the target, so the legacy mapping is only used as a
// fallback when the system header left a name undefined.
# ifndef be16toh
#  define be16toh(x) betoh16(x)
# endif
# ifndef le16toh
#  define le16toh(x) letoh16(x)
# endif

# ifndef be32toh
#  define be32toh(x) betoh32(x)
# endif
# ifndef le32toh
#  define le32toh(x) letoh32(x)
# endif

# ifndef be64toh
#  define be64toh(x) betoh64(x)
# endif
# ifndef le64toh
#  define le64toh(x) letoh64(x)
# endif
|
||||
|
||||
#elif defined(__WINDOWS__)
|
||||
|
||||
# include <winsock2.h>
|
||||
# ifdef __GNUC__
|
||||
# include <sys/param.h>
|
||||
# endif
|
||||
|
||||
# if BYTE_ORDER == LITTLE_ENDIAN
|
||||
|
||||
# define htobe16(x) htons(x)
|
||||
# define htole16(x) (x)
|
||||
# define be16toh(x) ntohs(x)
|
||||
# define le16toh(x) (x)
|
||||
|
||||
# define htobe32(x) htonl(x)
|
||||
# define htole32(x) (x)
|
||||
# define be32toh(x) ntohl(x)
|
||||
# define le32toh(x) (x)
|
||||
|
||||
# define htobe64(x) htonll(x)
|
||||
# define htole64(x) (x)
|
||||
# define be64toh(x) ntohll(x)
|
||||
# define le64toh(x) (x)
|
||||
|
||||
# elif BYTE_ORDER == BIG_ENDIAN
|
||||
|
||||
/* that would be xbox 360 */
|
||||
# define htobe16(x) (x)
|
||||
# define htole16(x) __builtin_bswap16(x)
|
||||
# define be16toh(x) (x)
|
||||
# define le16toh(x) __builtin_bswap16(x)
|
||||
|
||||
# define htobe32(x) (x)
|
||||
# define htole32(x) __builtin_bswap32(x)
|
||||
# define be32toh(x) (x)
|
||||
# define le32toh(x) __builtin_bswap32(x)
|
||||
|
||||
# define htobe64(x) (x)
|
||||
# define htole64(x) __builtin_bswap64(x)
|
||||
# define be64toh(x) (x)
|
||||
# define le64toh(x) __builtin_bswap64(x)
|
||||
|
||||
# else
|
||||
|
||||
# error byte order not supported
|
||||
|
||||
# endif
|
||||
|
||||
# define __BYTE_ORDER BYTE_ORDER
|
||||
# define __BIG_ENDIAN BIG_ENDIAN
|
||||
# define __LITTLE_ENDIAN LITTLE_ENDIAN
|
||||
# define __PDP_ENDIAN PDP_ENDIAN
|
||||
|
||||
#elif defined(__QNXNTO__)
|
||||
|
||||
# include <gulliver.h>
|
||||
|
||||
# define __LITTLE_ENDIAN 1234
|
||||
# define __BIG_ENDIAN 4321
|
||||
# define __PDP_ENDIAN 3412
|
||||
|
||||
# if defined(__BIGENDIAN__)
|
||||
|
||||
# define __BYTE_ORDER __BIG_ENDIAN
|
||||
|
||||
# define htobe16(x) (x)
|
||||
# define htobe32(x) (x)
|
||||
# define htobe64(x) (x)
|
||||
|
||||
# define htole16(x) ENDIAN_SWAP16(x)
|
||||
# define htole32(x) ENDIAN_SWAP32(x)
|
||||
# define htole64(x) ENDIAN_SWAP64(x)
|
||||
|
||||
# elif defined(__LITTLEENDIAN__)
|
||||
|
||||
# define __BYTE_ORDER __LITTLE_ENDIAN
|
||||
|
||||
# define htole16(x) (x)
|
||||
# define htole32(x) (x)
|
||||
# define htole64(x) (x)
|
||||
|
||||
# define htobe16(x) ENDIAN_SWAP16(x)
|
||||
# define htobe32(x) ENDIAN_SWAP32(x)
|
||||
# define htobe64(x) ENDIAN_SWAP64(x)
|
||||
|
||||
# else
|
||||
|
||||
# error byte order not supported
|
||||
|
||||
# endif
|
||||
|
||||
# define be16toh(x) ENDIAN_BE16(x)
|
||||
# define be32toh(x) ENDIAN_BE32(x)
|
||||
# define be64toh(x) ENDIAN_BE64(x)
|
||||
# define le16toh(x) ENDIAN_LE16(x)
|
||||
# define le32toh(x) ENDIAN_LE32(x)
|
||||
# define le64toh(x) ENDIAN_LE64(x)
|
||||
|
||||
#else
|
||||
|
||||
# error platform not supported
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -1,5 +1,3 @@
|
|||
#define _POSIX_C_SOURCE 200112L
|
||||
|
||||
#include "tree_sitter/api.h"
|
||||
#include "./array.h"
|
||||
#include "./get_changed_ranges.h"
|
||||
|
|
|
|||
|
|
@ -12,6 +12,29 @@ extern "C" {
|
|||
#define U_EXPORT2
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "portable/endian.h"
|
||||
|
||||
#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||
(c)=le16toh((s)[(i)++]); \
|
||||
if(U16_IS_LEAD(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
|
||||
++(i); \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||
} \
|
||||
} \
|
||||
} UPRV_BLOCK_MACRO_END
|
||||
|
||||
#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||
(c)=be16toh((s)[(i)++]); \
|
||||
if(U16_IS_LEAD(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
|
||||
++(i); \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||
} \
|
||||
} \
|
||||
} UPRV_BLOCK_MACRO_END
|
||||
|
||||
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
|
||||
|
||||
|
|
@ -33,13 +56,23 @@ static inline uint32_t ts_decode_utf8(
|
|||
return i;
|
||||
}
|
||||
|
||||
static inline uint32_t ts_decode_utf16(
|
||||
static inline uint32_t ts_decode_utf16_le(
|
||||
const uint8_t *string,
|
||||
uint32_t length,
|
||||
int32_t *code_point
|
||||
) {
|
||||
uint32_t i = 0;
|
||||
U16_NEXT(((uint16_t *)string), i, length, *code_point);
|
||||
U16_NEXT_LE(((uint16_t *)string), i, length, *code_point);
|
||||
return i * 2;
|
||||
}
|
||||
|
||||
// Decode a single code point from big-endian UTF-16 input.
//
// `string` is reinterpreted as an array of 16-bit units and handed to
// U16_NEXT_BE, which stores the decoded value in `*code_point`. The return
// value is the number of bytes consumed: `i` counts 16-bit units, hence the
// multiplication by two.
//
// NOTE(review): `length` is forwarded to the macro unchanged and compared
// against the unit index `i`; if callers pass a byte count (as the uint8_t
// pointer suggests) the end-of-input check is off by a factor of two —
// confirm the intended unit.
static inline uint32_t ts_decode_utf16_be(
  const uint8_t *string,
  uint32_t length,
  int32_t *code_point
) {
  uint32_t i = 0;
  U16_NEXT_BE(((uint16_t *)string), i, length, *code_point);
  return i * 2;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -131,6 +131,8 @@ $emcc \
|
|||
-std=c11 \
|
||||
-D 'fprintf(...)=' \
|
||||
-D NDEBUG= \
|
||||
-D _POSIX_C_SOURCE=200112L \
|
||||
-D _DEFAULT_SOURCE= \
|
||||
-I ${SRC_DIR} \
|
||||
-I lib/include \
|
||||
--js-library ${WEB_DIR}/imports.js \
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue