feat!: properly handle UTF-16 endianness encoding

This commit is contained in:
Amaan Qureshi 2024-10-04 23:15:17 -04:00
parent cf8ed78a9a
commit 8943983df6
20 changed files with 485 additions and 50 deletions

View file

@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
}
#[test]
fn test_parsing_with_custom_utf16_input() {
fn test_parsing_with_custom_utf16le_input() {
let mut parser = Parser::new();
parser.set_language(&get_language("rust")).unwrap();
let lines = ["pub fn foo() {", " 1", "}"]
.iter()
.map(|s| s.encode_utf16().collect::<Vec<_>>())
.map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
.collect::<Vec<_>>();
let newline = [('\n' as u16).to_le()];
let tree = parser
.parse_utf16_with(
.parse_utf16_le_with(
&mut |_, position| {
let row = position.row;
let column = position.column;
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
if column < lines[row].len() {
&lines[row][column..]
} else {
&[10]
&newline
}
} else {
&[]
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
// Parses UTF-16 source whose code units are stored big-endian and handed to the
// parser piecewise through a callback, then checks the resulting syntax tree.
#[test]
fn test_parsing_with_custom_utf16_be_input() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // Encode every line to UTF-16 and byte-order each code unit in a single
    // pass. `u16::to_be` is a no-op on big-endian hosts and a byte swap on
    // little-endian ones, so the in-memory bytes are big-endian either way.
    let lines = ["pub fn foo() {", " 1", "}"]
        .iter()
        .map(|s| s.encode_utf16().map(|u| u.to_be()).collect::<Vec<_>>())
        .collect::<Vec<_>>();

    // The line separator must use the same byte order as the line contents.
    let newline = [('\n' as u16).to_be()];

    let tree = parser
        .parse_utf16_be_with(
            &mut |_, position| {
                let row = position.row;
                let column = position.column;
                if row < lines.len() {
                    if column < lines[row].len() {
                        // Remainder of the requested line, starting at `column`.
                        &lines[row][column..]
                    } else {
                        // At or past the end of the line: supply the newline.
                        &newline
                    }
                } else {
                    // Past the last line: nothing more to hand out.
                    &[]
                }
            },
            None,
        )
        .unwrap();

    let root = tree.root_node();
    assert_eq!(
        root.to_sexp(),
        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))"
    );
    assert_eq!(root.kind(), "source_file");
    assert!(!root.has_error());
    assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
#[test]
fn test_parsing_with_callback_returning_owned_strings() {
let mut parser = Parser::new();
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {
// Parse UTF16 text with a BOM
let tree = parser
.parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
.parse_utf16_le(
"\u{FEFF}fn a() {}"
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>(),
None,
)
.unwrap();
assert_eq!(
tree.root_node().to_sexp(),
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
let source_code = "<script>a.</script>";
let utf16_source_code = source_code
.as_bytes()
.iter()
.map(|c| u16::from(*c))
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>();
let start_byte = 2 * source_code.find("a.").unwrap();
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
end_point: Point::new(0, end_byte),
}])
.unwrap();
let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
}