feat!: properly handle UTF-16 endianness encoding

This commit is contained in:
Amaan Qureshi 2024-10-04 23:15:17 -04:00
parent cf8ed78a9a
commit 8943983df6
20 changed files with 485 additions and 50 deletions

View file

@ -6,7 +6,7 @@ use std::{
use anstyle::{AnsiColor, Color, Style};
use anyhow::{anyhow, Context, Result};
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand};
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum};
use clap_complete::{generate, Shell};
use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input};
use glob::glob;
@ -191,7 +191,7 @@ struct Parse {
)]
pub edits: Option<Vec<String>>,
#[arg(long, help = "The encoding of the input files")]
pub encoding: Option<String>,
pub encoding: Option<Encoding>,
#[arg(
long,
help = "Open `log.html` in the default browser, if `--debug-graph` is supplied"
@ -208,6 +208,13 @@ struct Parse {
pub no_ranges: bool,
}
/// Text encodings that can be selected for the input files via the
/// `--encoding` command-line option.
///
/// The type is a fieldless enum, so it is cheap to copy and compare; `Debug`
/// is derived so values can appear in diagnostics.
//
// Variant notes below are plain `//` comments on purpose: `///` doc comments
// on `ValueEnum` variants are picked up by clap as per-value help text, which
// would change the generated `--help` output.
#[derive(ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
pub enum Encoding {
    // UTF-8 input.
    Utf8,
    // UTF-16 input, little-endian byte order.
    Utf16LE,
    // UTF-16 input, big-endian byte order.
    Utf16BE,
}
#[derive(Args)]
#[command(about = "Run a parser's tests", alias = "t")]
struct Test {
@ -773,15 +780,11 @@ impl Parse {
ParseOutput::Normal
};
let encoding = if let Some(encoding) = self.encoding {
match encoding.as_str() {
"utf16" => Some(ffi::TSInputEncodingUTF16),
"utf8" => Some(ffi::TSInputEncodingUTF8),
_ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")),
}
} else {
None
};
let encoding = self.encoding.map(|e| match e {
Encoding::Utf8 => ffi::TSInputEncodingUTF8,
Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE,
Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE,
});
let time = self.time;
let edits = self.edits.unwrap_or_default();

View file

@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul
let time = Instant::now();
#[inline(always)]
fn is_utf16_bom(bom_bytes: &[u8]) -> bool {
bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF]
/// Returns `true` when `bom_bytes` is exactly the two-byte sequence `FF FE`,
/// the UTF-16 little-endian byte order mark.
fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool {
    // The slice pattern only matches a slice of length two with these bytes.
    matches!(bom_bytes, [0xFF, 0xFE])
}
let tree = match opts.encoding {
Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16(&source_code_utf16, None)
/// Returns `true` when `bom_bytes` is exactly the two-byte sequence `FE FF`,
/// the UTF-16 big-endian byte order mark.
#[inline(always)]
fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool {
    // The slice pattern only matches a slice of length two with these bytes.
    matches!(bom_bytes, [0xFE, 0xFF])
}
let encoding = match opts.encoding {
None if source_code.len() >= 2 => {
if is_utf16_le_bom(&source_code[0..2]) {
Some(ffi::TSInputEncodingUTF16LE)
} else if is_utf16_be_bom(&source_code[0..2]) {
Some(ffi::TSInputEncodingUTF16BE)
} else {
None
}
}
None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => {
_ => opts.encoding,
};
let tree = match encoding {
Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16(&source_code_utf16, None)
parser.parse_utf16_le(&source_code_utf16, None)
}
Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => {
let source_code_utf16 = source_code
.chunks_exact(2)
.map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
.collect::<Vec<_>>();
parser.parse_utf16_be(&source_code_utf16, None)
}
_ => parser.parse(&source_code, None),
};

View file

@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
}
#[test]
fn test_parsing_with_custom_utf16_input() {
fn test_parsing_with_custom_utf16le_input() {
let mut parser = Parser::new();
parser.set_language(&get_language("rust")).unwrap();
let lines = ["pub fn foo() {", " 1", "}"]
.iter()
.map(|s| s.encode_utf16().collect::<Vec<_>>())
.map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
.collect::<Vec<_>>();
let newline = [('\n' as u16).to_le()];
let tree = parser
.parse_utf16_with(
.parse_utf16_le_with(
&mut |_, position| {
let row = position.row;
let column = position.column;
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
if column < lines[row].len() {
&lines[row][column..]
} else {
&[10]
&newline
}
} else {
&[]
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
/// Parses a small Rust function supplied as UTF-16 code units that were
/// converted with `to_be`, handing the text to the parser chunk by chunk
/// through a callback, and checks the resulting syntax tree.
#[test]
fn test_parsing_with_custom_utf16_be_input() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("rust")).unwrap();

    // One buffer of `to_be`-converted code units per source line.
    let lines = ["pub fn foo() {", " 1", "}"]
        .iter()
        .map(|text| text.encode_utf16().map(u16::to_be).collect::<Vec<u16>>())
        .collect::<Vec<_>>();

    // Line terminator returned once the callback is asked for a column at or
    // past the end of a line.
    let newline = [u16::from(b'\n').to_be()];

    let tree = parser
        .parse_utf16_be_with(
            &mut |_, position| {
                let (row, column) = (position.row, position.column);
                match lines.get(row) {
                    // Still inside the line: return the rest of it.
                    Some(line) if column < line.len() => &line[column..],
                    // At or past the end of an existing line.
                    Some(_) => &newline,
                    // Past the last line: an empty chunk.
                    None => &[],
                }
            },
            None,
        )
        .unwrap();

    let root = tree.root_node();
    assert_eq!(
        root.to_sexp(),
        "(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))"
    );
    assert_eq!(root.kind(), "source_file");
    assert!(!root.has_error());
    assert_eq!(root.child(0).unwrap().kind(), "function_item");
}
#[test]
fn test_parsing_with_callback_returning_owned_strings() {
let mut parser = Parser::new();
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {
// Parse UTF16 text with a BOM
let tree = parser
.parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
.parse_utf16_le(
"\u{FEFF}fn a() {}"
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>(),
None,
)
.unwrap();
assert_eq!(
tree.root_node().to_sexp(),
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
let source_code = "<script>a.</script>";
let utf16_source_code = source_code
.as_bytes()
.iter()
.map(|c| u16::from(*c))
.encode_utf16()
.map(|u| u.to_le())
.collect::<Vec<_>>();
let start_byte = 2 * source_code.find("a.").unwrap();
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
end_point: Point::new(0, end_byte),
}])
.unwrap();
let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
}