feat!: properly handle UTF-16 endianness encoding
This commit is contained in:
parent
cf8ed78a9a
commit
8943983df6
20 changed files with 485 additions and 50 deletions
|
|
@ -6,7 +6,7 @@ use std::{
|
|||
|
||||
use anstyle::{AnsiColor, Color, Style};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand};
|
||||
use clap::{crate_authors, Args, Command, FromArgMatches as _, Subcommand, ValueEnum};
|
||||
use clap_complete::{generate, Shell};
|
||||
use dialoguer::{theme::ColorfulTheme, Confirm, FuzzySelect, Input};
|
||||
use glob::glob;
|
||||
|
|
@ -191,7 +191,7 @@ struct Parse {
|
|||
)]
|
||||
pub edits: Option<Vec<String>>,
|
||||
#[arg(long, help = "The encoding of the input files")]
|
||||
pub encoding: Option<String>,
|
||||
pub encoding: Option<Encoding>,
|
||||
#[arg(
|
||||
long,
|
||||
help = "Open `log.html` in the default browser, if `--debug-graph` is supplied"
|
||||
|
|
@ -208,6 +208,13 @@ struct Parse {
|
|||
pub no_ranges: bool,
|
||||
}
|
||||
|
||||
// Input encodings that can be chosen through the `encoding` argument of
// `Parse` (declared there as `Option<Encoding>`, help text "The encoding of
// the input files"). `Parse` later translates each variant into the
// corresponding `ffi::TSInputEncoding*` constant.
#[derive(ValueEnum, Clone)]
|
||||
pub enum Encoding {
|
||||
// UTF-8 input; translated to `ffi::TSInputEncodingUTF8`.
Utf8,
|
||||
// Little-endian UTF-16 input; translated to `ffi::TSInputEncodingUTF16LE`.
Utf16LE,
|
||||
// Big-endian UTF-16 input; translated to `ffi::TSInputEncodingUTF16BE`.
Utf16BE,
|
||||
}
|
||||
|
||||
#[derive(Args)]
|
||||
#[command(about = "Run a parser's tests", alias = "t")]
|
||||
struct Test {
|
||||
|
|
@ -773,15 +780,11 @@ impl Parse {
|
|||
ParseOutput::Normal
|
||||
};
|
||||
|
||||
let encoding = if let Some(encoding) = self.encoding {
|
||||
match encoding.as_str() {
|
||||
"utf16" => Some(ffi::TSInputEncodingUTF16),
|
||||
"utf8" => Some(ffi::TSInputEncodingUTF8),
|
||||
_ => return Err(anyhow!("Invalid encoding. Expected one of: utf8, utf16")),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let encoding = self.encoding.map(|e| match e {
|
||||
Encoding::Utf8 => ffi::TSInputEncodingUTF8,
|
||||
Encoding::Utf16LE => ffi::TSInputEncodingUTF16LE,
|
||||
Encoding::Utf16BE => ffi::TSInputEncodingUTF16BE,
|
||||
});
|
||||
|
||||
let time = self.time;
|
||||
let edits = self.edits.unwrap_or_default();
|
||||
|
|
|
|||
|
|
@ -100,24 +100,42 @@ pub fn parse_file_at_path(parser: &mut Parser, opts: &ParseFileOptions) -> Resul
|
|||
let time = Instant::now();
|
||||
|
||||
#[inline(always)]
|
||||
fn is_utf16_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFF, 0xFE] || bom_bytes == [0xFE, 0xFF]
|
||||
fn is_utf16_le_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFF, 0xFE]
|
||||
}
|
||||
|
||||
let tree = match opts.encoding {
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16 => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16(&source_code_utf16, None)
|
||||
// Returns `true` only when `bom_bytes` is exactly the two bytes FE FF, which
// this function treats as the big-endian UTF-16 byte order mark. A slice of
// any other length compares unequal and yields `false`. It is applied to the
// first two bytes of `source_code` when no encoding was supplied in `opts`.
#[inline(always)]
|
||||
fn is_utf16_be_bom(bom_bytes: &[u8]) -> bool {
|
||||
bom_bytes == [0xFE, 0xFF]
|
||||
}
|
||||
|
||||
let encoding = match opts.encoding {
|
||||
None if source_code.len() >= 2 => {
|
||||
if is_utf16_le_bom(&source_code[0..2]) {
|
||||
Some(ffi::TSInputEncodingUTF16LE)
|
||||
} else if is_utf16_be_bom(&source_code[0..2]) {
|
||||
Some(ffi::TSInputEncodingUTF16BE)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
None if source_code.len() >= 2 && is_utf16_bom(&source_code[0..2]) => {
|
||||
_ => opts.encoding,
|
||||
};
|
||||
|
||||
let tree = match encoding {
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16LE => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16(&source_code_utf16, None)
|
||||
parser.parse_utf16_le(&source_code_utf16, None)
|
||||
}
|
||||
Some(encoding) if encoding == ffi::TSInputEncodingUTF16BE => {
|
||||
let source_code_utf16 = source_code
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
|
||||
.collect::<Vec<_>>();
|
||||
parser.parse_utf16_be(&source_code_utf16, None)
|
||||
}
|
||||
_ => parser.parse(&source_code, None),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -155,17 +155,19 @@ fn test_parsing_with_custom_utf8_input() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_with_custom_utf16_input() {
|
||||
fn test_parsing_with_custom_utf16le_input() {
|
||||
let mut parser = Parser::new();
|
||||
parser.set_language(&get_language("rust")).unwrap();
|
||||
|
||||
let lines = ["pub fn foo() {", " 1", "}"]
|
||||
.iter()
|
||||
.map(|s| s.encode_utf16().collect::<Vec<_>>())
|
||||
.map(|s| s.encode_utf16().map(|u| u.to_le()).collect::<Vec<_>>())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let newline = [('\n' as u16).to_le()];
|
||||
|
||||
let tree = parser
|
||||
.parse_utf16_with(
|
||||
.parse_utf16_le_with(
|
||||
&mut |_, position| {
|
||||
let row = position.row;
|
||||
let column = position.column;
|
||||
|
|
@ -173,7 +175,7 @@ fn test_parsing_with_custom_utf16_input() {
|
|||
if column < lines[row].len() {
|
||||
&lines[row][column..]
|
||||
} else {
|
||||
&[10]
|
||||
&newline
|
||||
}
|
||||
} else {
|
||||
&[]
|
||||
|
|
@ -193,6 +195,47 @@ fn test_parsing_with_custom_utf16_input() {
|
|||
assert_eq!(root.child(0).unwrap().kind(), "function_item");
|
||||
}
|
||||
|
||||
// Feeds a small Rust function to `parse_utf16_be_with` through a callback
// that serves big-endian UTF-16 code units, then checks that the resulting
// tree is an error-free `source_file` containing a single `function_item`.
#[test]
|
||||
fn test_parsing_with_custom_utf16_be_input() {
|
||||
let mut parser = Parser::new();
|
||||
parser.set_language(&get_language("rust")).unwrap();
|
||||

||||
// Encode every source line as UTF-16 and then pass each code unit through
// `to_be()`, so the units handed to the parser are in big-endian byte order
// regardless of the host's native order.
let lines: Vec<Vec<u16>> = ["pub fn foo() {", " 1", "}"]
|
||||
.iter()
|
||||
.map(|s| s.encode_utf16().collect::<Vec<_>>())
|
||||
.map(|v| v.iter().map(|u| u.to_be()).collect())
|
||||
.collect();
|
||||

||||
// The line terminator is not stored in `lines`; it is served separately and
// needs the same big-endian conversion as the line contents.
let newline = [('\n' as u16).to_be()];
|
||||

||||
let tree = parser
|
||||
.parse_utf16_be_with(
|
||||
// Input callback: the first argument is ignored and only the row/column
// of `position` is used to decide what to return.
&mut |_, position| {
|
||||
let row = position.row;
|
||||
let column = position.column;
|
||||
if row < lines.len() {
|
||||
if column < lines[row].len() {
|
||||
// Inside a line: return the rest of that line starting at `column`.
&lines[row][column..]
|
||||
} else {
|
||||
// At or past the end of a line: return the newline unit.
&newline
|
||||
}
|
||||
} else {
|
||||
// Past the last line: return an empty slice.
// NOTE(review): presumably this is how end of input is signalled — confirm
// against the `parse_utf16_be_with` documentation.
&[]
|
||||
}
|
||||
},
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
let root = tree.root_node();
|
||||
// The full S-expression pins the exact tree shape expected for the input.
assert_eq!(
|
||||
root.to_sexp(),
|
||||
"(source_file (function_item (visibility_modifier) name: (identifier) parameters: (parameters) body: (block (integer_literal))))"
|
||||
);
|
||||
assert_eq!(root.kind(), "source_file");
|
||||
assert!(!root.has_error());
|
||||
assert_eq!(root.child(0).unwrap().kind(), "function_item");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_with_callback_returning_owned_strings() {
|
||||
let mut parser = Parser::new();
|
||||
|
|
@ -221,7 +264,13 @@ fn test_parsing_text_with_byte_order_mark() {
|
|||
|
||||
// Parse little-endian UTF-16 text with a BOM
|
||||
let tree = parser
|
||||
.parse_utf16("\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(), None)
|
||||
.parse_utf16_le(
|
||||
"\u{FEFF}fn a() {}"
|
||||
.encode_utf16()
|
||||
.map(|u| u.to_le())
|
||||
.collect::<Vec<_>>(),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
tree.root_node().to_sexp(),
|
||||
|
|
@ -1084,9 +1133,8 @@ fn test_parsing_error_in_invalid_included_ranges() {
|
|||
fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
|
||||
let source_code = "<script>a.</script>";
|
||||
let utf16_source_code = source_code
|
||||
.as_bytes()
|
||||
.iter()
|
||||
.map(|c| u16::from(*c))
|
||||
.encode_utf16()
|
||||
.map(|u| u.to_le())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let start_byte = 2 * source_code.find("a.").unwrap();
|
||||
|
|
@ -1102,7 +1150,7 @@ fn test_parsing_utf16_code_with_errors_at_the_end_of_an_included_range() {
|
|||
end_point: Point::new(0, end_byte),
|
||||
}])
|
||||
.unwrap();
|
||||
let tree = parser.parse_utf16(&utf16_source_code, None).unwrap();
|
||||
let tree = parser.parse_utf16_le(&utf16_source_code, None).unwrap();
|
||||
assert_eq!(tree.root_node().to_sexp(), "(program (ERROR (identifier)))");
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue