Automatically skip BOM characters at beginnings of files
Refs tree-sitter/tree-sitter-python#48
This commit is contained in:
parent
15629f1231
commit
0afbc31789
2 changed files with 65 additions and 0 deletions
|
|
@ -160,6 +160,65 @@ fn test_parsing_with_custom_utf16_input() {
|
|||
assert_eq!(root.child(0).unwrap().kind(), "function_item");
|
||||
}
|
||||
|
||||
/// Checks that a byte order mark (U+FEFF) at the very start of the input is
/// skipped by the parser, for both UTF-16 and UTF-8 text, and that the same
/// character is reported as an error once an edit moves it away from offset 0.
#[test]
fn test_parsing_text_with_byte_order_mark() {
    // Fixtures shared by the steps below.
    let bom_source = "\u{FEFF}fn a() {}";
    let shifted_source = " \u{FEFF}fn a() {}";
    let clean_sexp = "(source_file (function_item (identifier) (parameters) (block)))";
    let error_sexp =
        "(source_file (ERROR (UNEXPECTED 65279)) (function_item (identifier) (parameters) (block)))";

    let mut parser = Parser::new();
    parser.set_language(get_language("rust")).unwrap();

    // UTF-16 input: the BOM is a single code unit, so the root starts at byte 2.
    let utf16_units: Vec<u16> = bom_source.encode_utf16().collect();
    let utf16_tree = parser.parse_utf16(&utf16_units, None).unwrap();
    assert_eq!(utf16_tree.root_node().to_sexp(), clean_sexp);
    assert_eq!(utf16_tree.root_node().start_byte(), 2);

    // UTF-8 input: the BOM is encoded as three bytes, so the root starts at byte 3.
    let mut utf8_tree = parser.parse(bom_source, None).unwrap();
    assert_eq!(utf8_tree.root_node().to_sexp(), clean_sexp);
    assert_eq!(utf8_tree.root_node().start_byte(), 3);

    // Insert one character in front of the BOM. It is no longer at the start
    // of the text, so the reparse reports it as an unexpected character.
    let insert_leading_char = InputEdit {
        start_byte: 0,
        old_end_byte: 0,
        new_end_byte: 1,
        start_position: Point::new(0, 0),
        old_end_position: Point::new(0, 0),
        new_end_position: Point::new(0, 1),
    };
    utf8_tree.edit(&insert_leading_char);
    let mut shifted_tree = parser.parse(shifted_source, Some(&utf8_tree)).unwrap();
    assert_eq!(shifted_tree.root_node().to_sexp(), error_sexp);
    assert_eq!(shifted_tree.root_node().start_byte(), 1);

    // Remove that character again so the BOM is back at offset 0; the
    // reparse must skip it just like the initial parse did.
    let remove_leading_char = InputEdit {
        start_byte: 0,
        old_end_byte: 1,
        new_end_byte: 0,
        start_position: Point::new(0, 0),
        old_end_position: Point::new(0, 1),
        new_end_position: Point::new(0, 0),
    };
    shifted_tree.edit(&remove_leading_char);
    let restored_tree = parser.parse(bom_source, Some(&shifted_tree)).unwrap();
    assert_eq!(restored_tree.root_node().to_sexp(), clean_sexp);
    assert_eq!(restored_tree.root_node().start_byte(), 3);
}
|
||||
|
||||
// Incremental parsing
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@
|
|||
|
||||
// Three-element buffer of zero bytes: two elements are initialized explicitly
// and the remaining one is zero-initialized by C's aggregate rules.
// NOTE(review): its use is not visible in this hunk — presumably a stand-in
// chunk when the input yields no data; confirm against ts_lexer__get_chunk.
static const char empty_chunk[3] = { 0, 0 };
|
||||
|
||||
// Code point of the Unicode byte order mark (U+FEFF). ts_lexer_start compares
// the lookahead character against this value when the lexer is at byte 0.
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
|
||||
|
||||
static void ts_lexer__get_chunk(Lexer *self) {
|
||||
self->chunk_start = self->current_position.bytes;
|
||||
self->chunk = self->input.read(
|
||||
|
|
@ -248,6 +250,10 @@ void ts_lexer_start(Lexer *self) {
|
|||
self->data.result_symbol = 0;
|
||||
if (!self->chunk) ts_lexer__get_chunk(self);
|
||||
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
|
||||
if (
|
||||
self->current_position.bytes == 0 &&
|
||||
self->data.lookahead == BYTE_ORDER_MARK
|
||||
) ts_lexer__advance((TSLexer *)self, true);
|
||||
}
|
||||
|
||||
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue