From 0afbc317893c13f39d3b6402d679f9b57c6764ea Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Fri, 2 Aug 2019 11:30:43 -0700
Subject: [PATCH] Automatically skip BOM characters at beginnings of files

Refs tree-sitter/tree-sitter-python#48
---
Reviewer notes (free text after the `---` marker; not part of the commit):
* lib/src/lexer.c: ts_lexer_start now advances past the lookahead when it
  equals BYTE_ORDER_MARK (0xFEFF) and the lexer's current byte position is 0.
  The `true` argument to ts_lexer__advance is presumably its "skip" flag;
  confirm against the definition, which is outside this patch.
* cli/src/tests/parser_test.rs: the new test asserts that the root node
  starts after the BOM (byte 2 for UTF-16 input, byte 3 for UTF-8 input), and
  that a BOM preceded by a space is reported as (ERROR (UNEXPECTED 65279));
  65279 is 0xFEFF in decimal.

 cli/src/tests/parser_test.rs | 59 ++++++++++++++++++++++++++++++++++++
 lib/src/lexer.c              |  6 ++++
 2 files changed, 65 insertions(+)

diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs
index 96c2ba5d..184afd04 100644
--- a/cli/src/tests/parser_test.rs
+++ b/cli/src/tests/parser_test.rs
@@ -160,6 +160,65 @@ fn test_parsing_with_custom_utf16_input() {
     assert_eq!(root.child(0).unwrap().kind(), "function_item");
 }
 
+#[test]
+fn test_parsing_text_with_byte_order_mark() {
+    let mut parser = Parser::new();
+    parser.set_language(get_language("rust")).unwrap();
+
+    // Parse UTF16 text with a BOM
+    let tree = parser
+        .parse_utf16(
+            &"\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(),
+            None,
+        )
+        .unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 2);
+
+    // Parse UTF8 text with a BOM
+    let mut tree = parser.parse("\u{FEFF}fn a() {}", None).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 3);
+
+    // Edit the text, inserting a character before the BOM. The BOM is now an error.
+    tree.edit(&InputEdit {
+        start_byte: 0,
+        old_end_byte: 0,
+        new_end_byte: 1,
+        start_position: Point::new(0, 0),
+        old_end_position: Point::new(0, 0),
+        new_end_position: Point::new(0, 1),
+    });
+    let mut tree = parser.parse(" \u{FEFF}fn a() {}", Some(&tree)).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (ERROR (UNEXPECTED 65279)) (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 1);
+
+    // Edit the text again, putting the BOM back at the beginning.
+    tree.edit(&InputEdit {
+        start_byte: 0,
+        old_end_byte: 1,
+        new_end_byte: 0,
+        start_position: Point::new(0, 0),
+        old_end_position: Point::new(0, 1),
+        new_end_position: Point::new(0, 0),
+    });
+    let tree = parser.parse("\u{FEFF}fn a() {}", Some(&tree)).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 3);
+}
+
 // Incremental parsing
 
 #[test]
diff --git a/lib/src/lexer.c b/lib/src/lexer.c
index b33da344..fdc12746 100644
--- a/lib/src/lexer.c
+++ b/lib/src/lexer.c
@@ -20,6 +20,8 @@
 
 static const char empty_chunk[3] = { 0, 0 };
 
+static const int32_t BYTE_ORDER_MARK = 0xFEFF;
+
 static void ts_lexer__get_chunk(Lexer *self) {
   self->chunk_start = self->current_position.bytes;
   self->chunk = self->input.read(
@@ -248,6 +250,10 @@ void ts_lexer_start(Lexer *self) {
   self->data.result_symbol = 0;
   if (!self->chunk) ts_lexer__get_chunk(self);
   if (!self->lookahead_size) ts_lexer__get_lookahead(self);
+  if (
+    self->current_position.bytes == 0 &&
+    self->data.lookahead == BYTE_ORDER_MARK
+  ) ts_lexer__advance((TSLexer *)self, true);
 }
 
 void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {