From 0afbc317893c13f39d3b6402d679f9b57c6764ea Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Fri, 2 Aug 2019 11:30:43 -0700
Subject: [PATCH] Automatically skip BOM characters at beginnings of files

Refs tree-sitter/tree-sitter-python#48
---
 cli/src/tests/parser_test.rs | 59 ++++++++++++++++++++++++++++++++++++
 lib/src/lexer.c              |  6 ++++
 2 files changed, 65 insertions(+)

diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs
index 96c2ba5d..184afd04 100644
--- a/cli/src/tests/parser_test.rs
+++ b/cli/src/tests/parser_test.rs
@@ -160,6 +160,65 @@ fn test_parsing_with_custom_utf16_input() {
     assert_eq!(root.child(0).unwrap().kind(), "function_item");
 }
 
+#[test]
+fn test_parsing_text_with_byte_order_mark() {
+    let mut parser = Parser::new();
+    parser.set_language(get_language("rust")).unwrap();
+
+    // Parse UTF-16 text with a leading BOM: no error node, and the root starts at byte 2.
+    let tree = parser
+        .parse_utf16(
+            &"\u{FEFF}fn a() {}".encode_utf16().collect::<Vec<_>>(),
+            None,
+        )
+        .unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 2);
+
+    // Parse UTF-8 text with a leading BOM: no error node, and the root starts at byte 3.
+    let mut tree = parser.parse("\u{FEFF}fn a() {}", None).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 3);
+
+    // Insert a space before the BOM. No longer at byte 0, the BOM (65279 == 0xFEFF) is an error.
+    tree.edit(&InputEdit {
+        start_byte: 0,
+        old_end_byte: 0,
+        new_end_byte: 1,
+        start_position: Point::new(0, 0),
+        old_end_position: Point::new(0, 0),
+        new_end_position: Point::new(0, 1),
+    });
+    let mut tree = parser.parse(" \u{FEFF}fn a() {}", Some(&tree)).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (ERROR (UNEXPECTED 65279)) (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 1);
+
+    // Delete the space again so the BOM is back at byte 0; the reparse must have no error node.
+    tree.edit(&InputEdit {
+        start_byte: 0,
+        old_end_byte: 1,
+        new_end_byte: 0,
+        start_position: Point::new(0, 0),
+        old_end_position: Point::new(0, 1),
+        new_end_position: Point::new(0, 0),
+    });
+    let tree = parser.parse("\u{FEFF}fn a() {}", Some(&tree)).unwrap();
+    assert_eq!(
+        tree.root_node().to_sexp(),
+        "(source_file (function_item (identifier) (parameters) (block)))"
+    );
+    assert_eq!(tree.root_node().start_byte(), 3);
+}
+
 // Incremental parsing
 
 #[test]
diff --git a/lib/src/lexer.c b/lib/src/lexer.c
index b33da344..fdc12746 100644
--- a/lib/src/lexer.c
+++ b/lib/src/lexer.c
@@ -20,6 +20,8 @@
 
 static const char empty_chunk[3] = { 0, 0 };
 
+static const int32_t BYTE_ORDER_MARK = 0xFEFF; // U+FEFF; compared with the lookahead at byte 0
+
 static void ts_lexer__get_chunk(Lexer *self) {
   self->chunk_start = self->current_position.bytes;
   self->chunk = self->input.read(
@@ -248,6 +250,10 @@ void ts_lexer_start(Lexer *self) {
   self->data.result_symbol = 0;
   if (!self->chunk) ts_lexer__get_chunk(self);
   if (!self->lookahead_size) ts_lexer__get_lookahead(self);
+  if (
+    self->current_position.bytes == 0 &&
+    self->data.lookahead == BYTE_ORDER_MARK
+  ) ts_lexer__advance((TSLexer *)self, true);
 }
 
 void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {