diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs index ddcdaa77..dff8fd2c 100644 --- a/cli/src/highlight.rs +++ b/cli/src/highlight.rs @@ -293,11 +293,11 @@ pub fn ansi( { let event = event.map_err(|e| e.to_string())?; match event { - HighlightEvent::Source(s) => { + HighlightEvent::Source { start, end } => { if let Some(style) = highlight_stack.last().and_then(|s| theme.ansi_style(*s)) { - write!(&mut stdout, "{}", style.paint(s))?; + style.paint(&source[start..end]).write_to(&mut stdout)?; } else { - write!(&mut stdout, "{}", s)?; + stdout.write_all(&source[start..end])?; } } HighlightEvent::HighlightStart(h) => { diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs index 79e70546..34b545ff 100644 --- a/cli/src/tests/highlight_test.rs +++ b/cli/src/tests/highlight_test.rs @@ -420,10 +420,10 @@ fn test_highlighting_via_c_api() { assert_eq!( lines, vec![ - "<script>", - "const a = b('c');", - "c.d();", - "</script>", + "<script>\n", + "const a = b('c');\n", + "c.d();\n", + "</script>\n", ] ); @@ -431,6 +431,23 @@ fn test_highlighting_via_c_api() { c::ts_highlight_buffer_delete(buffer); } +#[test] +fn test_decode_utf8_lossy() { + use tree_sitter_highlight::util::LossyUtf8; + + let parts = LossyUtf8::new(b"hi").collect::>(); + assert_eq!(parts, vec!["hi"]); + + let parts = LossyUtf8::new(b"hi\xc0\xc1bye").collect::>(); + assert_eq!(parts, vec!["hi", "\u{fffd}", "\u{fffd}", "bye"]); + + let parts = LossyUtf8::new(b"\xc0\xc1bye").collect::>(); + assert_eq!(parts, vec!["\u{fffd}", "\u{fffd}", "bye"]); + + let parts = LossyUtf8::new(b"hello\xc0\xc1").collect::>(); + assert_eq!(parts, vec!["hello", "\u{fffd}", "\u{fffd}"]); +} + fn c_string(s: &str) -> CString { CString::new(s.as_bytes().to_vec()).unwrap() } @@ -466,11 +483,12 @@ fn to_token_vector<'a>( language: Language, property_sheet: &'a PropertySheet, ) -> Result)>>, Error> { + let src = src.as_bytes(); let mut lines = Vec::new(); let mut highlights = Vec::new(); let mut line = Vec::new(); for event in highlight( - src.as_bytes(), + src, language, property_sheet, None, @@ -481,7 +499,8 @@ fn to_token_vector<'a>( HighlightEvent::HighlightEnd => { highlights.pop(); } - HighlightEvent::Source(s) => { + HighlightEvent::Source { start, end } => { + let s = str::from_utf8(&src[start..end]).unwrap(); for (i, l) in s.split("\n").enumerate() { let l = l.trim_end_matches('\r'); if i > 0 { diff --git a/highlight/src/c_lib.rs b/highlight/src/c_lib.rs index 8ddb476b..063ab990 100644 --- a/highlight/src/c_lib.rs +++ b/highlight/src/c_lib.rs @@ -1,8 +1,7 @@ -use super::{escape, load_property_sheet, Error, Highlight, HighlightEvent, Highlighter, Properties}; +use super::{load_property_sheet, Error, Highlight, Highlighter, HtmlRenderer, Properties}; use regex::Regex; use std::collections::HashMap; use std::ffi::CStr; -use std::io::Write; use std::os::raw::c_char; use std::process::abort; use std::sync::atomic::AtomicUsize; @@ -20,10 +19,7 @@ pub struct TSHighlighter { attribute_strings: Vec<&'static [u8]>, } -pub struct TSHighlightBuffer { - html: Vec, - line_offsets: Vec, -} +pub struct TSHighlightBuffer(HtmlRenderer); #[repr(C)] pub enum ErrorCode { @@ -57,10 +53,7 @@ pub extern "C" fn ts_highlighter_new( #[no_mangle] pub extern "C" fn ts_highlight_buffer_new() -> *mut TSHighlightBuffer { - Box::into_raw(Box::new(TSHighlightBuffer { - html: Vec::new(), - line_offsets: Vec::new(), - })) + Box::into_raw(Box::new(TSHighlightBuffer(HtmlRenderer::new()))) } #[no_mangle] @@ -76,25 +69,25 @@ pub extern "C" fn ts_highlight_buffer_delete(this: *mut TSHighlightBuffer) { #[no_mangle] pub extern "C" fn ts_highlight_buffer_content(this: *const TSHighlightBuffer) -> *const u8 { let this = unwrap_ptr(this); - this.html.as_slice().as_ptr() + this.0.html.as_slice().as_ptr() } #[no_mangle] pub extern "C" fn ts_highlight_buffer_line_offsets(this: *const TSHighlightBuffer) -> *const u32 { let this = unwrap_ptr(this); - this.line_offsets.as_slice().as_ptr() + this.0.line_offsets.as_slice().as_ptr() } #[no_mangle] pub extern "C" fn ts_highlight_buffer_len(this: *const TSHighlightBuffer) -> u32 { let this = unwrap_ptr(this); - this.html.len() as u32 + this.0.html.len() as u32 } #[no_mangle] pub extern "C" fn ts_highlight_buffer_line_count(this: *const TSHighlightBuffer) -> u32 { let this = unwrap_ptr(this); - this.line_offsets.len() as u32 + this.0.line_offsets.len() as u32 } #[no_mangle] @@ -183,77 +176,28 @@ impl TSHighlighter { ); if let Ok(highlighter) = highlighter { - output.html.clear(); - output.line_offsets.clear(); - output.line_offsets.push(0); - let mut highlights = Vec::new(); - for event in highlighter { - match event { - Ok(HighlightEvent::HighlightStart(s)) => { - highlights.push(s); - output.start_highlight(s, &self.attribute_strings); - } - Ok(HighlightEvent::HighlightEnd) => { - highlights.pop(); - output.end_highlight(); - } - Ok(HighlightEvent::Source(src)) => { - output.add_text(src, &highlights, &self.attribute_strings); - }, - Err(Error::Cancelled) => { - return ErrorCode::Timeout; - }, - Err(Error::InvalidLanguage) => { - return ErrorCode::InvalidLanguage; - }, - Err(Error::Unknown) => { - return ErrorCode::Timeout; - } + output.0.reset(); + let result = output.0.render(highlighter, source_code, &|s| { + self.attribute_strings[s as usize] + }); + match result { + Err(Error::Cancelled) => { + return ErrorCode::Timeout; } + Err(Error::InvalidLanguage) => { + return ErrorCode::InvalidLanguage; + } + Err(Error::Unknown) => { + return ErrorCode::Timeout; + } + Ok(()) => ErrorCode::Ok, } - ErrorCode::Ok } else { ErrorCode::Timeout } } } -impl TSHighlightBuffer { - fn start_highlight(&mut self, h: Highlight, attribute_strings: &[&[u8]]) { - let attribute_string = attribute_strings[h as usize]; - self.html.extend(b""); - } - - fn end_highlight(&mut self) { - self.html.extend(b""); - } - - fn finish_line(&mut self) { - self.line_offsets.push(self.html.len() as u32); - } - - fn add_text(&mut self, src: &str, highlights: &Vec, attribute_strings: &[&[u8]]) { - let mut multiline = false; - for line in src.split('\n') { - let line = line.trim_end_matches('\r'); - if multiline { - highlights.iter().for_each(|_| self.end_highlight()); - self.finish_line(); - highlights - .iter() - .for_each(|scope| self.start_highlight(*scope, attribute_strings)); - } - write!(&mut self.html, "{}", escape::Escape(line)).unwrap(); - multiline = true; - } - } -} - fn unwrap_ptr<'a, T>(result: *const T) -> &'a T { unsafe { result.as_ref() }.unwrap_or_else(|| { eprintln!("{}:{} - pointer must not be null", file!(), line!()); diff --git a/highlight/src/escape.rs b/highlight/src/escape.rs deleted file mode 100644 index 882f160c..00000000 --- a/highlight/src/escape.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2013 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! HTML Escaping -//! -//! This module contains one unit-struct which can be used to HTML-escape a -//! string of text (for use in a format string). - -use std::fmt; - -/// Wrapper struct which will emit the HTML-escaped version of the contained -/// string when passed to a format string. -pub struct Escape<'a>(pub &'a str); - -impl<'a> fmt::Display for Escape<'a> { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - // Because the internet is always right, turns out there's not that many - // characters to escape: http://stackoverflow.com/questions/7381974 - let Escape(s) = *self; - let pile_o_bits = s; - let mut last = 0; - for (i, ch) in s.bytes().enumerate() { - match ch as char { - '<' | '>' | '&' | '\'' | '"' => { - fmt.write_str(&pile_o_bits[last..i])?; - let s = match ch as char { - '>' => ">", - '<' => "<", - '&' => "&", - '\'' => "'", - '"' => """, - _ => unreachable!(), - }; - fmt.write_str(s)?; - last = i + 1; - } - _ => {} - } - } - - if last < s.len() { - fmt.write_str(&pile_o_bits[last..])?; - } - Ok(()) - } -} diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index 0663bf92..477a640d 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -1,13 +1,12 @@ pub mod c_lib; -mod escape; +pub mod util; pub use c_lib as c; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_derive::*; -use std::fmt::{self, Write}; use std::mem::transmute; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{cmp, str, usize}; +use std::{cmp, fmt, str, usize}; use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor}; const CANCELLATION_CHECK_INTERVAL: usize = 100; @@ -116,14 +115,13 @@ where parser: Parser, layers: Vec>, max_opaque_layer_depth: usize, - utf8_error_len: Option, operation_count: usize, cancellation_flag: Option<&'a AtomicUsize>, } #[derive(Copy, Clone, Debug)] -pub enum HighlightEvent<'a> { - Source(&'a str), +pub enum HighlightEvent { + Source { start: usize, end: usize }, HighlightStart(Highlight), HighlightEnd, } @@ -471,7 +469,6 @@ where injection_callback, source_offset: 0, operation_count: 0, - utf8_error_len: None, max_opaque_layer_depth: 0, layers: vec![Layer::new( source, @@ -489,30 +486,13 @@ where }) } - fn emit_source(&mut self, next_offset: usize) -> Option, Error>> { - let input = &self.source[self.source_offset..next_offset]; - match str::from_utf8(input) { - Ok(valid) => { - self.source_offset = next_offset; - Some(Ok(HighlightEvent::Source(valid))) - } - Err(error) => { - if let Some(error_len) = error.error_len() { - if error.valid_up_to() > 0 { - let prefix = &input[0..error.valid_up_to()]; - self.utf8_error_len = Some(error_len); - Some(Ok(HighlightEvent::Source(unsafe { - str::from_utf8_unchecked(prefix) - }))) - } else { - self.source_offset += error_len; - Some(Ok(HighlightEvent::Source("\u{FFFD}"))) - } - } else { - None - } - } - } + fn emit_source(&mut self, next_offset: usize) -> HighlightEvent { + let result = HighlightEvent::Source { + start: self.source_offset, + end: next_offset, + }; + self.source_offset = next_offset; + result } fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec) { @@ -727,7 +707,7 @@ impl<'a, T> Iterator for Highlighter<'a, T> where T: Fn(&str) -> Option<(Language, &'a PropertySheet)>, { - type Item = Result, Error>; + type Item = Result; fn next(&mut self) -> Option { if let Some(cancellation_flag) = self.cancellation_flag { @@ -740,11 +720,6 @@ where } } - if let Some(utf8_error_len) = self.utf8_error_len.take() { - self.source_offset += utf8_error_len; - return Some(Ok(HighlightEvent::Source("\u{FFFD}"))); - } - while !self.layers.is_empty() { let mut scope_event = None; let first_layer = &self.layers[0]; @@ -808,7 +783,7 @@ where // Before returning any highlight boundaries, return any remaining slice of // the source code the precedes that highlight boundary. if self.source_offset < next_offset { - return self.emit_source(next_offset); + return Some(Ok(self.emit_source(next_offset))); } scope_event = if first_layer.at_node_end { @@ -841,7 +816,7 @@ where } if self.source_offset < self.source.len() { - self.emit_source(self.source.len()) + Some(Ok(self.emit_source(self.source.len()))) } else { None } @@ -1081,7 +1056,7 @@ pub fn highlight<'a, F>( property_sheet: &'a PropertySheet, cancellation_flag: Option<&'a AtomicUsize>, injection_callback: F, -) -> Result, Error>> + 'a, Error> +) -> Result> + 'a, Error> where F: Fn(&str) -> Option<(Language, &'a PropertySheet)> + 'a, { @@ -1106,87 +1081,124 @@ where F1: Fn(&str) -> Option<(Language, &'a PropertySheet)>, F2: Fn(Highlight) -> &'a str, { - let highlighter = Highlighter::new( + let mut renderer = HtmlRenderer::new(); + renderer.render( + Highlighter::new( + source, + language, + property_sheet, + injection_callback, + cancellation_flag, + )?, source, - language, - property_sheet, - injection_callback, - cancellation_flag, + &|s| (attribute_callback)(s).as_bytes(), )?; - let mut renderer = HtmlRenderer::new(attribute_callback); - let mut scopes = Vec::new(); - for event in highlighter { - let event = event?; - match event { - HighlightEvent::HighlightStart(s) => { - scopes.push(s); - renderer.start_scope(s); - } - HighlightEvent::HighlightEnd => { - scopes.pop(); - renderer.end_scope(); - } - HighlightEvent::Source(src) => { - renderer.add_text(src, &scopes); - } - }; - } - if !renderer.current_line.is_empty() { - renderer.finish_line(); - } - Ok(renderer.result) + Ok(renderer + .line_offsets + .iter() + .enumerate() + .map(|(i, offset)| { + let offset = *offset as usize; + let next_offset = renderer + .line_offsets + .get(i + 1) + .map_or(renderer.html.len(), |i| *i as usize); + String::from_utf8(renderer.html[offset..next_offset].to_vec()).unwrap() + }) + .collect()) } -struct HtmlRenderer<'a, F: Fn(Highlight) -> &'a str> { - result: Vec, - current_line: String, - attribute_callback: F, +pub struct HtmlRenderer { + pub html: Vec, + pub line_offsets: Vec, } -impl<'a, F> HtmlRenderer<'a, F> -where - F: Fn(Highlight) -> &'a str, -{ - fn new(attribute_callback: F) -> Self { +impl HtmlRenderer { + fn new() -> Self { HtmlRenderer { - result: Vec::new(), - current_line: String::new(), - attribute_callback, + html: Vec::new(), + line_offsets: vec![0], } } - fn start_scope(&mut self, s: Highlight) { - write!( - &mut self.current_line, - "", - (self.attribute_callback)(s), - ) - .unwrap(); + pub fn reset(&mut self) { + self.html.clear(); + self.line_offsets.clear(); + self.line_offsets.push(0); } - fn end_scope(&mut self) { - write!(&mut self.current_line, "").unwrap(); - } - - fn finish_line(&mut self) { - self.current_line.push('\n'); - self.result.push(self.current_line.clone()); - self.current_line.clear(); - } - - fn add_text(&mut self, src: &str, scopes: &Vec) { - let mut multiline = false; - for line in src.split('\n') { - let line = line.trim_end_matches('\r'); - if multiline { - scopes.iter().for_each(|_| self.end_scope()); - self.finish_line(); - scopes - .iter() - .for_each(|highlight| self.start_scope(*highlight)); + pub fn render<'a, F>( + &mut self, + highlighter: impl Iterator>, + source: &'a [u8], + attribute_callback: &F, + ) -> Result<(), Error> + where + F: Fn(Highlight) -> &'a [u8], + { + let mut highlights = Vec::new(); + for event in highlighter { + match event { + Ok(HighlightEvent::HighlightStart(s)) => { + highlights.push(s); + self.start_highlight(s, attribute_callback); + } + Ok(HighlightEvent::HighlightEnd) => { + highlights.pop(); + self.end_highlight(); + } + Ok(HighlightEvent::Source { start, end }) => { + self.add_text(&source[start..end], &highlights, attribute_callback); + } + Err(a) => return Err(a), + } + } + if self.html.last() != Some(&b'\n') { + self.html.push(b'\n'); + } + if self.line_offsets.last() == Some(&(self.html.len() as u32)) { + self.line_offsets.pop(); + } + Ok(()) + } + + fn start_highlight<'a, F>(&mut self, h: Highlight, attribute_callback: &F) + where + F: Fn(Highlight) -> &'a [u8], + { + let attribute_string = (attribute_callback)(h); + self.html.extend(b""); + } + + fn end_highlight(&mut self) { + self.html.extend(b""); + } + + fn add_text<'a, F>(&mut self, src: &[u8], highlights: &Vec, attribute_callback: &F) + where + F: Fn(Highlight) -> &'a [u8], + { + for c in util::LossyUtf8::new(src).flat_map(|p| p.bytes()) { + if c == b'\n' { + if self.html.ends_with(b"\r") { + self.html.pop(); + } + highlights.iter().for_each(|_| self.end_highlight()); + self.html.push(c); + self.line_offsets.push(self.html.len() as u32); + highlights + .iter() + .for_each(|scope| self.start_highlight(*scope, attribute_callback)); + } else if let Some(escape) = util::html_escape(c) { + self.html.extend_from_slice(escape); + } else { + self.html.push(c); } - write!(&mut self.current_line, "{}", escape::Escape(line)).unwrap(); - multiline = true; } } } diff --git a/highlight/src/util.rs b/highlight/src/util.rs new file mode 100644 index 00000000..6c325a6c --- /dev/null +++ b/highlight/src/util.rs @@ -0,0 +1,63 @@ +use std::str; + +pub struct LossyUtf8<'a> { + bytes: &'a [u8], + in_replacement: bool, +} + +impl<'a> LossyUtf8<'a> { + pub fn new(bytes: &'a [u8]) -> Self { + LossyUtf8 { + bytes, + in_replacement: false, + } + } +} + +impl<'a> Iterator for LossyUtf8<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.bytes.is_empty() { + return None; + } + if self.in_replacement { + self.in_replacement = false; + return Some("\u{fffd}"); + } + match str::from_utf8(self.bytes) { + Ok(valid) => { + self.bytes = &[]; + Some(valid) + } + Err(error) => { + if let Some(error_len) = error.error_len() { + let error_start = error.valid_up_to(); + if error_start > 0 { + let result = + unsafe { str::from_utf8_unchecked(&self.bytes[..error_start]) }; + self.bytes = &self.bytes[(error_start + error_len)..]; + self.in_replacement = true; + Some(result) + } else { + self.bytes = &self.bytes[error_len..]; + Some("\u{fffd}") + } + } else { + None + } + } + } + } +} + +pub fn html_escape(c: u8) -> Option<&'static [u8]> { + match c as char { + '>' => Some(b">"), + '<' => Some(b"<"), + '&' => Some(b"&"), + '\'' => Some(b"'"), + '"' => Some(b"""), + _ => None, + } +}