diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs
index ddcdaa77..dff8fd2c 100644
--- a/cli/src/highlight.rs
+++ b/cli/src/highlight.rs
@@ -293,11 +293,11 @@ pub fn ansi(
{
let event = event.map_err(|e| e.to_string())?;
match event {
- HighlightEvent::Source(s) => {
+ HighlightEvent::Source { start, end } => {
if let Some(style) = highlight_stack.last().and_then(|s| theme.ansi_style(*s)) {
- write!(&mut stdout, "{}", style.paint(s))?;
+ style.paint(&source[start..end]).write_to(&mut stdout)?;
} else {
- write!(&mut stdout, "{}", s)?;
+ stdout.write_all(&source[start..end])?;
}
}
HighlightEvent::HighlightStart(h) => {
diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs
index 79e70546..34b545ff 100644
--- a/cli/src/tests/highlight_test.rs
+++ b/cli/src/tests/highlight_test.rs
@@ -420,10 +420,10 @@ fn test_highlighting_via_c_api() {
assert_eq!(
lines,
vec![
- "<script>",
- "const a = b('c');",
- "c.d();",
- "</script>",
+ "<script>\n",
+ "const a = b('c');\n",
+ "c.d();\n",
+ "</script>\n",
]
);
@@ -431,6 +431,23 @@ fn test_highlighting_via_c_api() {
c::ts_highlight_buffer_delete(buffer);
}
+/// Checks that `LossyUtf8` yields valid runs unchanged and substitutes
+/// U+FFFD for each invalid byte, wherever the invalid bytes occur.
+#[test]
+fn test_decode_utf8_lossy() {
+    use tree_sitter_highlight::util::LossyUtf8;
+
+    // Fully valid input comes back as a single chunk.
+    let parts = LossyUtf8::new(b"hi").collect::<Vec<_>>();
+    assert_eq!(parts, vec!["hi"]);
+
+    // Invalid bytes in the middle: one replacement per invalid byte.
+    let parts = LossyUtf8::new(b"hi\xc0\xc1bye").collect::<Vec<_>>();
+    assert_eq!(parts, vec!["hi", "\u{fffd}", "\u{fffd}", "bye"]);
+
+    // Invalid bytes at the start.
+    let parts = LossyUtf8::new(b"\xc0\xc1bye").collect::<Vec<_>>();
+    assert_eq!(parts, vec!["\u{fffd}", "\u{fffd}", "bye"]);
+
+    // Invalid bytes at the end.
+    let parts = LossyUtf8::new(b"hello\xc0\xc1").collect::<Vec<_>>();
+    assert_eq!(parts, vec!["hello", "\u{fffd}", "\u{fffd}"]);
+}
+
fn c_string(s: &str) -> CString {
CString::new(s.as_bytes().to_vec()).unwrap()
}
@@ -466,11 +483,12 @@ fn to_token_vector<'a>(
language: Language,
property_sheet: &'a PropertySheet,
) -> Result)>>, Error> {
+ let src = src.as_bytes();
let mut lines = Vec::new();
let mut highlights = Vec::new();
let mut line = Vec::new();
for event in highlight(
- src.as_bytes(),
+ src,
language,
property_sheet,
None,
@@ -481,7 +499,8 @@ fn to_token_vector<'a>(
HighlightEvent::HighlightEnd => {
highlights.pop();
}
- HighlightEvent::Source(s) => {
+ HighlightEvent::Source { start, end } => {
+ let s = str::from_utf8(&src[start..end]).unwrap();
for (i, l) in s.split("\n").enumerate() {
let l = l.trim_end_matches('\r');
if i > 0 {
diff --git a/highlight/src/c_lib.rs b/highlight/src/c_lib.rs
index 8ddb476b..063ab990 100644
--- a/highlight/src/c_lib.rs
+++ b/highlight/src/c_lib.rs
@@ -1,8 +1,7 @@
-use super::{escape, load_property_sheet, Error, Highlight, HighlightEvent, Highlighter, Properties};
+use super::{load_property_sheet, Error, Highlight, Highlighter, HtmlRenderer, Properties};
use regex::Regex;
use std::collections::HashMap;
use std::ffi::CStr;
-use std::io::Write;
use std::os::raw::c_char;
use std::process::abort;
use std::sync::atomic::AtomicUsize;
@@ -20,10 +19,7 @@ pub struct TSHighlighter {
attribute_strings: Vec<&'static [u8]>,
}
-pub struct TSHighlightBuffer {
- html: Vec,
- line_offsets: Vec,
-}
+/// Output buffer handed to C callers as an opaque pointer. It is a newtype
+/// around the crate's `HtmlRenderer`; the `ts_highlight_buffer_*` accessors
+/// read that renderer's `html` and `line_offsets` vectors.
+pub struct TSHighlightBuffer(HtmlRenderer);
#[repr(C)]
pub enum ErrorCode {
@@ -57,10 +53,7 @@ pub extern "C" fn ts_highlighter_new(
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_new() -> *mut TSHighlightBuffer {
- Box::into_raw(Box::new(TSHighlightBuffer {
- html: Vec::new(),
- line_offsets: Vec::new(),
- }))
+ Box::into_raw(Box::new(TSHighlightBuffer(HtmlRenderer::new())))
}
#[no_mangle]
@@ -76,25 +69,25 @@ pub extern "C" fn ts_highlight_buffer_delete(this: *mut TSHighlightBuffer) {
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_content(this: *const TSHighlightBuffer) -> *const u8 {
let this = unwrap_ptr(this);
- this.html.as_slice().as_ptr()
+ this.0.html.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_offsets(this: *const TSHighlightBuffer) -> *const u32 {
let this = unwrap_ptr(this);
- this.line_offsets.as_slice().as_ptr()
+ this.0.line_offsets.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_len(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
- this.html.len() as u32
+ this.0.html.len() as u32
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_count(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
- this.line_offsets.len() as u32
+ this.0.line_offsets.len() as u32
}
#[no_mangle]
@@ -183,77 +176,28 @@ impl TSHighlighter {
);
if let Ok(highlighter) = highlighter {
- output.html.clear();
- output.line_offsets.clear();
- output.line_offsets.push(0);
- let mut highlights = Vec::new();
- for event in highlighter {
- match event {
- Ok(HighlightEvent::HighlightStart(s)) => {
- highlights.push(s);
- output.start_highlight(s, &self.attribute_strings);
- }
- Ok(HighlightEvent::HighlightEnd) => {
- highlights.pop();
- output.end_highlight();
- }
- Ok(HighlightEvent::Source(src)) => {
- output.add_text(src, &highlights, &self.attribute_strings);
- },
- Err(Error::Cancelled) => {
- return ErrorCode::Timeout;
- },
- Err(Error::InvalidLanguage) => {
- return ErrorCode::InvalidLanguage;
- },
- Err(Error::Unknown) => {
- return ErrorCode::Timeout;
- }
+ output.0.reset();
+ let result = output.0.render(highlighter, source_code, &|s| {
+ self.attribute_strings[s as usize]
+ });
+ match result {
+ Err(Error::Cancelled) => {
+ return ErrorCode::Timeout;
}
+ Err(Error::InvalidLanguage) => {
+ return ErrorCode::InvalidLanguage;
+ }
+ Err(Error::Unknown) => {
+ return ErrorCode::Timeout;
+ }
+ Ok(()) => ErrorCode::Ok,
}
- ErrorCode::Ok
} else {
ErrorCode::Timeout
}
}
}
-impl TSHighlightBuffer {
- fn start_highlight(&mut self, h: Highlight, attribute_strings: &[&[u8]]) {
- let attribute_string = attribute_strings[h as usize];
- self.html.extend(b"");
- }
-
- fn end_highlight(&mut self) {
- self.html.extend(b"");
- }
-
- fn finish_line(&mut self) {
- self.line_offsets.push(self.html.len() as u32);
- }
-
- fn add_text(&mut self, src: &str, highlights: &Vec, attribute_strings: &[&[u8]]) {
- let mut multiline = false;
- for line in src.split('\n') {
- let line = line.trim_end_matches('\r');
- if multiline {
- highlights.iter().for_each(|_| self.end_highlight());
- self.finish_line();
- highlights
- .iter()
- .for_each(|scope| self.start_highlight(*scope, attribute_strings));
- }
- write!(&mut self.html, "{}", escape::Escape(line)).unwrap();
- multiline = true;
- }
- }
-}
-
fn unwrap_ptr<'a, T>(result: *const T) -> &'a T {
unsafe { result.as_ref() }.unwrap_or_else(|| {
eprintln!("{}:{} - pointer must not be null", file!(), line!());
diff --git a/highlight/src/escape.rs b/highlight/src/escape.rs
deleted file mode 100644
index 882f160c..00000000
--- a/highlight/src/escape.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2013 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
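-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your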
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-//! HTML Escaping
-//!
-//! This module contains one unit-struct which can be used to HTML-escape a
-//! string of text (for use in a format string).
-
-use std::fmt;
-
-/// Wrapper struct which will emit the HTML-escaped version of the contained
-/// string when passed to a format string.
-pub struct Escape<'a>(pub &'a str);
-
-impl<'a> fmt::Display for Escape<'a> {
- fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
- // Because the internet is always right, turns out there's not that many
- // characters to escape: http://stackoverflow.com/questions/7381974
- let Escape(s) = *self;
- let pile_o_bits = s;
- let mut last = 0;
- for (i, ch) in s.bytes().enumerate() {
- match ch as char {
- '<' | '>' | '&' | '\'' | '"' => {
- fmt.write_str(&pile_o_bits[last..i])?;
- let s = match ch as char {
- '>' => "&gt;",
- '<' => "&lt;",
- '&' => "&amp;",
- '\'' => "&#39;",
- '"' => "&quot;",
- _ => unreachable!(),
- };
- fmt.write_str(s)?;
- last = i + 1;
- }
- _ => {}
- }
- }
-
- if last < s.len() {
- fmt.write_str(&pile_o_bits[last..])?;
- }
- Ok(())
- }
-}
diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs
index 0663bf92..477a640d 100644
--- a/highlight/src/lib.rs
+++ b/highlight/src/lib.rs
@@ -1,13 +1,12 @@
pub mod c_lib;
-mod escape;
+pub mod util;
pub use c_lib as c;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_derive::*;
-use std::fmt::{self, Write};
use std::mem::transmute;
use std::sync::atomic::{AtomicUsize, Ordering};
-use std::{cmp, str, usize};
+use std::{cmp, fmt, str, usize};
use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor};
const CANCELLATION_CHECK_INTERVAL: usize = 100;
@@ -116,14 +115,13 @@ where
parser: Parser,
layers: Vec>,
max_opaque_layer_depth: usize,
- utf8_error_len: Option,
operation_count: usize,
cancellation_flag: Option<&'a AtomicUsize>,
}
#[derive(Copy, Clone, Debug)]
-pub enum HighlightEvent<'a> {
- Source(&'a str),
+pub enum HighlightEvent {
+ Source { start: usize, end: usize },
HighlightStart(Highlight),
HighlightEnd,
}
@@ -471,7 +469,6 @@ where
injection_callback,
source_offset: 0,
operation_count: 0,
- utf8_error_len: None,
max_opaque_layer_depth: 0,
layers: vec![Layer::new(
source,
@@ -489,30 +486,13 @@ where
})
}
- fn emit_source(&mut self, next_offset: usize) -> Option, Error>> {
- let input = &self.source[self.source_offset..next_offset];
- match str::from_utf8(input) {
- Ok(valid) => {
- self.source_offset = next_offset;
- Some(Ok(HighlightEvent::Source(valid)))
- }
- Err(error) => {
- if let Some(error_len) = error.error_len() {
- if error.valid_up_to() > 0 {
- let prefix = &input[0..error.valid_up_to()];
- self.utf8_error_len = Some(error_len);
- Some(Ok(HighlightEvent::Source(unsafe {
- str::from_utf8_unchecked(prefix)
- })))
- } else {
- self.source_offset += error_len;
- Some(Ok(HighlightEvent::Source("\u{FFFD}")))
- }
- } else {
- None
- }
- }
- }
+    /// Builds the `Source` event covering the bytes from the current source
+    /// offset up to `next_offset`, and advances the current offset to
+    /// `next_offset`.
+    fn emit_source(&mut self, next_offset: usize) -> HighlightEvent {
+        // Install the new offset and keep the previous one as the range start.
+        let start = std::mem::replace(&mut self.source_offset, next_offset);
+        HighlightEvent::Source {
+            start,
+            end: next_offset,
+        }
+    }
fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec) {
@@ -727,7 +707,7 @@ impl<'a, T> Iterator for Highlighter<'a, T>
where
T: Fn(&str) -> Option<(Language, &'a PropertySheet)>,
{
- type Item = Result, Error>;
+ type Item = Result;
fn next(&mut self) -> Option {
if let Some(cancellation_flag) = self.cancellation_flag {
@@ -740,11 +720,6 @@ where
}
}
- if let Some(utf8_error_len) = self.utf8_error_len.take() {
- self.source_offset += utf8_error_len;
- return Some(Ok(HighlightEvent::Source("\u{FFFD}")));
- }
-
while !self.layers.is_empty() {
let mut scope_event = None;
let first_layer = &self.layers[0];
@@ -808,7 +783,7 @@ where
// Before returning any highlight boundaries, return any remaining slice of
// the source code the precedes that highlight boundary.
if self.source_offset < next_offset {
- return self.emit_source(next_offset);
+ return Some(Ok(self.emit_source(next_offset)));
}
scope_event = if first_layer.at_node_end {
@@ -841,7 +816,7 @@ where
}
if self.source_offset < self.source.len() {
- self.emit_source(self.source.len())
+ Some(Ok(self.emit_source(self.source.len())))
} else {
None
}
@@ -1081,7 +1056,7 @@ pub fn highlight<'a, F>(
property_sheet: &'a PropertySheet,
cancellation_flag: Option<&'a AtomicUsize>,
injection_callback: F,
-) -> Result, Error>> + 'a, Error>
+) -> Result> + 'a, Error>
where
F: Fn(&str) -> Option<(Language, &'a PropertySheet)> + 'a,
{
@@ -1106,87 +1081,124 @@ where
F1: Fn(&str) -> Option<(Language, &'a PropertySheet)>,
F2: Fn(Highlight) -> &'a str,
{
- let highlighter = Highlighter::new(
+ let mut renderer = HtmlRenderer::new();
+ renderer.render(
+ Highlighter::new(
+ source,
+ language,
+ property_sheet,
+ injection_callback,
+ cancellation_flag,
+ )?,
source,
- language,
- property_sheet,
- injection_callback,
- cancellation_flag,
+ &|s| (attribute_callback)(s).as_bytes(),
)?;
- let mut renderer = HtmlRenderer::new(attribute_callback);
- let mut scopes = Vec::new();
- for event in highlighter {
- let event = event?;
- match event {
- HighlightEvent::HighlightStart(s) => {
- scopes.push(s);
- renderer.start_scope(s);
- }
- HighlightEvent::HighlightEnd => {
- scopes.pop();
- renderer.end_scope();
- }
- HighlightEvent::Source(src) => {
- renderer.add_text(src, &scopes);
- }
- };
- }
- if !renderer.current_line.is_empty() {
- renderer.finish_line();
- }
- Ok(renderer.result)
+ Ok(renderer
+ .line_offsets
+ .iter()
+ .enumerate()
+ .map(|(i, offset)| {
+ let offset = *offset as usize;
+ let next_offset = renderer
+ .line_offsets
+ .get(i + 1)
+ .map_or(renderer.html.len(), |i| *i as usize);
+ String::from_utf8(renderer.html[offset..next_offset].to_vec()).unwrap()
+ })
+ .collect())
}
-struct HtmlRenderer<'a, F: Fn(Highlight) -> &'a str> {
- result: Vec,
- current_line: String,
- attribute_callback: F,
+/// Accumulates rendered HTML as raw bytes together with the position of
+/// each rendered line inside that byte buffer.
+pub struct HtmlRenderer {
+    /// The rendered HTML bytes.
+    pub html: Vec<u8>,
+    /// Byte offset into `html` at which each rendered line begins.
+    pub line_offsets: Vec<u32>,
+}
-impl<'a, F> HtmlRenderer<'a, F>
-where
- F: Fn(Highlight) -> &'a str,
-{
- fn new(attribute_callback: F) -> Self {
+impl HtmlRenderer {
+ fn new() -> Self {
HtmlRenderer {
- result: Vec::new(),
- current_line: String::new(),
- attribute_callback,
+ html: Vec::new(),
+ line_offsets: vec![0],
}
}
- fn start_scope(&mut self, s: Highlight) {
- write!(
- &mut self.current_line,
- "",
- (self.attribute_callback)(s),
- )
- .unwrap();
+ pub fn reset(&mut self) {
+ self.html.clear();
+ self.line_offsets.clear();
+ self.line_offsets.push(0);
}
- fn end_scope(&mut self) {
- write!(&mut self.current_line, "").unwrap();
- }
-
- fn finish_line(&mut self) {
- self.current_line.push('\n');
- self.result.push(self.current_line.clone());
- self.current_line.clear();
- }
-
- fn add_text(&mut self, src: &str, scopes: &Vec) {
- let mut multiline = false;
- for line in src.split('\n') {
- let line = line.trim_end_matches('\r');
- if multiline {
- scopes.iter().for_each(|_| self.end_scope());
- self.finish_line();
- scopes
- .iter()
- .for_each(|highlight| self.start_scope(*highlight));
+    /// Renders every event produced by `highlighter` into `self.html`,
+    /// recording line starts in `self.line_offsets`.
+    ///
+    /// `source` is the byte buffer that `Source { start, end }` events index
+    /// into, and `attribute_callback` supplies the attribute bytes written
+    /// into the opening tag of each highlight.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first error yielded by `highlighter`. Output rendered
+    /// before the error is left in the buffers; nothing here clears them.
+    pub fn render<'a, F>(
+        &mut self,
+        highlighter: impl Iterator<Item = Result<HighlightEvent, Error>>,
+        source: &'a [u8],
+        attribute_callback: &F,
+    ) -> Result<(), Error>
+    where
+        F: Fn(Highlight) -> &'a [u8],
+    {
+        // Stack of currently open highlights, so that `add_text` can close
+        // and reopen them around every line break.
+        let mut highlights = Vec::new();
+        for event in highlighter {
+            match event? {
+                HighlightEvent::HighlightStart(s) => {
+                    highlights.push(s);
+                    self.start_highlight(s, attribute_callback);
+                }
+                HighlightEvent::HighlightEnd => {
+                    highlights.pop();
+                    self.end_highlight();
+                }
+                HighlightEvent::Source { start, end } => {
+                    self.add_text(&source[start..end], &highlights, attribute_callback);
+                }
+            }
+        }
+        // Make sure the output ends with a newline.
+        if self.html.last() != Some(&b'\n') {
+            self.html.push(b'\n');
+        }
+        // A final offset equal to the buffer length would describe an empty
+        // line after the last newline; drop it.
+        if self.line_offsets.last() == Some(&(self.html.len() as u32)) {
+            self.line_offsets.pop();
+        }
+        Ok(())
+    }
+
+    /// Appends the opening `<span>` tag for highlight `h` to `self.html`.
+    ///
+    /// The bytes returned by `attribute_callback` for `h` are written inside
+    /// the tag, separated from the tag name by a space; when they are empty a
+    /// bare `<span>` is written.
+    fn start_highlight<'a, F>(&mut self, h: Highlight, attribute_callback: &F)
+    where
+        F: Fn(Highlight) -> &'a [u8],
+    {
+        let attribute_string = (attribute_callback)(h);
+        self.html.extend(b"<span");
+        if !attribute_string.is_empty() {
+            self.html.extend(b" ");
+            self.html.extend(attribute_string);
+        }
+        self.html.extend(b">");
+    }
+
+    /// Appends the closing `</span>` tag for the innermost open highlight.
+    fn end_highlight(&mut self) {
+        self.html.extend(b"</span>");
+    }
+
+    /// Appends the source bytes `src` to `self.html` as escaped HTML text.
+    ///
+    /// The bytes are decoded with `util::LossyUtf8` and written one byte at a
+    /// time: bytes for which `util::html_escape` returns an entity are
+    /// replaced by it, and all other bytes are copied through. At every `\n`
+    /// the open `highlights` are closed before the newline and reopened after
+    /// it, and the start of the new line is recorded in `self.line_offsets`.
+    fn add_text<'a, F>(&mut self, src: &[u8], highlights: &Vec<Highlight>, attribute_callback: &F)
+    where
+        F: Fn(Highlight) -> &'a [u8],
+    {
+        for c in util::LossyUtf8::new(src).flat_map(|p| p.bytes()) {
+            if c == b'\n' {
+                // Drop the `\r` of a `\r\n` pair so lines end in a bare `\n`.
+                if self.html.ends_with(b"\r") {
+                    self.html.pop();
+                }
+                highlights.iter().for_each(|_| self.end_highlight());
+                self.html.push(c);
+                self.line_offsets.push(self.html.len() as u32);
+                highlights
+                    .iter()
+                    .for_each(|scope| self.start_highlight(*scope, attribute_callback));
+            } else if let Some(escape) = util::html_escape(c) {
+                self.html.extend_from_slice(escape);
+            } else {
+                self.html.push(c);
}
- write!(&mut self.current_line, "{}", escape::Escape(line)).unwrap();
- multiline = true;
}
}
}
diff --git a/highlight/src/util.rs b/highlight/src/util.rs
new file mode 100644
index 00000000..6c325a6c
--- /dev/null
+++ b/highlight/src/util.rs
@@ -0,0 +1,63 @@
+use std::str;
+
+/// Iterator that decodes a byte slice as UTF-8 without allocating.
+///
+/// Each maximal valid run is yielded as a `&str` borrowed from the input,
+/// and each invalid sequence is yielded as `"\u{fffd}"`.
+pub struct LossyUtf8<'a> {
+    /// Input that has not been decoded yet.
+    bytes: &'a [u8],
+    /// Set when a valid run has just been returned and the replacement
+    /// character for the invalid sequence after it is still owed.
+    in_replacement: bool,
+}
+
+impl<'a> LossyUtf8<'a> {
+    /// Creates a decoder over `bytes`.
+    pub fn new(bytes: &'a [u8]) -> Self {
+        LossyUtf8 {
+            bytes,
+            in_replacement: false,
+        }
+    }
+}
+
+impl<'a> Iterator for LossyUtf8<'a> {
+    type Item = &'a str;
+
+    fn next(&mut self) -> Option<&'a str> {
+        // Emit a pending replacement before checking for exhausted input: the
+        // invalid sequence may have been the very last bytes, in which case
+        // `bytes` is already empty but the replacement is still owed.
+        if self.in_replacement {
+            self.in_replacement = false;
+            return Some("\u{fffd}");
+        }
+        if self.bytes.is_empty() {
+            return None;
+        }
+        let bytes = self.bytes;
+        match str::from_utf8(bytes) {
+            Ok(valid) => {
+                self.bytes = &[];
+                Some(valid)
+            }
+            Err(error) => {
+                let (valid, rest) = bytes.split_at(error.valid_up_to());
+                // `error_len()` is `None` when the input stops in the middle
+                // of a sequence; the whole remaining tail is then treated as
+                // one invalid sequence instead of being silently dropped.
+                let invalid_len = error.error_len().unwrap_or(rest.len());
+                self.bytes = &rest[invalid_len..];
+                if valid.is_empty() {
+                    Some("\u{fffd}")
+                } else {
+                    self.in_replacement = true;
+                    // SAFETY: `from_utf8` reported exactly the first
+                    // `valid_up_to()` bytes as well-formed UTF-8.
+                    Some(unsafe { str::from_utf8_unchecked(valid) })
+                }
+            }
+        }
+    }
+}
+
+/// Returns the HTML entity that must replace byte `c` in markup, or `None`
+/// when the byte can be written unchanged.
+pub fn html_escape(c: u8) -> Option<&'static [u8]> {
+    match c {
+        b'>' => Some(b"&gt;"),
+        b'<' => Some(b"&lt;"),
+        b'&' => Some(b"&amp;"),
+        b'\'' => Some(b"&#39;"),
+        b'"' => Some(b"&quot;"),
+        _ => None,
+    }
+}