highlight iterator: Return byte offset ranges instead of string slices

Refs #443
This commit is contained in:
Max Brunsfeld 2019-06-18 20:36:24 -07:00
parent 82fab90c0b
commit 62538ed410
6 changed files with 232 additions and 247 deletions

View file

@ -293,11 +293,11 @@ pub fn ansi(
{
let event = event.map_err(|e| e.to_string())?;
match event {
HighlightEvent::Source(s) => {
HighlightEvent::Source { start, end } => {
if let Some(style) = highlight_stack.last().and_then(|s| theme.ansi_style(*s)) {
write!(&mut stdout, "{}", style.paint(s))?;
style.paint(&source[start..end]).write_to(&mut stdout)?;
} else {
write!(&mut stdout, "{}", s)?;
stdout.write_all(&source[start..end])?;
}
}
HighlightEvent::HighlightStart(h) => {

View file

@ -420,10 +420,10 @@ fn test_highlighting_via_c_api() {
assert_eq!(
lines,
vec![
"&lt;<span class=tag>script</span>&gt;",
"<span class=keyword>const</span> <span>a</span> <span>=</span> <span class=function>b</span><span>(</span><span class=string>&#39;c&#39;</span><span>)</span><span>;</span>",
"<span>c</span><span>.</span><span class=function>d</span><span>(</span><span>)</span><span>;</span>",
"&lt;/<span class=tag>script</span>&gt;",
"&lt;<span class=tag>script</span>&gt;\n",
"<span class=keyword>const</span> <span>a</span> <span>=</span> <span class=function>b</span><span>(</span><span class=string>&#39;c&#39;</span><span>)</span><span>;</span>\n",
"<span>c</span><span>.</span><span class=function>d</span><span>(</span><span>)</span><span>;</span>\n",
"&lt;/<span class=tag>script</span>&gt;\n",
]
);
@ -431,6 +431,23 @@ fn test_highlighting_via_c_api() {
c::ts_highlight_buffer_delete(buffer);
}
/// Checks that `LossyUtf8` splits its input into valid UTF-8 runs and emits one
/// U+FFFD item per invalid byte, whether the invalid bytes sit in the middle,
/// at the start, or at the end of the input.
#[test]
fn test_decode_utf8_lossy() {
    use tree_sitter_highlight::util::LossyUtf8;

    // Collect every piece the iterator yields for the given input.
    let decode = |input: &'static [u8]| LossyUtf8::new(input).collect::<Vec<_>>();

    // Entirely valid input comes back as a single piece.
    assert_eq!(decode(b"hi"), vec!["hi"]);

    // Invalid bytes between two valid runs.
    assert_eq!(
        decode(b"hi\xc0\xc1bye"),
        vec!["hi", "\u{fffd}", "\u{fffd}", "bye"]
    );

    // Invalid bytes at the very start.
    assert_eq!(decode(b"\xc0\xc1bye"), vec!["\u{fffd}", "\u{fffd}", "bye"]);

    // Invalid bytes at the very end.
    assert_eq!(
        decode(b"hello\xc0\xc1"),
        vec!["hello", "\u{fffd}", "\u{fffd}"]
    );
}
/// Builds an owned, NUL-terminated C string from `s`.
///
/// Panics if `s` contains an interior NUL byte.
fn c_string(s: &str) -> CString {
    // `CString::new` accepts anything convertible into `Vec<u8>`, `&str` included.
    CString::new(s).unwrap()
}
@ -466,11 +483,12 @@ fn to_token_vector<'a>(
language: Language,
property_sheet: &'a PropertySheet<Properties>,
) -> Result<Vec<Vec<(&'a str, Vec<Highlight>)>>, Error> {
let src = src.as_bytes();
let mut lines = Vec::new();
let mut highlights = Vec::new();
let mut line = Vec::new();
for event in highlight(
src.as_bytes(),
src,
language,
property_sheet,
None,
@ -481,7 +499,8 @@ fn to_token_vector<'a>(
HighlightEvent::HighlightEnd => {
highlights.pop();
}
HighlightEvent::Source(s) => {
HighlightEvent::Source { start, end } => {
let s = str::from_utf8(&src[start..end]).unwrap();
for (i, l) in s.split("\n").enumerate() {
let l = l.trim_end_matches('\r');
if i > 0 {

View file

@ -1,8 +1,7 @@
use super::{escape, load_property_sheet, Error, Highlight, HighlightEvent, Highlighter, Properties};
use super::{load_property_sheet, Error, Highlight, Highlighter, HtmlRenderer, Properties};
use regex::Regex;
use std::collections::HashMap;
use std::ffi::CStr;
use std::io::Write;
use std::os::raw::c_char;
use std::process::abort;
use std::sync::atomic::AtomicUsize;
@ -20,10 +19,7 @@ pub struct TSHighlighter {
attribute_strings: Vec<&'static [u8]>,
}
pub struct TSHighlightBuffer {
html: Vec<u8>,
line_offsets: Vec<u32>,
}
pub struct TSHighlightBuffer(HtmlRenderer);
#[repr(C)]
pub enum ErrorCode {
@ -57,10 +53,7 @@ pub extern "C" fn ts_highlighter_new(
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_new() -> *mut TSHighlightBuffer {
Box::into_raw(Box::new(TSHighlightBuffer {
html: Vec::new(),
line_offsets: Vec::new(),
}))
Box::into_raw(Box::new(TSHighlightBuffer(HtmlRenderer::new())))
}
#[no_mangle]
@ -76,25 +69,25 @@ pub extern "C" fn ts_highlight_buffer_delete(this: *mut TSHighlightBuffer) {
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_content(this: *const TSHighlightBuffer) -> *const u8 {
let this = unwrap_ptr(this);
this.html.as_slice().as_ptr()
this.0.html.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_offsets(this: *const TSHighlightBuffer) -> *const u32 {
let this = unwrap_ptr(this);
this.line_offsets.as_slice().as_ptr()
this.0.line_offsets.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_len(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
this.html.len() as u32
this.0.html.len() as u32
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_count(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
this.line_offsets.len() as u32
this.0.line_offsets.len() as u32
}
#[no_mangle]
@ -183,77 +176,28 @@ impl TSHighlighter {
);
if let Ok(highlighter) = highlighter {
output.html.clear();
output.line_offsets.clear();
output.line_offsets.push(0);
let mut highlights = Vec::new();
for event in highlighter {
match event {
Ok(HighlightEvent::HighlightStart(s)) => {
highlights.push(s);
output.start_highlight(s, &self.attribute_strings);
}
Ok(HighlightEvent::HighlightEnd) => {
highlights.pop();
output.end_highlight();
}
Ok(HighlightEvent::Source(src)) => {
output.add_text(src, &highlights, &self.attribute_strings);
},
Err(Error::Cancelled) => {
return ErrorCode::Timeout;
},
Err(Error::InvalidLanguage) => {
return ErrorCode::InvalidLanguage;
},
Err(Error::Unknown) => {
return ErrorCode::Timeout;
}
output.0.reset();
let result = output.0.render(highlighter, source_code, &|s| {
self.attribute_strings[s as usize]
});
match result {
Err(Error::Cancelled) => {
return ErrorCode::Timeout;
}
Err(Error::InvalidLanguage) => {
return ErrorCode::InvalidLanguage;
}
Err(Error::Unknown) => {
return ErrorCode::Timeout;
}
Ok(()) => ErrorCode::Ok,
}
ErrorCode::Ok
} else {
ErrorCode::Timeout
}
}
}
impl TSHighlightBuffer {
fn start_highlight(&mut self, h: Highlight, attribute_strings: &[&[u8]]) {
let attribute_string = attribute_strings[h as usize];
self.html.extend(b"<span");
if !attribute_string.is_empty() {
self.html.extend(b" ");
self.html.extend(attribute_string);
}
self.html.extend(b">");
}
fn end_highlight(&mut self) {
self.html.extend(b"</span>");
}
fn finish_line(&mut self) {
self.line_offsets.push(self.html.len() as u32);
}
fn add_text(&mut self, src: &str, highlights: &Vec<Highlight>, attribute_strings: &[&[u8]]) {
let mut multiline = false;
for line in src.split('\n') {
let line = line.trim_end_matches('\r');
if multiline {
highlights.iter().for_each(|_| self.end_highlight());
self.finish_line();
highlights
.iter()
.for_each(|scope| self.start_highlight(*scope, attribute_strings));
}
write!(&mut self.html, "{}", escape::Escape(line)).unwrap();
multiline = true;
}
}
}
fn unwrap_ptr<'a, T>(result: *const T) -> &'a T {
unsafe { result.as_ref() }.unwrap_or_else(|| {
eprintln!("{}:{} - pointer must not be null", file!(), line!());

View file

@ -1,53 +0,0 @@
// Copyright 2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! HTML Escaping
//!
//! This module contains one unit-struct which can be used to HTML-escape a
//! string of text (for use in a format string).
use std::fmt;
/// Display adapter that HTML-escapes the wrapped string.
///
/// Formatting an `Escape` writes the string with `<`, `>`, `&`, `'` and `"`
/// replaced by their HTML entities; every other byte is written unchanged.
pub struct Escape<'a>(pub &'a str);

impl<'a> fmt::Display for Escape<'a> {
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let text = self.0;
        // Start of the run of bytes that has not been written out yet.
        let mut pending_from = 0;

        // All escaped characters are ASCII, so scanning bytes is enough and
        // every index used for slicing falls on a character boundary.
        for (index, byte) in text.bytes().enumerate() {
            let entity = match byte {
                b'>' => "&gt;",
                b'<' => "&lt;",
                b'&' => "&amp;",
                b'\'' => "&#39;",
                b'"' => "&quot;",
                _ => continue,
            };
            // Flush the untouched text before the special character, then the
            // entity that replaces it.
            fmt.write_str(&text[pending_from..index])?;
            fmt.write_str(entity)?;
            pending_from = index + 1;
        }

        // Flush whatever follows the last escaped character.
        if pending_from < text.len() {
            fmt.write_str(&text[pending_from..])?;
        }
        Ok(())
    }
}

View file

@ -1,13 +1,12 @@
pub mod c_lib;
mod escape;
pub mod util;
pub use c_lib as c;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_derive::*;
use std::fmt::{self, Write};
use std::mem::transmute;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{cmp, str, usize};
use std::{cmp, fmt, str, usize};
use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor};
const CANCELLATION_CHECK_INTERVAL: usize = 100;
@ -116,14 +115,13 @@ where
parser: Parser,
layers: Vec<Layer<'a>>,
max_opaque_layer_depth: usize,
utf8_error_len: Option<usize>,
operation_count: usize,
cancellation_flag: Option<&'a AtomicUsize>,
}
#[derive(Copy, Clone, Debug)]
pub enum HighlightEvent<'a> {
Source(&'a str),
pub enum HighlightEvent {
Source { start: usize, end: usize },
HighlightStart(Highlight),
HighlightEnd,
}
@ -471,7 +469,6 @@ where
injection_callback,
source_offset: 0,
operation_count: 0,
utf8_error_len: None,
max_opaque_layer_depth: 0,
layers: vec![Layer::new(
source,
@ -489,30 +486,13 @@ where
})
}
fn emit_source(&mut self, next_offset: usize) -> Option<Result<HighlightEvent<'a>, Error>> {
let input = &self.source[self.source_offset..next_offset];
match str::from_utf8(input) {
Ok(valid) => {
self.source_offset = next_offset;
Some(Ok(HighlightEvent::Source(valid)))
}
Err(error) => {
if let Some(error_len) = error.error_len() {
if error.valid_up_to() > 0 {
let prefix = &input[0..error.valid_up_to()];
self.utf8_error_len = Some(error_len);
Some(Ok(HighlightEvent::Source(unsafe {
str::from_utf8_unchecked(prefix)
})))
} else {
self.source_offset += error_len;
Some(Ok(HighlightEvent::Source("\u{FFFD}")))
}
} else {
None
}
}
}
fn emit_source(&mut self, next_offset: usize) -> HighlightEvent {
let result = HighlightEvent::Source {
start: self.source_offset,
end: next_offset,
};
self.source_offset = next_offset;
result
}
fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec<Node>) {
@ -727,7 +707,7 @@ impl<'a, T> Iterator for Highlighter<'a, T>
where
T: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)>,
{
type Item = Result<HighlightEvent<'a>, Error>;
type Item = Result<HighlightEvent, Error>;
fn next(&mut self) -> Option<Self::Item> {
if let Some(cancellation_flag) = self.cancellation_flag {
@ -740,11 +720,6 @@ where
}
}
if let Some(utf8_error_len) = self.utf8_error_len.take() {
self.source_offset += utf8_error_len;
return Some(Ok(HighlightEvent::Source("\u{FFFD}")));
}
while !self.layers.is_empty() {
let mut scope_event = None;
let first_layer = &self.layers[0];
@ -808,7 +783,7 @@ where
// Before returning any highlight boundaries, return any remaining slice of
// the source code that precedes that highlight boundary.
if self.source_offset < next_offset {
return self.emit_source(next_offset);
return Some(Ok(self.emit_source(next_offset)));
}
scope_event = if first_layer.at_node_end {
@ -841,7 +816,7 @@ where
}
if self.source_offset < self.source.len() {
self.emit_source(self.source.len())
Some(Ok(self.emit_source(self.source.len())))
} else {
None
}
@ -1081,7 +1056,7 @@ pub fn highlight<'a, F>(
property_sheet: &'a PropertySheet<Properties>,
cancellation_flag: Option<&'a AtomicUsize>,
injection_callback: F,
) -> Result<impl Iterator<Item = Result<HighlightEvent<'a>, Error>> + 'a, Error>
) -> Result<impl Iterator<Item = Result<HighlightEvent, Error>> + 'a, Error>
where
F: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)> + 'a,
{
@ -1106,87 +1081,124 @@ where
F1: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)>,
F2: Fn(Highlight) -> &'a str,
{
let highlighter = Highlighter::new(
let mut renderer = HtmlRenderer::new();
renderer.render(
Highlighter::new(
source,
language,
property_sheet,
injection_callback,
cancellation_flag,
)?,
source,
language,
property_sheet,
injection_callback,
cancellation_flag,
&|s| (attribute_callback)(s).as_bytes(),
)?;
let mut renderer = HtmlRenderer::new(attribute_callback);
let mut scopes = Vec::new();
for event in highlighter {
let event = event?;
match event {
HighlightEvent::HighlightStart(s) => {
scopes.push(s);
renderer.start_scope(s);
}
HighlightEvent::HighlightEnd => {
scopes.pop();
renderer.end_scope();
}
HighlightEvent::Source(src) => {
renderer.add_text(src, &scopes);
}
};
}
if !renderer.current_line.is_empty() {
renderer.finish_line();
}
Ok(renderer.result)
Ok(renderer
.line_offsets
.iter()
.enumerate()
.map(|(i, offset)| {
let offset = *offset as usize;
let next_offset = renderer
.line_offsets
.get(i + 1)
.map_or(renderer.html.len(), |i| *i as usize);
String::from_utf8(renderer.html[offset..next_offset].to_vec()).unwrap()
})
.collect())
}
struct HtmlRenderer<'a, F: Fn(Highlight) -> &'a str> {
result: Vec<String>,
current_line: String,
attribute_callback: F,
pub struct HtmlRenderer {
pub html: Vec<u8>,
pub line_offsets: Vec<u32>,
}
impl<'a, F> HtmlRenderer<'a, F>
where
F: Fn(Highlight) -> &'a str,
{
fn new(attribute_callback: F) -> Self {
impl HtmlRenderer {
fn new() -> Self {
HtmlRenderer {
result: Vec::new(),
current_line: String::new(),
attribute_callback,
html: Vec::new(),
line_offsets: vec![0],
}
}
fn start_scope(&mut self, s: Highlight) {
write!(
&mut self.current_line,
"<span {}>",
(self.attribute_callback)(s),
)
.unwrap();
pub fn reset(&mut self) {
self.html.clear();
self.line_offsets.clear();
self.line_offsets.push(0);
}
fn end_scope(&mut self) {
write!(&mut self.current_line, "</span>").unwrap();
}
fn finish_line(&mut self) {
self.current_line.push('\n');
self.result.push(self.current_line.clone());
self.current_line.clear();
}
fn add_text(&mut self, src: &str, scopes: &Vec<Highlight>) {
let mut multiline = false;
for line in src.split('\n') {
let line = line.trim_end_matches('\r');
if multiline {
scopes.iter().for_each(|_| self.end_scope());
self.finish_line();
scopes
.iter()
.for_each(|highlight| self.start_scope(*highlight));
pub fn render<'a, F>(
&mut self,
highlighter: impl Iterator<Item = Result<HighlightEvent, Error>>,
source: &'a [u8],
attribute_callback: &F,
) -> Result<(), Error>
where
F: Fn(Highlight) -> &'a [u8],
{
let mut highlights = Vec::new();
for event in highlighter {
match event {
Ok(HighlightEvent::HighlightStart(s)) => {
highlights.push(s);
self.start_highlight(s, attribute_callback);
}
Ok(HighlightEvent::HighlightEnd) => {
highlights.pop();
self.end_highlight();
}
Ok(HighlightEvent::Source { start, end }) => {
self.add_text(&source[start..end], &highlights, attribute_callback);
}
Err(a) => return Err(a),
}
}
if self.html.last() != Some(&b'\n') {
self.html.push(b'\n');
}
if self.line_offsets.last() == Some(&(self.html.len() as u32)) {
self.line_offsets.pop();
}
Ok(())
}
fn start_highlight<'a, F>(&mut self, h: Highlight, attribute_callback: &F)
where
F: Fn(Highlight) -> &'a [u8],
{
let attribute_string = (attribute_callback)(h);
self.html.extend(b"<span");
if !attribute_string.is_empty() {
self.html.extend(b" ");
self.html.extend(attribute_string);
}
self.html.extend(b">");
}
fn end_highlight(&mut self) {
self.html.extend(b"</span>");
}
fn add_text<'a, F>(&mut self, src: &[u8], highlights: &Vec<Highlight>, attribute_callback: &F)
where
F: Fn(Highlight) -> &'a [u8],
{
for c in util::LossyUtf8::new(src).flat_map(|p| p.bytes()) {
if c == b'\n' {
if self.html.ends_with(b"\r") {
self.html.pop();
}
highlights.iter().for_each(|_| self.end_highlight());
self.html.push(c);
self.line_offsets.push(self.html.len() as u32);
highlights
.iter()
.for_each(|scope| self.start_highlight(*scope, attribute_callback));
} else if let Some(escape) = util::html_escape(c) {
self.html.extend_from_slice(escape);
} else {
self.html.push(c);
}
write!(&mut self.current_line, "{}", escape::Escape(line)).unwrap();
multiline = true;
}
}
}

63
highlight/src/util.rs Normal file
View file

@ -0,0 +1,63 @@
use std::str;
/// Iterator that lossily decodes a byte slice as UTF-8.
///
/// Each item is either a run of valid UTF-8 borrowed from the input, or the
/// string `"\u{fffd}"` (U+FFFD REPLACEMENT CHARACTER) standing in for one
/// invalid sequence as reported by `str::from_utf8`. An input that ends in the
/// middle of a multi-byte sequence yields a single U+FFFD for the truncated
/// tail.
pub struct LossyUtf8<'a> {
    /// Bytes that have not been decoded yet.
    bytes: &'a [u8],
    /// True when a valid run was just yielded and the replacement character
    /// for the invalid sequence that followed it is still owed.
    in_replacement: bool,
}

impl<'a> LossyUtf8<'a> {
    /// Creates a decoder over `bytes`. No decoding happens until iteration.
    pub fn new(bytes: &'a [u8]) -> Self {
        LossyUtf8 {
            bytes,
            in_replacement: false,
        }
    }
}

impl<'a> Iterator for LossyUtf8<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<&'a str> {
        // Pay out an owed replacement character first. This must come before
        // the emptiness check: the invalid sequence may have been the last
        // thing in the input, leaving `bytes` empty while U+FFFD is still due.
        if self.in_replacement {
            self.in_replacement = false;
            return Some("\u{fffd}");
        }
        if self.bytes.is_empty() {
            return None;
        }

        let bytes = self.bytes;
        match str::from_utf8(bytes) {
            Ok(valid) => {
                self.bytes = &[];
                Some(valid)
            }
            Err(error) => {
                let valid_len = error.valid_up_to();
                // `error_len() == None` means the input ends in the middle of
                // a multi-byte sequence; treat the whole tail as one invalid
                // sequence so the valid prefix is not lost.
                let invalid_end = error
                    .error_len()
                    .map_or(bytes.len(), |error_len| valid_len + error_len);
                self.bytes = &bytes[invalid_end..];

                if valid_len > 0 {
                    self.in_replacement = true;
                    // SAFETY: `valid_up_to()` is the length of the longest
                    // prefix of `bytes` that `from_utf8` verified as UTF-8.
                    Some(unsafe { str::from_utf8_unchecked(&bytes[..valid_len]) })
                } else {
                    Some("\u{fffd}")
                }
            }
        }
    }
}
/// Returns the HTML entity for the byte `c`, or `None` when `c` can be
/// written to HTML output unchanged.
///
/// The escaped bytes are `>`, `<`, `&`, `'` and `"`.
pub fn html_escape(c: u8) -> Option<&'static [u8]> {
    let entity: &'static [u8] = match c {
        b'>' => b"&gt;",
        b'<' => b"&lt;",
        b'&' => b"&amp;",
        b'\'' => b"&#39;",
        b'"' => b"&quot;",
        _ => return None,
    };
    Some(entity)
}