From 98e4fd22efb8e59086d4d612e91a21dff5854ca5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 8 Mar 2019 13:13:02 -0800 Subject: [PATCH] Start work on a C API for syntax highlighting --- cli/src/tests/helpers/fixtures.rs | 8 +- cli/src/tests/highlight_test.rs | 93 +++++++- highlight/Cargo.toml | 3 + highlight/include/tree_sitter/highlight.h | 102 +++++++++ highlight/src/c_lib.rs | 248 ++++++++++++++++++++++ highlight/src/lib.rs | 14 +- 6 files changed, 463 insertions(+), 5 deletions(-) create mode 100644 highlight/include/tree_sitter/highlight.h create mode 100644 highlight/src/c_lib.rs diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index e7ba2e55..4389797e 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -21,12 +21,16 @@ pub fn get_language(name: &str) -> Language { .unwrap() } -pub fn get_property_sheet(language_name: &str, sheet_name: &str) -> PropertySheet { +pub fn get_property_sheet_json(language_name: &str, sheet_name: &str) -> String { let path = GRAMMARS_DIR .join(language_name) .join("src") .join(sheet_name); - let json = fs::read_to_string(path).unwrap(); + fs::read_to_string(path).unwrap() +} + +pub fn get_property_sheet(language_name: &str, sheet_name: &str) -> PropertySheet { + let json = get_property_sheet_json(language_name, sheet_name); let language = get_language(language_name); load_property_sheet(language, &json).unwrap() } diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs index accca617..2c70f8cc 100644 --- a/cli/src/tests/highlight_test.rs +++ b/cli/src/tests/highlight_test.rs @@ -1,7 +1,9 @@ -use super::helpers::fixtures::{get_language, get_property_sheet}; +use super::helpers::fixtures::{get_language, get_property_sheet, get_property_sheet_json}; use lazy_static::lazy_static; +use std::ffi::CString; +use std::{ptr, slice, str}; use tree_sitter::{Language, PropertySheet}; -use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; +use tree_sitter_highlight::{c, highlight, highlight_html, HighlightEvent, Properties, Scope}; lazy_static! { static ref JS_SHEET: PropertySheet = @@ -153,6 +155,93 @@ fn test_highlighting_empty_lines() { ); } +#[test] +fn test_highlighting_via_c_api() { + let js_lang = get_language("javascript"); + let html_lang = get_language("html"); + let js_sheet = get_property_sheet_json("javascript", "highlights.json"); + let js_sheet = c_string(&js_sheet); + let html_sheet = get_property_sheet_json("html", "highlights.json"); + let html_sheet = c_string(&html_sheet); + + let class_tag = c_string("class=tag"); + let class_function = c_string("class=function"); + let class_string = c_string("class=string"); + let class_keyword = c_string("class=keyword"); + + let js_scope_name = c_string("source.js"); + let html_scope_name = c_string("text.html.basic"); + let injection_regex = c_string("^(javascript|js)$"); + let source_code = c_string(""); + + let attribute_strings = &mut [ptr::null(); Scope::Unknown as usize + 1]; + attribute_strings[Scope::Tag as usize] = class_tag.as_ptr(); + attribute_strings[Scope::String as usize] = class_string.as_ptr(); + attribute_strings[Scope::Keyword as usize] = class_keyword.as_ptr(); + attribute_strings[Scope::Function as usize] = class_function.as_ptr(); + + let highlighter = c::ts_highlighter_new(attribute_strings.as_ptr()); + let buffer = c::ts_highlight_buffer_new(); + + c::ts_highlighter_add_language( + highlighter, + html_scope_name.as_ptr(), + html_lang, + html_sheet.as_ptr(), + ptr::null_mut(), + ); + c::ts_highlighter_add_language( + highlighter, + js_scope_name.as_ptr(), + js_lang, + js_sheet.as_ptr(), + injection_regex.as_ptr(), + ); + c::ts_highlighter_highlight( + highlighter, + html_scope_name.as_ptr(), + source_code.as_ptr(), + source_code.as_bytes().len() as u32, + buffer, + ); + + let output_bytes = c::ts_highlight_buffer_content(buffer); + let output_line_offsets = c::ts_highlight_buffer_line_offsets(buffer); + let output_len = c::ts_highlight_buffer_len(buffer); + let output_line_count = c::ts_highlight_buffer_line_count(buffer); + + let output_bytes = unsafe { slice::from_raw_parts(output_bytes, output_len as usize) }; + let output_line_offsets = + unsafe { slice::from_raw_parts(output_line_offsets, output_line_count as usize) }; + + let mut lines = Vec::new(); + for i in 0..(output_line_count as usize) { + let line_start = output_line_offsets[i] as usize; + let line_end = output_line_offsets + .get(i + 1) + .map(|x| *x as usize) + .unwrap_or(output_bytes.len()); + lines.push(str::from_utf8(&output_bytes[line_start..line_end]).unwrap()); + } + + assert_eq!( + lines, + vec![ + "<script>", + "const a = b('c');", + "c.d();", + "</script>", + ] + ); + + c::ts_highlighter_delete(highlighter); + c::ts_highlight_buffer_delete(buffer); +} + +fn c_string(s: &str) -> CString { + CString::new(s.as_bytes().to_vec()).unwrap() +} + fn test_language_for_injection_string<'a>( string: &str, ) -> Option<(Language, &'a PropertySheet)> { diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index 688a2f6c..cf807d9f 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -12,6 +12,9 @@ edition = "2018" keywords = ["incremental", "parsing", "syntax", "highlighting"] categories = ["parsing", "text-editors"] +[lib] +crate-type = ["lib", "staticlib"] + [dependencies] regex = "1" serde = "1.0" diff --git a/highlight/include/tree_sitter/highlight.h b/highlight/include/tree_sitter/highlight.h new file mode 100644 index 00000000..dd2f99c3 --- /dev/null +++ b/highlight/include/tree_sitter/highlight.h @@ -0,0 +1,102 @@ +#ifndef TREE_SITTER_HIGHLIGHT_H_ +#define TREE_SITTER_HIGHLIGHT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef enum { + TSHighlightOk, + TSHighlightUnknownScope, +} TSHighlightError; + +// The list of scopes which can be styled for syntax highlighting. +// When constructing a `TSHighlighter`, you need to construct an +// `attribute_strings` array whose elements correspond to these values. +enum TSHighlightScope { + TSHighlightScopeAttribute, + TSHighlightScopeComment, + TSHighlightScopeConstant, + TSHighlightScopeConstantBuiltin, + TSHighlightScopeConstructor, + TSHighlightScopeConstructorBuiltin, + TSHighlightScopeEmbedded, + TSHighlightScopeEscape, + TSHighlightScopeFunction, + TSHighlightScopeFunctionBuiltin, + TSHighlightScopeKeyword, + TSHighlightScopeNumber, + TSHighlightScopeOperator, + TSHighlightScopeProperty, + TSHighlightScopePropertyBuiltin, + TSHighlightScopePunctuation, + TSHighlightScopePunctuationBracket, + TSHighlightScopePunctuationDelimiter, + TSHighlightScopePunctuationSpecial, + TSHighlightScopeString, + TSHighlightScopeStringSpecial, + TSHighlightScopeTag, + TSHighlightScopeType, + TSHighlightScopeTypeBuiltin, + TSHighlightScopeVariable, + TSHighlightScopeVariableBuiltin, + TSHighlightScopeUnknown, +}; + +typedef struct TSHighlighter TSHighlighter; +typedef struct TSHighlightBuffer TSHighlightBuffer; + +// Construct a `TSHighlighter` by providing a list of strings containing +// the HTML attributes that should be applied for each highlight scope. +TSHighlighter *ts_highlighter_new( + const char **attribute_strings +); + +// Delete a syntax highlighter. +void ts_highlighter_delete(TSHighlighter *); + +// Add a `TSLanguage` to a highlighter. The language is associated with a +// scope name, which can be used later to select a language for syntax +// highlighting. Along with the language, you must provide a JSON string +// containing the compiled PropertySheet to use for syntax highlighting +// with that language. You can also optionally provide an 'injection regex', +// which is used to detect when this language has been embedded in a document +// written in a different language. +int ts_highlighter_add_language( + TSHighlighter *self, + const char *scope_name, + const TSLanguage *language, + const char *property_sheet_json, + const char *injection_regex +); + +// Compute syntax highlighting for a given document. You must first +// create a `TSHighlightBuffer` to hold the output. +int ts_highlighter_highlight( + TSHighlighter *self, + const char *scope_name, + const char *source_code, + uint32_t source_code_len, + TSHighlightBuffer *output +); + +// TSHighlightBuffer: This struct stores the HTML output of syntax +// highlighting. It can be reused for multiple highlighting calls. +TSHighlightBuffer *ts_highlight_buffer_new(); + +// Delete a highlight buffer. +void ts_highlight_buffer_delete(TSHighlightBuffer *); + +// Access the HTML content of a highlight buffer. +const uint8_t *ts_highlight_buffer_content(const TSHighlightBuffer *); +const uint32_t *ts_highlight_buffer_line_offsets(const TSHighlightBuffer *); +uint32_t ts_highlight_buffer_len(const TSHighlightBuffer *); +uint32_t ts_highlight_buffer_line_count(const TSHighlightBuffer *); + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_HIGHLIGHT_H_ diff --git a/highlight/src/c_lib.rs b/highlight/src/c_lib.rs new file mode 100644 index 00000000..f6b01984 --- /dev/null +++ b/highlight/src/c_lib.rs @@ -0,0 +1,248 @@ +use super::{escape, load_property_sheet, HighlightEvent, Highlighter, Properties, Scope}; +use regex::Regex; +use std::collections::HashMap; +use std::ffi::CStr; +use std::io::Write; +use std::os::raw::c_char; +use std::process::abort; +use std::{fmt, slice}; +use tree_sitter::{Language, PropertySheet}; + +struct LanguageConfiguration { + language: Language, + property_sheet: PropertySheet, + injection_regex: Option, +} + +pub struct TSHighlighter { + languages: HashMap, + attribute_strings: Vec<&'static [u8]>, +} + +pub struct TSHighlightBuffer { + html: Vec, + line_offsets: Vec, +} + +#[repr(C)] +pub enum ErrorCode { + Ok, + UnknownScope, +} + +#[no_mangle] +pub extern "C" fn ts_highlighter_new(attribute_strings: *const *const c_char) -> *mut TSHighlighter { + let attribute_strings = + unsafe { slice::from_raw_parts(attribute_strings, Scope::Unknown as usize + 1) }; + let attribute_strings = attribute_strings + .into_iter() + .map(|s| { + if s.is_null() { + &[] + } else { + unsafe { CStr::from_ptr(*s).to_bytes() } + } + }) + .collect(); + Box::into_raw(Box::new(TSHighlighter { + languages: HashMap::new(), + attribute_strings, + })) +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_new() -> *mut TSHighlightBuffer { + Box::into_raw(Box::new(TSHighlightBuffer { + html: Vec::new(), + line_offsets: Vec::new(), + })) +} + +#[no_mangle] +pub extern "C" fn ts_highlighter_delete(this: *mut TSHighlighter) { + drop(unsafe { Box::from_raw(this) }) +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_delete(this: *mut TSHighlightBuffer) { + drop(unsafe { Box::from_raw(this) }) +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_content(this: *mut TSHighlightBuffer) -> *const u8 { + let this = unwrap_ptr(this); + this.html.as_slice().as_ptr() +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_line_offsets(this: *mut TSHighlightBuffer) -> *const u32 { + let this = unwrap_ptr(this); + this.line_offsets.as_slice().as_ptr() +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_len(this: *mut TSHighlightBuffer) -> u32 { + let this = unwrap_ptr(this); + this.html.len() as u32 +} + +#[no_mangle] +pub extern "C" fn ts_highlight_buffer_line_count(this: *mut TSHighlightBuffer) -> u32 { + let this = unwrap_ptr(this); + this.line_offsets.len() as u32 +} + +#[no_mangle] +pub extern "C" fn ts_highlighter_add_language( + this: *mut TSHighlighter, + scope_name: *const c_char, + language: Language, + property_sheet_json: *const c_char, + injection_regex: *const c_char, +) -> ErrorCode { + let this = unwrap_ptr(this); + let scope_name = unsafe { CStr::from_ptr(scope_name) }; + let scope_name = unwrap(scope_name.to_str()).to_string(); + let property_sheet_json = unsafe { CStr::from_ptr(property_sheet_json) }; + let property_sheet_json = unwrap(property_sheet_json.to_str()); + + let property_sheet = unwrap(load_property_sheet(language, property_sheet_json)); + let injection_regex = if injection_regex.is_null() { + None + } else { + let pattern = unsafe { CStr::from_ptr(injection_regex) }; + Some(unwrap(Regex::new(unwrap(pattern.to_str())))) + }; + + this.languages.insert( + scope_name, + LanguageConfiguration { + language, + property_sheet, + injection_regex, + }, + ); + + ErrorCode::Ok +} + +#[no_mangle] +pub extern "C" fn ts_highlighter_highlight( + this: *mut TSHighlighter, + scope_name: *const c_char, + source_code: *const c_char, + source_code_len: u32, + output: *mut TSHighlightBuffer, +) -> ErrorCode { + let this = unwrap_ptr(this); + let output = unwrap_ptr(output); + let scope_name = unwrap(unsafe { CStr::from_ptr(scope_name).to_str() }); + let source_code = + unsafe { slice::from_raw_parts(source_code as *const u8, source_code_len as usize) }; + this.highlight(source_code, scope_name, output) +} + +impl TSHighlighter { + fn highlight( + &mut self, + source_code: &[u8], + scope_name: &str, + output: &mut TSHighlightBuffer, + ) -> ErrorCode { + let configuration = self.languages.get(scope_name); + if configuration.is_none() { + return ErrorCode::UnknownScope; + } + let configuration = configuration.unwrap(); + let languages = &self.languages; + + let highlighter = unwrap(Highlighter::new( + source_code, + configuration.language, + &configuration.property_sheet, + |injection_string| { + languages.values().find_map(|conf| { + conf.injection_regex.as_ref().and_then(|regex| { + if regex.is_match(injection_string) { + Some((conf.language, &conf.property_sheet)) + } else { + None + } + }) + }) + }, + )); + + output.html.clear(); + output.line_offsets.clear(); + output.line_offsets.push(0); + let mut scopes = Vec::new(); + for event in highlighter { + match event { + HighlightEvent::ScopeStart(s) => { + scopes.push(s); + output.start_scope(s, &self.attribute_strings); + } + HighlightEvent::ScopeEnd => { + scopes.pop(); + output.end_scope(); + } + HighlightEvent::Source(src) => { + output.add_text(src, &scopes, &self.attribute_strings); + } + }; + } + + ErrorCode::Ok + } +} + +impl TSHighlightBuffer { + fn start_scope(&mut self, s: Scope, attribute_strings: &[&[u8]]) { + let attribute_string = attribute_strings[s as usize]; + self.html.extend(b""); + } + + fn end_scope(&mut self) { + self.html.extend(b""); + } + + fn finish_line(&mut self) { + self.line_offsets.push(self.html.len() as u32); + } + + fn add_text(&mut self, src: &str, scopes: &Vec, attribute_strings: &[&[u8]]) { + let mut multiline = false; + for line in src.split('\n') { + let line = line.trim_end_matches('\r'); + if multiline { + scopes.iter().for_each(|_| self.end_scope()); + self.finish_line(); + scopes + .iter() + .for_each(|scope| self.start_scope(*scope, attribute_strings)); + } + write!(&mut self.html, "{}", escape::Escape(line)).unwrap(); + multiline = true; + } + } +} + +fn unwrap_ptr<'a, T>(result: *mut T) -> &'a mut T { + unsafe { result.as_mut() }.unwrap_or_else(|| { + eprintln!("{}:{} - pointer must not be null", file!(), line!()); + abort(); + }) +} + +fn unwrap(result: Result) -> T { + result.unwrap_or_else(|error| { + eprintln!("tree-sitter highlight error: {}", error); + abort(); + }) +} diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index e5499fbc..25d8d59f 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -1,9 +1,11 @@ +pub mod c_lib; mod escape; +pub use c_lib as c; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_derive::*; use std::cmp; -use std::fmt::Write; +use std::fmt::{self, Write}; use std::mem::transmute; use std::str; use std::usize; @@ -151,6 +153,16 @@ pub enum PropertySheetError { InvalidFormat(String), } +impl fmt::Display for PropertySheetError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + PropertySheetError::InvalidJSON(e) => e.fmt(f), + PropertySheetError::InvalidRegex(e) => e.fmt(f), + PropertySheetError::InvalidFormat(e) => e.fmt(f), + } + } +} + pub fn load_property_sheet( language: Language, json: &str,