diff --git a/Cargo.lock b/Cargo.lock index 99ae53a8..52e2c3c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -210,6 +210,14 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "lock_api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "log" version = "0.4.6" @@ -263,6 +271,35 @@ name = "num-traits" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "once_cell" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "parking_lot 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "proc-macro2" version = "0.4.24" @@ -418,6 +455,14 @@ dependencies = [ "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "remove_dir_all" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 
(registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rsass" version = "0.9.8" @@ -502,6 +547,14 @@ name = "smallbitvec" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "smallvec" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "spin" version = "0.5.0" @@ -533,6 +586,19 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tempfile" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "termion" version = "1.5.1" @@ -561,7 +627,7 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.3.8" +version = "0.3.9" dependencies = [ "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -572,7 +638,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.14.4" +version = "0.14.5" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", @@ -583,6 +649,7 @@ dependencies = [ "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 
(registry+https://github.com/rust-lang/crates.io-index)", + "once_cell 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", @@ -592,7 +659,20 @@ dependencies = [ "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tree-sitter 0.3.8", + "tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.9", + "tree-sitter-highlight 0.1.4", +] + +[[package]] +name = "tree-sitter-highlight" +version = "0.1.4" +dependencies = [ + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.9", ] [[package]] @@ -610,6 +690,14 @@ name = "unicode-xid" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "utf8-ranges" version = "1.0.2" @@ -625,6 +713,11 @@ name = "version_check" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.6" @@ -673,6 +766,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum lazy_static 1.2.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" "checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" "checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" +"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" "checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" @@ -680,6 +774,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" "checksum num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" +"checksum once_cell 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "532c29a261168a45ce28948f9537ddd7a5dd272cc513b3017b1e82a88f962c37" +"checksum parking_lot 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ab41b4aed082705d1056416ae4468b6ea99d52599ecf3169b00088d43113e337" +"checksum parking_lot_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"94c8c7923936b28d546dfd14d4472eaf34c99b14e1c973a32b3e6d4eb04298c9" "checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" "checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" @@ -697,6 +794,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" "checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" "checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" "checksum rsass 0.9.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7f4534cc03040beacd2668621815f26fe57e5b7cfe085790f98e5e87c1612316" "checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" @@ -709,19 +807,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 
(registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" "checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" +"checksum smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)" = "88aea073965ab29f6edb5493faf96ad662fb18aa9eeb186a3b7057951605ed15" "checksum spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44363f6f51401c34e7be73db0db371c04705d35efbe9f7d6082e03a921a32c55" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" "checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" +"checksum tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b86c784c88d98c801132806dadd3819ed29d8600836c4088e855cdf3e178ed8a" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" "checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 7d43ac92..0ce9c75c 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-cli" description = "CLI tool for developing, testing, and using Tree-sitter parsers" -version = "0.14.4" +version = "0.14.5" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" @@ -27,6 +27,7 @@ clap = "2.32" dirs = "1.0.2" hashbrown = "0.1" libloading = "0.5" +once_cell = "0.1.8" serde = "1.0" serde_derive = "1.0" regex-syntax = "0.6.4" @@ -37,6 +38,10 @@ rsass = "^0.9.8" version = ">= 0.3.7" path = "../lib" 
+[dependencies.tree-sitter-highlight] +version = ">= 0.1.0" +path = "../highlight" + [dependencies.serde_json] version = "1.0" features = ["preserve_order"] @@ -48,3 +53,4 @@ features = ["std"] [dev-dependencies] rand = "0.6.4" spin = "0.5" +tempfile = "3" diff --git a/cli/benches/benchmark.rs b/cli/benches/benchmark.rs index 7e02ce50..340ef1be 100644 --- a/cli/benches/benchmark.rs +++ b/cli/benches/benchmark.rs @@ -61,11 +61,6 @@ fn main() { let mut all_error_speeds = Vec::new(); for (language_name, example_paths) in EXAMPLE_PATHS_BY_LANGUAGE_NAME.iter() { - // TODO - remove after fixing slow error parsing HTML. - if language_name == "html" { - continue; - } - if let Some(filter) = LANGUAGE_FILTER.as_ref() { if language_name != filter.as_str() { continue; diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json index 65076390..06ff40b3 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.4", + "version": "0.14.5", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index e463d6f4..862d89dd 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.4", + "version": "0.14.5", "author": "Max Brunsfeld", "license": "MIT", "repository": { diff --git a/cli/src/config.rs b/cli/src/config.rs new file mode 100644 index 00000000..1c9cc8f6 --- /dev/null +++ b/cli/src/config.rs @@ -0,0 +1,69 @@ +use super::highlight::Theme; +use serde_derive::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::{env, fs, io}; + +#[derive(Default, Deserialize, Serialize)] +pub struct Config { + #[serde(skip)] + pub binary_directory: PathBuf, + + #[serde(default)] + #[serde(rename = "parser-directories")] + pub parser_directories: Vec, + + #[serde(default)] + pub theme: Theme, +} + +impl Config { + pub fn get_path(home_dir: &Path) -> PathBuf { + env::var("TREE_SITTER_DIR") + .map(|p| p.into()) + 
.unwrap_or_else(|_| home_dir.join(".tree-sitter")) + } + + pub fn load(home_dir: &Path) -> Self { + let tree_sitter_dir = Self::get_path(home_dir); + let config_path = tree_sitter_dir.join("config.json"); + let mut result = fs::read_to_string(&config_path) + .map_err(drop) + .and_then(|json| serde_json::from_str(&json).map_err(drop)) + .unwrap_or_else(|_| Self::default()); + result.init(home_dir, &tree_sitter_dir); + result + } + + pub fn save(&self, home_dir: &Path) -> io::Result<()> { + let tree_sitter_dir = Self::get_path(home_dir); + let config_path = tree_sitter_dir.join("config.json"); + let json = serde_json::to_string_pretty(self).expect("Failed to serialize config"); + fs::write(config_path, json) + } + + pub fn new(home_dir: &Path) -> Self { + let tree_sitter_dir = Self::get_path(home_dir); + let mut result = Self::default(); + result.init(home_dir, &tree_sitter_dir); + result + } + + fn init(&mut self, home_dir: &Path, tree_sitter_dir: &Path) { + if self.parser_directories.is_empty() { + self.parser_directories = vec![ + home_dir.join("github"), + home_dir.join("src"), + home_dir.join("source"), + ] + } + + let binary_path = tree_sitter_dir.join("bin"); + self.binary_directory = binary_path; + fs::create_dir_all(&self.binary_directory).unwrap_or_else(|error| { + panic!( + "Could not find or create parser binary directory {:?}. 
Error: {}", + self.binary_directory, error + ) + }); + } +} diff --git a/cli/src/error.rs b/cli/src/error.rs index 4769b481..b0e52797 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -1,4 +1,5 @@ use std::io; +use tree_sitter_highlight::PropertySheetError; #[derive(Debug)] pub struct Error(pub String); @@ -42,3 +43,13 @@ impl From for Error { Error(error) } } + +impl From for Error { + fn from(error: PropertySheetError) -> Self { + match error { + PropertySheetError::InvalidFormat(e) => Self::from(e), + PropertySheetError::InvalidRegex(e) => Self::regex(&e.to_string()), + PropertySheetError::InvalidJSON(e) => Self::from(e), + } + } +} diff --git a/cli/src/generate/dsl.js b/cli/src/generate/dsl.js index db84274f..cf124258 100644 --- a/cli/src/generate/dsl.js +++ b/cli/src/generate/dsl.js @@ -1,5 +1,3 @@ -const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; - function alias(rule, value) { const result = { type: "ALIAS", @@ -180,12 +178,8 @@ function normalize(value) { }; case RegExp: return { - type: 'PATTERN', - value: value.source - .replace( - UNICODE_ESCAPE_PATTERN, - (match, group) => String.fromCharCode(parseInt(group, 16)) - ) + type: 'PATTERN', + value: value.source }; case ReferenceError: throw value diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs new file mode 100644 index 00000000..703c4053 --- /dev/null +++ b/cli/src/highlight.rs @@ -0,0 +1,373 @@ +use crate::error::Result; +use crate::loader::Loader; +use ansi_term::{Color, Style}; +use lazy_static::lazy_static; +use serde::ser::SerializeMap; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::{fmt, fs, io, path}; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; + +lazy_static! 
{ + static ref CSS_STYLES_BY_COLOR_ID: Vec = + serde_json::from_str(include_str!("../vendor/xterm-colors.json")).unwrap(); +} + +pub struct Theme { + ansi_styles: Vec>, + css_styles: Vec>, +} + +impl Theme { + pub fn load(path: &path::Path) -> io::Result { + let json = fs::read_to_string(path)?; + Ok(serde_json::from_str(&json).unwrap_or_default()) + } + + fn ansi_style(&self, scope: Scope) -> Option<&Style> { + self.ansi_styles[scope as usize].as_ref() + } + + fn css_style(&self, scope: Scope) -> Option<&str> { + self.css_styles[scope as usize].as_ref().map(|s| s.as_str()) + } +} + +impl<'de> Deserialize<'de> for Theme { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + let scope_count = Scope::Unknown as usize + 1; + let mut ansi_styles = vec![None; scope_count]; + let mut css_styles = vec![None; scope_count]; + if let Ok(colors) = HashMap::::deserialize(deserializer) { + for (scope, style_value) in colors { + let mut style = Style::default(); + parse_style(&mut style, style_value); + ansi_styles[scope as usize] = Some(style); + css_styles[scope as usize] = Some(style_to_css(style)); + } + } + Ok(Self { + ansi_styles, + css_styles, + }) + } +} + +impl Serialize for Theme { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + let entry_count = self.ansi_styles.iter().filter(|i| i.is_some()).count(); + let mut map = serializer.serialize_map(Some(entry_count))?; + for (i, style) in self.ansi_styles.iter().enumerate() { + let scope = Scope::from_usize(i).unwrap(); + if scope == Scope::Unknown { + break; + } + if let Some(style) = style { + let color = style.foreground.map(|color| match color { + Color::Black => json!("black"), + Color::Blue => json!("blue"), + Color::Cyan => json!("cyan"), + Color::Green => json!("green"), + Color::Purple => json!("purple"), + Color::Red => json!("red"), + Color::White => json!("white"), + Color::Yellow => json!("yellow"), + Color::RGB(r, g, b) => 
json!(format!("#{:x?}{:x?}{:x?}", r, g, b)), + Color::Fixed(n) => json!(n), + }); + if style.is_bold || style.is_italic || style.is_underline { + let mut entry = HashMap::new(); + if let Some(color) = color { + entry.insert("color", color); + } + if style.is_bold { + entry.insert("bold", Value::Bool(true)); + } + if style.is_italic { + entry.insert("italic", Value::Bool(true)); + } + if style.is_underline { + entry.insert("underline", Value::Bool(true)); + } + map.serialize_entry(&scope, &entry)?; + } else if let Some(color) = color { + map.serialize_entry(&scope, &color)?; + } else { + map.serialize_entry(&scope, &Value::Null)?; + } + } else { + map.serialize_entry(&scope, &Value::Null)?; + } + } + map.end() + } +} + +impl Default for Theme { + fn default() -> Self { + serde_json::from_str( + r#" + { + "attribute": {"color": 124, "italic": true}, + "comment": {"color": 245, "italic": true}, + "constant.builtin": {"color": 94, "bold": true}, + "constant": 94, + "constructor": 136, + "embedded": null, + "function.builtin": {"color": 26, "bold": true}, + "function": 26, + "keyword": 56, + "number": {"color": 94, "bold": true}, + "property": 124, + "operator": {"color": 239, "bold": true}, + "punctuation.bracket": 239, + "punctuation.delimiter": 239, + "string.special": 30, + "string": 28, + "tag": 18, + "type": 23, + "type.builtin": {"color": 23, "bold": true}, + "variable.builtin": {"bold": true} + } + "#, + ) + .unwrap() + } +} + +impl fmt::Debug for Theme { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + let mut first = true; + for (i, style) in self.ansi_styles.iter().enumerate() { + if let Some(style) = style { + let scope = Scope::from_usize(i).unwrap(); + if !first { + write!(f, ", ")?; + } + write!(f, "{:?}: {:?}", scope, style)?; + first = false; + } + } + write!(f, "}}")?; + Ok(()) + } +} + +fn parse_style(style: &mut Style, json: Value) { + if let Value::Object(entries) = json { + for (property_name, value) in entries { + 
match property_name.as_str() { + "bold" => *style = style.bold(), + "italic" => *style = style.italic(), + "underline" => *style = style.underline(), + "color" => { + if let Some(color) = parse_color(value) { + *style = style.fg(color); + } + } + _ => {} + } + } + } else if let Some(color) = parse_color(json) { + *style = style.fg(color); + } +} + +fn parse_color(json: Value) -> Option { + match json { + Value::Number(n) => match n.as_u64() { + Some(n) => Some(Color::Fixed(n as u8)), + _ => None, + }, + Value::String(s) => match s.to_lowercase().as_str() { + "black" => Some(Color::Black), + "blue" => Some(Color::Blue), + "cyan" => Some(Color::Cyan), + "green" => Some(Color::Green), + "purple" => Some(Color::Purple), + "red" => Some(Color::Red), + "white" => Some(Color::White), + "yellow" => Some(Color::Yellow), + s => { + if s.starts_with("#") && s.len() >= 7 { + if let (Ok(red), Ok(green), Ok(blue)) = ( + u8::from_str_radix(&s[1..3], 16), + u8::from_str_radix(&s[3..5], 16), + u8::from_str_radix(&s[5..7], 16), + ) { + Some(Color::RGB(red, green, blue)) + } else { + None + } + } else { + None + } + } + }, + _ => None, + } +} + +fn style_to_css(style: Style) -> String { + use std::fmt::Write; + let mut result = "style='".to_string(); + if style.is_bold { + write!(&mut result, "font-weight: bold;").unwrap(); + } + if style.is_italic { + write!(&mut result, "font-style: italic;").unwrap(); + } + if let Some(color) = style.foreground { + write!(&mut result, "color: {};", color_to_css(color)).unwrap(); + } + result.push('\''); + result +} + +fn color_to_css(color: Color) -> &'static str { + match color { + Color::Black => "black", + Color::Blue => "blue", + Color::Red => "red", + Color::Green => "green", + Color::Yellow => "yellow", + Color::Cyan => "cyan", + Color::Purple => "purple", + Color::White => "white", + Color::Fixed(n) => CSS_STYLES_BY_COLOR_ID[n as usize].as_str(), + _ => panic!("Unsupported color type"), + } +} + +pub fn ansi( + loader: &Loader, + theme: 
&Theme, + source: &[u8], + language: Language, + property_sheet: &PropertySheet, +) -> Result<()> { + use std::io::Write; + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + let mut scope_stack = Vec::new(); + for event in highlight(source, language, property_sheet, |s| { + language_for_injection_string(loader, s) + })? { + match event { + HighlightEvent::Source(s) => { + if let Some(style) = scope_stack.last().and_then(|s| theme.ansi_style(*s)) { + write!(&mut stdout, "{}", style.paint(s))?; + } else { + write!(&mut stdout, "{}", s)?; + } + } + HighlightEvent::ScopeStart(s) => { + scope_stack.push(s); + } + HighlightEvent::ScopeEnd => { + scope_stack.pop(); + } + } + } + Ok(()) +} + +pub const HTML_HEADER: &'static str = " + + + Tree-sitter Highlighting + + + +"; + +pub const HTML_FOOTER: &'static str = " + +"; + +pub fn html( + loader: &Loader, + theme: &Theme, + source: &[u8], + language: Language, + property_sheet: &PropertySheet, +) -> Result<()> { + use std::io::Write; + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + write!(&mut stdout, "\n")?; + let lines = highlight_html( + source, + language, + property_sheet, + |s| language_for_injection_string(loader, s), + |scope| { + if let Some(css_style) = theme.css_style(scope) { + css_style + } else { + "" + } + }, + )?; + for (i, line) in lines.into_iter().enumerate() { + write!( + &mut stdout, + "\n", + i + 1, + line + )?; + } + write!(&mut stdout, "
{}{}
\n")?; + Ok(()) +} + +fn language_for_injection_string<'a>( + loader: &'a Loader, + string: &str, +) -> Option<(Language, &'a PropertySheet)> { + match loader.language_configuration_for_injection_string(string) { + Err(message) => { + eprintln!( + "Failed to load language for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some((language, configuration))) => { + match configuration.highlight_property_sheet(language) { + Err(message) => { + eprintln!( + "Failed to load property sheet for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some(sheet)) => Some((language, sheet)), + } + } + } +} diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 3a15b457..19b82194 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -1,5 +1,7 @@ +pub mod config; pub mod error; pub mod generate; +pub mod highlight; pub mod loader; pub mod logger; pub mod parse; diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 5c2a19a7..b6e23a3a 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -1,5 +1,6 @@ use super::error::{Error, Result}; use libloading::{Library, Symbol}; +use once_cell::unsync::OnceCell; use regex::{Regex, RegexBuilder}; use serde_derive::Deserialize; use std::collections::HashMap; @@ -9,6 +10,7 @@ use std::process::Command; use std::time::SystemTime; use std::{fs, mem}; use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{load_property_sheet, Properties}; #[cfg(unix)] const DYLIB_EXTENSION: &'static str = "so"; @@ -20,16 +22,18 @@ const BUILD_TARGET: &'static str = env!("BUILD_TARGET"); struct LanguageRepo { path: PathBuf, - language: Option, + language: OnceCell, configurations: Vec, } pub struct LanguageConfiguration { - _name: String, + scope: Option, _content_regex: Option, _first_line_regex: Option, + injection_regex: Option, file_types: Vec, - _highlight_property_sheet: Option>, + highlight_property_sheet_path: Option, + highlight_property_sheet: OnceCell>>, } pub 
struct Loader { @@ -75,8 +79,23 @@ impl Loader { } } + pub fn language_configuration_for_scope( + &self, + scope: &str, + ) -> Result> { + for (i, repo) in self.language_repos.iter().enumerate() { + for configuration in &repo.configurations { + if configuration.scope.as_ref().map_or(false, |s| s == scope) { + let (language, _) = self.language_configuration_for_id(i)?; + return Ok(Some((language, &configuration))); + } + } + } + Ok(None) + } + pub fn language_configuration_for_file_name( - &mut self, + &self, path: &Path, ) -> Result> { let ids = path @@ -100,20 +119,43 @@ impl Loader { Ok(None) } + pub fn language_configuration_for_injection_string( + &self, + string: &str, + ) -> Result> { + let mut best_match_length = 0; + let mut best_match_position = None; + for (i, repo) in self.language_repos.iter().enumerate() { + for (j, configuration) in repo.configurations.iter().enumerate() { + if let Some(injection_regex) = &configuration.injection_regex { + if let Some(mat) = injection_regex.find(string) { + let length = mat.end() - mat.start(); + if length > best_match_length { + best_match_position = Some((i, j)); + best_match_length = length; + } + } + } + } + } + if let Some((i, j)) = best_match_position { + let (language, configurations) = self.language_configuration_for_id(i)?; + Ok(Some((language, &configurations[j]))) + } else { + Ok(None) + } + } + fn language_configuration_for_id( - &mut self, + &self, id: usize, ) -> Result<(Language, &Vec)> { let repo = &self.language_repos[id]; - let language = if let Some(language) = repo.language { - language - } else { + let language = repo.language.get_or_try_init(|| { let src_path = repo.path.join("src"); - let language = self.load_language_at_path(&src_path, &src_path)?; - self.language_repos[id].language = Some(language); - language - }; - Ok((language, &self.language_repos[id].configurations)) + self.load_language_at_path(&src_path, &src_path) + })?; + Ok((*language, &self.language_repos[id].configurations)) } pub 
fn load_language_at_path(&self, src_path: &Path, header_path: &Path) -> Result { @@ -191,7 +233,8 @@ impl Loader { .arg("-I") .arg(header_path) .arg("-o") - .arg(&library_path); + .arg(&library_path) + .arg("-O2"); if let Some(scanner_path) = scanner_path.as_ref() { if scanner_path.extension() == Some("c".as_ref()) { command.arg("-xc").arg("-std=c99").arg(scanner_path); @@ -231,13 +274,15 @@ impl Loader { fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> Result { #[derive(Deserialize)] struct LanguageConfigurationJSON { - name: String, + scope: Option, #[serde(rename = "file-types")] file_types: Option>, #[serde(rename = "content-regex")] content_regex: Option, #[serde(rename = "first-line-regex")] first_line_regex: Option, + #[serde(rename = "injection-regex")] + injection_regex: Option, highlights: Option, } @@ -255,7 +300,7 @@ impl Loader { configurations .into_iter() .map(|conf| LanguageConfiguration { - _name: conf.name, + scope: conf.scope, file_types: conf.file_types.unwrap_or(Vec::new()), _content_regex: conf .content_regex @@ -263,7 +308,11 @@ impl Loader { _first_line_regex: conf .first_line_regex .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), - _highlight_property_sheet: conf.highlights.map(|d| Err(d.into())), + injection_regex: conf + .injection_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + highlight_property_sheet_path: conf.highlights.map(|h| parser_path.join(h)), + highlight_property_sheet: OnceCell::new(), }) .collect() }); @@ -279,7 +328,7 @@ impl Loader { self.language_repos.push(LanguageRepo { path: parser_path.to_owned(), - language: None, + language: OnceCell::new(), configurations, }); @@ -287,6 +336,25 @@ impl Loader { } } +impl LanguageConfiguration { + pub fn highlight_property_sheet( + &self, + language: Language, + ) -> Result>> { + self.highlight_property_sheet + .get_or_try_init(|| { + if let Some(path) = &self.highlight_property_sheet_path { + let sheet_json = 
fs::read_to_string(path)?; + let sheet = load_property_sheet(language, &sheet_json)?; + Ok(Some(sheet)) + } else { + Ok(None) + } + }) + .map(Option::as_ref) + } +} + fn needs_recompile( lib_path: &Path, parser_c_path: &Path, diff --git a/cli/src/main.rs b/cli/src/main.rs index eb848831..3769efa0 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -4,8 +4,9 @@ use std::fs; use std::path::Path; use std::process::exit; use std::usize; -use tree_sitter_cli::loader::Loader; -use tree_sitter_cli::{error, generate, logger, parse, properties, test}; +use tree_sitter_cli::{ + config, error, generate, highlight, loader, logger, parse, properties, test, +}; fn main() { if let Err(e) = run() { @@ -25,6 +26,7 @@ fn run() -> error::Result<()> { .setting(AppSettings::SubcommandRequiredElseHelp) .author("Max Brunsfeld ") .about("Generates and tests parsers") + .subcommand(SubCommand::with_name("init-config").about("Generate a default config file")) .subcommand( SubCommand::with_name("generate") .about("Generate a parser") @@ -64,16 +66,29 @@ fn run() -> error::Result<()> { .arg(Arg::with_name("debug").long("debug").short("d")) .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), ) + .subcommand( + SubCommand::with_name("highlight") + .about("Highlight a file") + .arg( + Arg::with_name("path") + .index(1) + .multiple(true) + .required(true), + ) + .arg(Arg::with_name("scope").long("scope").takes_value(true)) + .arg(Arg::with_name("html").long("html").short("h")), + ) .get_matches(); - let home_dir = dirs::home_dir().unwrap(); + let home_dir = dirs::home_dir().expect("Failed to read home directory"); let current_dir = env::current_dir().unwrap(); - let config_dir = home_dir.join(".tree-sitter"); + let config = config::Config::load(&home_dir); + let mut loader = loader::Loader::new(config.binary_directory.clone()); - fs::create_dir_all(&config_dir).unwrap(); - let mut loader = Loader::new(config_dir); - - if let Some(matches) = matches.subcommand_matches("generate") 
{ + if matches.subcommand_matches("init-config").is_some() { + let config = config::Config::new(&home_dir); + config.save(&home_dir)?; + } else if let Some(matches) = matches.subcommand_matches("generate") { if matches.is_present("log") { logger::init(); } @@ -81,12 +96,14 @@ fn run() -> error::Result<()> { let grammar_path = matches.value_of("grammar-path"); let minimize = !matches.is_present("no-minimize"); let properties_only = matches.is_present("properties-only"); + let parser_only = grammar_path.is_some(); let state_ids_to_log = matches .values_of("state-ids-to-log") .map_or(Vec::new(), |ids| { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); + if !properties_only { generate::generate_parser_in_directory( ¤t_dir, @@ -95,7 +112,10 @@ fn run() -> error::Result<()> { state_ids_to_log, )?; } - properties::generate_property_sheets_in_directory(¤t_dir)?; + + if !parser_only { + properties::generate_property_sheets_in_directory(¤t_dir)?; + } } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); @@ -111,7 +131,7 @@ fn run() -> error::Result<()> { let debug_graph = matches.is_present("debug-graph"); let quiet = matches.is_present("quiet"); let time = matches.is_present("time"); - loader.find_all_languages(&vec![home_dir.join("github")])?; + loader.find_all_languages(&config.parser_directories)?; let paths = matches .values_of("path") .unwrap() @@ -144,6 +164,51 @@ fn run() -> error::Result<()> { if has_error { return Err(error::Error(String::new())); } + } else if let Some(matches) = matches.subcommand_matches("highlight") { + let paths = matches.values_of("path").unwrap().into_iter(); + let html_mode = matches.is_present("html"); + loader.find_all_languages(&config.parser_directories)?; + + if html_mode { + println!("{}", highlight::HTML_HEADER); + } + + let language_config; + if let Some(scope) = matches.value_of("scope") { + 
language_config = loader.language_configuration_for_scope(scope)?; + if language_config.is_none() { + return Err(error::Error(format!("Unknown scope '{}'", scope))); + } + } else { + language_config = None; + } + + for path in paths { + let path = Path::new(path); + let (language, language_config) = match language_config { + Some(v) => v, + None => match loader.language_configuration_for_file_name(path)? { + Some(v) => v, + None => { + eprintln!("No language found for path {:?}", path); + continue; + } + }, + }; + + if let Some(sheet) = language_config.highlight_property_sheet(language)? { + let source = fs::read(path)?; + if html_mode { + highlight::html(&loader, &config.theme, &source, language, sheet)?; + } else { + highlight::ansi(&loader, &config.theme, &source, language, sheet)?; + } + } else { + return Err(error::Error(format!( + "No syntax highlighting property sheet specified" + ))); + } + } } Ok(()) diff --git a/cli/src/parse.rs b/cli/src/parse.rs index f7961754..b17cd768 100644 --- a/cli/src/parse.rs +++ b/cli/src/parse.rs @@ -1,4 +1,4 @@ -use super::error::Result; +use super::error::{Error, Result}; use super::util; use std::fs; use std::io::{self, Write}; @@ -18,7 +18,8 @@ pub fn parse_file_at_path( let mut _log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; - let source_code = fs::read(path)?; + let source_code = fs::read(path) + .map_err(|e| Error(format!("Error reading source file {:?}: {}", path, e)))?; if debug_graph { _log_session = Some(util::log_graphs(&mut parser, "log.html")?); diff --git a/cli/src/properties.rs b/cli/src/properties.rs index 66cc5589..88cfa9ff 100644 --- a/cli/src/properties.rs +++ b/cli/src/properties.rs @@ -5,7 +5,7 @@ use rsass::sass::Value; use rsass::selectors::SelectorPart; use serde_derive::Serialize; use std::collections::hash_map::Entry; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{btree_map, BTreeMap, HashMap, VecDeque}; use std::fmt::{self, 
Write}; use std::fs::{self, File}; use std::io::BufWriter; @@ -15,12 +15,13 @@ use tree_sitter::{self, PropertyStateJSON, PropertyTransitionJSON}; #[derive(Clone, Debug, PartialEq, Eq, Serialize)] #[serde(untagged)] enum PropertyValue { + Number(isize), String(String), Object(PropertySet), Array(Vec), } -type PropertySet = HashMap; +type PropertySet = BTreeMap; type PropertySheetJSON = tree_sitter::PropertySheetJSON; type StateId = usize; type PropertySetId = usize; @@ -160,7 +161,7 @@ impl Builder { } fn populate_state(&mut self, item_set: ItemSet, state_id: StateId) { - let mut transition_map: HashSet<(PropertyTransitionJSON, u32)> = HashSet::new(); + let mut transitions: HashMap = HashMap::new(); let mut selector_matches = Vec::new(); // First, compute all of the possible state transition predicates for @@ -173,18 +174,21 @@ impl Builder { // If this item has more elements remaining in its selector, then // add a state transition based on the next step. if let Some(step) = next_step { - transition_map.insert(( - PropertyTransitionJSON { + transitions + .entry(PropertyTransitionJSON { kind: step.kind.clone(), field: step.field.clone(), named: step.is_named, index: step.child_index, text: step.text_pattern.clone(), state_id: 0, - }, - // Include the rule id so that it can be used when sorting transitions. - item.rule_id, - )); + }) + .and_modify(|rule_id| { + if item.rule_id > *rule_id { + *rule_id = item.rule_id; + } + }) + .or_insert(item.rule_id); } // If the item has matched its entire selector, then the item's // properties are applicable to this state. @@ -196,46 +200,11 @@ impl Builder { } } - // For eacy possible state transition, compute the set of items in that transition's - // destination state. 
- let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = transition_map - .into_iter() - .map(|(mut transition, rule_id)| { - let mut next_item_set = ItemSet::new(); - for item in &item_set { - let rule = &self.rules[item.rule_id as usize]; - let selector = &rule.selectors[item.selector_id as usize]; - let next_step = selector.0.get(item.step_id as usize); - - if let Some(step) = next_step { - // If the next step of the item's selector satisfies this transition, - // advance the item to the next part of its selector and add the - // resulting item to this transition's destination state. - if step_matches_transition(step, &transition) { - next_item_set.insert(Item { - rule_id: item.rule_id, - selector_id: item.selector_id, - step_id: item.step_id + 1, - }); - } - - // If the next step of the item is not an immediate child, then - // include this item in this transition's destination state, because - // the next step of the item might match a descendant node. - if !step.is_immediate { - next_item_set.insert(*item); - } - } - } - - transition.state_id = self.add_state(next_item_set); - (transition, rule_id) - }) - .collect(); - // Ensure that for a given node type, more specific transitions are tried // first, and in the event of a tie, transitions corresponding to later rules // in the cascade are tried first. + let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = + transitions.into_iter().collect(); transition_list.sort_by(|a, b| { (transition_specificity(&b.0).cmp(&transition_specificity(&a.0))) .then_with(|| b.1.cmp(&a.1)) @@ -244,6 +213,39 @@ impl Builder { .then_with(|| a.0.field.cmp(&b.0.field)) }); + // For eacy possible state transition, compute the set of items in that transition's + // destination state. 
+ for (transition, _) in transition_list.iter_mut() { + let mut next_item_set = ItemSet::new(); + for item in &item_set { + let rule = &self.rules[item.rule_id as usize]; + let selector = &rule.selectors[item.selector_id as usize]; + let next_step = selector.0.get(item.step_id as usize); + + if let Some(step) = next_step { + // If the next step of the item's selector satisfies this transition, + // advance the item to the next part of its selector and add the + // resulting item to this transition's destination state. + if step_matches_transition(step, &transition) { + next_item_set.insert(Item { + rule_id: item.rule_id, + selector_id: item.selector_id, + step_id: item.step_id + 1, + }); + } + + // If the next step of the item is not an immediate child, then + // include this item in this transition's destination state, because + // the next step of the item might match a descendant node. + if !step.is_immediate { + next_item_set.insert(*item); + } + } + } + + transition.state_id = self.add_state(next_item_set); + } + // Compute the merged properties that apply in the current state. // Sort the matching property sets by ascending specificity and by // their order in the sheet. This way, more specific selectors and later @@ -475,38 +477,9 @@ fn generate_property_sheet(path: impl AsRef, css: &str) -> Result Result> { - let mut i = 0; + let mut schema_paths = Vec::new(); let mut items = rsass::parse_scss_data(css.as_bytes())?; - while i < items.len() { - match &items[i] { - rsass::Item::Import(arg) => { - if let Some(s) = get_sass_string(arg) { - let import_path = resolve_path(path, s)?; - let imported_items = rsass::parse_scss_file(&import_path)?; - items.splice(i..(i + 1), imported_items); - continue; - } else { - return Err(Error("@import arguments must be strings".to_string())); - } - } - rsass::Item::AtRule { name, args, .. 
} => match name.as_str() { - "schema" => { - if let Some(s) = get_sass_string(args) { - // TODO - use schema - let _schema_path = resolve_path(path, s)?; - items.remove(i); - continue; - } else { - return Err(Error("@schema arguments must be strings".to_string())); - } - } - _ => return Err(Error(format!("Unsupported at-rule '{}'", name))), - }, - _ => {} - } - i += 1; - } - + process_at_rules(&mut items, &mut schema_paths, path)?; let mut result = Vec::new(); let selector_prefixes = vec![Vec::new()]; parse_sass_items(items, &selector_prefixes, &mut result)?; @@ -525,10 +498,10 @@ fn parse_sass_items( rsass::Item::Property(name, value) => { let value = parse_sass_value(&value)?; match properties.entry(name.to_string()) { - Entry::Vacant(v) => { + btree_map::Entry::Vacant(v) => { v.insert(value); } - Entry::Occupied(mut o) => { + btree_map::Entry::Occupied(mut o) => { let existing_value = o.get_mut(); if let PropertyValue::Array(items) = existing_value { items.push(value); @@ -693,6 +666,45 @@ fn parse_sass_items( Ok(()) } +fn process_at_rules( + items: &mut Vec, + schema_paths: &mut Vec, + path: &Path, +) -> Result<()> { + let mut i = 0; + while i < items.len() { + match &items[i] { + rsass::Item::Import(arg) => { + if let Some(s) = get_sass_string(arg) { + let import_path = resolve_path(path, s)?; + let mut imported_items = rsass::parse_scss_file(&import_path)?; + process_at_rules(&mut imported_items, schema_paths, &import_path)?; + items.splice(i..(i + 1), imported_items); + continue; + } else { + return Err(Error("@import arguments must be strings".to_string())); + } + } + rsass::Item::AtRule { name, args, .. 
} => match name.as_str() { + "schema" => { + if let Some(s) = get_sass_string(args) { + let schema_path = resolve_path(path, s)?; + schema_paths.push(schema_path); + items.remove(i); + continue; + } else { + return Err(Error("@schema arguments must be strings".to_string())); + } + } + _ => return Err(Error(format!("Unsupported at-rule '{}'", name))), + }, + _ => {} + } + i += 1; + } + Ok(()) +} + fn parse_sass_value(value: &Value) -> Result { match value { Value::Literal(s) => { @@ -724,7 +736,7 @@ fn parse_sass_value(value: &Value) -> Result { Ok(PropertyValue::Array(result)) } Value::Color(_, Some(name)) => Ok(PropertyValue::String(name.clone())), - Value::Numeric(n, _) => Ok(PropertyValue::String(format!("{}", n))), + Value::Numeric(n, _) => Ok(PropertyValue::Number(n.to_integer())), Value::True => Ok(PropertyValue::String("true".to_string())), Value::False => Ok(PropertyValue::String("false".to_string())), _ => Err(Error(format!( @@ -744,23 +756,22 @@ fn get_sass_string(value: &Value) -> Option<&str> { fn resolve_path(base: &Path, p: &str) -> Result { let path = Path::new(p); - let mut result = base.to_owned(); - result.pop(); + let mut base = base.to_owned(); + base.pop(); if path.starts_with(".") { - result.push(path); - if result.exists() { - return Ok(result); + base.push(path); + if base.exists() { + return Ok(base); } } else { loop { + let mut result = base.clone(); result.push("node_modules"); result.push(path); if result.exists() { return Ok(result); } - result.pop(); - result.pop(); - if !result.pop() { + if !base.pop() { break; } } @@ -795,9 +806,10 @@ fn interpolation_error() -> Error { mod tests { use super::*; use regex::Regex; + use tempfile::TempDir; #[test] - fn test_properties_immediate_child_and_descendant_selectors() { + fn test_property_sheet_with_immediate_child_and_descendant_selectors() { let sheet = generate_property_sheet( "foo.css", " @@ -829,71 +841,71 @@ mod tests { // f1 single-element selector assert_eq!( *query_simple(&sheet, 
vec!["f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f3", "f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); // f2 single-element selector assert_eq!( *query_simple(&sheet, vec!["f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f3", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f6", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); // f3 single-element selector assert_eq!( *query_simple(&sheet, vec!["f3"]), - props(&[("color", "violet"), ("height", "3")]) + props(&[("color", string("violet")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f3"]), - props(&[("color", "violet"), ("height", "3")]) + props(&[("color", string("violet")), ("height", num(3))]) ); // f2 child selector assert_eq!( *query_simple(&sheet, vec!["f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f3", "f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); // f3 descendant selector assert_eq!( *query_simple(&sheet, vec!["f1", "f3"]), - 
props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f2", "f3"]), - props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f6", "f7", "f8", "f3"]), - props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); // no match @@ -902,7 +914,7 @@ mod tests { } #[test] - fn test_properties_text_attribute() { + fn test_property_sheet_with_text_attribute() { let sheet = generate_property_sheet( "foo.css", " @@ -927,15 +939,15 @@ mod tests { assert_eq!( *query(&sheet, vec![("f1", None, true, 0)], "abc"), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query(&sheet, vec![("f1", None, true, 0)], "Abc"), - props(&[("color", "green")]) + props(&[("color", string("green"))]) ); assert_eq!( *query(&sheet, vec![("f1", None, true, 0)], "AB_CD"), - props(&[("color", "blue")]) + props(&[("color", string("blue"))]) ); assert_eq!( *query(&sheet, vec![("f2", None, true, 0)], "Abc"), @@ -943,12 +955,12 @@ mod tests { ); assert_eq!( *query(&sheet, vec![("f2", None, true, 0)], "ABC"), - props(&[("color", "purple")]) + props(&[("color", string("purple"))]) ); } #[test] - fn test_properties_with_fields() { + fn test_property_sheet_with_fields() { let sheet = generate_property_sheet( "foo.css", " @@ -971,11 +983,11 @@ mod tests { assert_eq!( *query(&sheet, vec![("a", None, true, 0)], ""), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query(&sheet, vec![("a", Some("x"), true, 0)], ""), - props(&[("color", "green")]) + props(&[("color", string("green"))]) ); assert_eq!( *query( @@ -983,7 +995,7 @@ mod tests { vec![("a", Some("x"), true, 0), ("b", None, true, 0)], "" ), - props(&[("color", "blue")]) + props(&[("color", string("blue"))]) ); assert_eq!( *query( @@ -991,15 +1003,15 @@ mod 
tests { vec![("a", Some("x"), true, 0), ("b", Some("y"), true, 0)], "" ), - props(&[("color", "yellow")]) + props(&[("color", string("yellow"))]) ); assert_eq!( *query(&sheet, vec![("b", Some("x"), true, 0)], ""), - props(&[("color", "violet")]) + props(&[("color", string("violet"))]) ); assert_eq!( *query(&sheet, vec![("a", None, true, 0), ("b", None, true, 0)], ""), - props(&[("color", "orange")]) + props(&[("color", string("orange"))]) ); assert_eq!( *query( @@ -1007,12 +1019,12 @@ mod tests { vec![("a", None, true, 0), ("b", Some("y"), true, 0)], "" ), - props(&[("color", "indigo")]) + props(&[("color", string("indigo"))]) ); } #[test] - fn test_properties_cascade_ordering_as_tie_breaker() { + fn test_property_sheet_with_cascade_ordering_as_tie_breaker() { let sheet = generate_property_sheet( "foo.css", " @@ -1038,7 +1050,7 @@ mod tests { vec![("f1", None, true, 0), ("f2", None, true, 1)], "x" ), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query( @@ -1046,7 +1058,7 @@ mod tests { vec![("f1", None, true, 1), ("f2", None, true, 1)], "x" ), - props(&[("color", "green")]) + props(&[("color", string("green"))]) ); assert_eq!( *query( @@ -1054,7 +1066,7 @@ mod tests { vec![("f1", None, true, 1), ("f2", None, true, 1)], "a" ), - props(&[("color", "blue")]) + props(&[("color", string("blue"))]) ); assert_eq!( *query( @@ -1062,12 +1074,12 @@ mod tests { vec![("f1", None, true, 1), ("f2", None, true, 1)], "ab" ), - props(&[("color", "violet")]) + props(&[("color", string("violet"))]) ); } #[test] - fn test_properties_css_function_calls() { + fn test_property_sheet_with_css_function_calls() { let sheet = generate_property_sheet( "foo.css", " @@ -1096,7 +1108,7 @@ mod tests { object(&[("name", string("g")), ("args", array(vec![string("h"),]))]), string("i"), string("j"), - string("10"), + num(10), ]) ), ]) @@ -1104,7 +1116,7 @@ mod tests { } #[test] - fn test_properties_array_by_declaring_property_multiple_times() { + fn 
test_property_sheet_with_array_by_declaring_property_multiple_times() { let sheet = generate_property_sheet( "foo.css", " @@ -1144,6 +1156,62 @@ mod tests { ); } + #[test] + fn test_property_sheet_with_imports() { + let repo_dir = TempDir::new().unwrap(); + let properties_dir = repo_dir.path().join("properties"); + let dependency_properties_dir = repo_dir + .path() + .join("node_modules") + .join("the-dependency") + .join("properties"); + fs::create_dir_all(&properties_dir).unwrap(); + fs::create_dir_all(&dependency_properties_dir).unwrap(); + let sheet_path1 = properties_dir.join("sheet1.css"); + let sheet_path2 = properties_dir.join("sheet2.css"); + let dependency_sheet_path1 = dependency_properties_dir.join("dependency-sheet1.css"); + let dependency_sheet_path2 = dependency_properties_dir.join("dependency-sheet2.css"); + + fs::write( + sheet_path2, + r#" + a { x: '1'; } + "#, + ) + .unwrap(); + fs::write( + dependency_sheet_path1, + r#" + @import "./dependency-sheet2.css"; + a { y: '2'; } + "#, + ) + .unwrap(); + fs::write( + dependency_sheet_path2, + r#" + b { x: '3'; } + "#, + ) + .unwrap(); + let sheet = generate_property_sheet( + sheet_path1, + r#" + @import "./sheet2.css"; + @import "the-dependency/properties/dependency-sheet1.css"; + b { y: '4'; } + "#, + ) + .unwrap(); + + let a = query_simple(&sheet, vec!["a"]); + assert_eq!(a["x"], string("1"),); + assert_eq!(a["y"], string("2"),); + let b = query_simple(&sheet, vec!["b"]); + assert_eq!(b["x"], string("3"),); + assert_eq!(b["y"], string("4"),); + } + fn query_simple<'a>( sheet: &'a PropertySheetJSON, node_stack: Vec<&'static str>, @@ -1197,9 +1265,13 @@ mod tests { PropertyValue::String(s.to_string()) } - fn props<'a>(s: &'a [(&'a str, &'a str)]) -> PropertySet { + fn num(n: isize) -> PropertyValue { + PropertyValue::Number(n) + } + + fn props<'a>(s: &'a [(&'a str, PropertyValue)]) -> PropertySet { s.into_iter() - .map(|(a, b)| (a.to_string(), PropertyValue::String(b.to_string()))) + .map(|(a, b)| 
(a.to_string(), b.clone())) .collect() } } diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index 8fc00038..e7ba2e55 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -2,7 +2,8 @@ use crate::loader::Loader; use lazy_static::lazy_static; use std::fs; use std::path::{Path, PathBuf}; -use tree_sitter::Language; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{load_property_sheet, Properties}; include!("./dirs.rs"); @@ -20,6 +21,16 @@ pub fn get_language(name: &str) -> Language { .unwrap() } +pub fn get_property_sheet(language_name: &str, sheet_name: &str) -> PropertySheet { + let path = GRAMMARS_DIR + .join(language_name) + .join("src") + .join(sheet_name); + let json = fs::read_to_string(path).unwrap(); + let language = get_language(language_name); + load_property_sheet(language, &json).unwrap() +} + pub fn get_test_language(name: &str, parser_code: &str, path: Option<&Path>) -> Language { let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", name)); if !fs::read_to_string(&parser_c_path) diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs new file mode 100644 index 00000000..accca617 --- /dev/null +++ b/cli/src/tests/highlight_test.rs @@ -0,0 +1,214 @@ +use super::helpers::fixtures::{get_language, get_property_sheet}; +use lazy_static::lazy_static; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; + +lazy_static! 
{ + static ref JS_SHEET: PropertySheet = + get_property_sheet("javascript", "highlights.json"); + static ref HTML_SHEET: PropertySheet = + get_property_sheet("html", "highlights.json"); + static ref SCOPE_CLASS_STRINGS: Vec = { + let mut result = Vec::new(); + let mut i = 0; + while let Some(scope) = Scope::from_usize(i) { + result.push(format!("class={:?}", scope)); + i += 1; + } + result + }; +} + +#[test] +fn test_highlighting_injected_html_in_javascript() { + let source = vec!["const s = html `
${a < b}
`;"].join("\n"); + + assert_eq!( + &to_token_vector(&source, get_language("javascript"), &JS_SHEET).unwrap(), + &[vec![ + ("const", vec![Scope::Keyword]), + (" ", vec![]), + ("s", vec![Scope::Variable]), + (" ", vec![]), + ("=", vec![Scope::Operator]), + (" ", vec![]), + ("html", vec![Scope::Function]), + (" ", vec![]), + ("`<", vec![Scope::String]), + ("div", vec![Scope::String, Scope::Tag]), + (">", vec![Scope::String]), + ( + "${", + vec![Scope::String, Scope::Embedded, Scope::PunctuationSpecial] + ), + ("a", vec![Scope::String, Scope::Embedded, Scope::Variable]), + (" ", vec![Scope::String, Scope::Embedded]), + ("<", vec![Scope::String, Scope::Embedded, Scope::Operator]), + (" ", vec![Scope::String, Scope::Embedded]), + ("b", vec![Scope::String, Scope::Embedded, Scope::Variable]), + ( + "}", + vec![Scope::String, Scope::Embedded, Scope::PunctuationSpecial] + ), + ("`", vec![Scope::String]), + (";", vec![Scope::PunctuationDelimiter]), + ]] + ); +} + +#[test] +fn test_highlighting_injected_javascript_in_html() { + let source = vec![ + "", + " ", + "", + ] + .join("\n"); + + assert_eq!( + &to_token_vector(&source, get_language("html"), &HTML_SHEET).unwrap(), + &[ + vec![("<", vec![]), ("body", vec![Scope::Tag]), (">", vec![]),], + vec![(" <", vec![]), ("script", vec![Scope::Tag]), (">", vec![]),], + vec![ + (" ", vec![]), + ("const", vec![Scope::Keyword]), + (" ", vec![]), + ("x", vec![Scope::Variable]), + (" ", vec![]), + ("=", vec![Scope::Operator]), + (" ", vec![]), + ("new", vec![Scope::Keyword]), + (" ", vec![]), + ("Thing", vec![Scope::Constructor]), + ("(", vec![Scope::PunctuationBracket]), + (")", vec![Scope::PunctuationBracket]), + (";", vec![Scope::PunctuationDelimiter]), + ], + vec![ + (" ", vec![]), + ], + vec![("", vec![]),], + ] + ); +} + +#[test] +fn test_highlighting_multiline_scopes_to_html() { + let source = vec![ + "const SOMETHING = `", + " one ${", + " two()", + " } three", + "`", + ] + .join("\n"); + + assert_eq!( + &to_html(&source, 
get_language("javascript"), &JS_SHEET,).unwrap(), + &[ + "const SOMETHING = `\n".to_string(), + " one ${\n".to_string(), + " two()\n".to_string(), + " } three\n".to_string(), + "`\n".to_string(), + ] + ); +} + +#[test] +fn test_highlighting_empty_lines() { + let source = vec![ + "class A {", + "", + " b(c) {", + "", + " d(e)", + "", + " }", + "", + "}", + ] + .join("\n"); + + assert_eq!( + &to_html(&source, get_language("javascript"), &JS_SHEET,).unwrap(), + &[ + "class A {\n".to_string(), + "\n".to_string(), + " b(c) {\n".to_string(), + "\n".to_string(), + " d(e)\n".to_string(), + "\n".to_string(), + " }\n".to_string(), + "\n".to_string(), + "}\n".to_string(), + ] + ); +} + +fn test_language_for_injection_string<'a>( + string: &str, +) -> Option<(Language, &'a PropertySheet)> { + match string { + "javascript" => Some((get_language("javascript"), &JS_SHEET)), + "html" => Some((get_language("html"), &HTML_SHEET)), + _ => None, + } +} + +fn to_html<'a>( + src: &'a str, + language: Language, + property_sheet: &'a PropertySheet, +) -> Result, String> { + highlight_html( + src.as_bytes(), + language, + property_sheet, + &test_language_for_injection_string, + &|scope| SCOPE_CLASS_STRINGS[scope as usize].as_str(), + ) +} + +fn to_token_vector<'a>( + src: &'a str, + language: Language, + property_sheet: &'a PropertySheet, +) -> Result)>>, String> { + let mut lines = Vec::new(); + let mut scopes = Vec::new(); + let mut line = Vec::new(); + for event in highlight( + src.as_bytes(), + language, + property_sheet, + &test_language_for_injection_string, + )? 
{ + match event { + HighlightEvent::ScopeStart(s) => scopes.push(s), + HighlightEvent::ScopeEnd => { + scopes.pop(); + } + HighlightEvent::Source(s) => { + for (i, l) in s.lines().enumerate() { + if i > 0 { + lines.push(line); + line = Vec::new(); + } + if l.len() > 0 { + line.push((l, scopes.clone())); + } + } + } + } + } + lines.push(line); + Ok(lines) +} diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index af2b4582..143e8297 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,5 +1,6 @@ mod corpus_test; mod helpers; +mod highlight_test; mod node_test; mod parser_test; mod properties_test; diff --git a/cli/vendor/xterm-colors.json b/cli/vendor/xterm-colors.json new file mode 100644 index 00000000..47994496 --- /dev/null +++ b/cli/vendor/xterm-colors.json @@ -0,0 +1,258 @@ +[ + "#000000", + "#800000", + "#008000", + "#808000", + "#000080", + "#800080", + "#008080", + "#c0c0c0", + "#808080", + "#ff0000", + "#00ff00", + "#ffff00", + "#0000ff", + "#ff00ff", + "#00ffff", + "#ffffff", + "#000000", + "#00005f", + "#000087", + "#0000af", + "#0000d7", + "#0000ff", + "#005f00", + "#005f5f", + "#005f87", + "#005faf", + "#005fd7", + "#005fff", + "#008700", + "#00875f", + "#008787", + "#0087af", + "#0087d7", + "#0087ff", + "#00af00", + "#00af5f", + "#00af87", + "#00afaf", + "#00afd7", + "#00afff", + "#00d700", + "#00d75f", + "#00d787", + "#00d7af", + "#00d7d7", + "#00d7ff", + "#00ff00", + "#00ff5f", + "#00ff87", + "#00ffaf", + "#00ffd7", + "#00ffff", + "#5f0000", + "#5f005f", + "#5f0087", + "#5f00af", + "#5f00d7", + "#5f00ff", + "#5f5f00", + "#5f5f5f", + "#5f5f87", + "#5f5faf", + "#5f5fd7", + "#5f5fff", + "#5f8700", + "#5f875f", + "#5f8787", + "#5f87af", + "#5f87d7", + "#5f87ff", + "#5faf00", + "#5faf5f", + "#5faf87", + "#5fafaf", + "#5fafd7", + "#5fafff", + "#5fd700", + "#5fd75f", + "#5fd787", + "#5fd7af", + "#5fd7d7", + "#5fd7ff", + "#5fff00", + "#5fff5f", + "#5fff87", + "#5fffaf", + "#5fffd7", + "#5fffff", + "#870000", + "#87005f", + "#870087", + 
"#8700af", + "#8700d7", + "#8700ff", + "#875f00", + "#875f5f", + "#875f87", + "#875faf", + "#875fd7", + "#875fff", + "#878700", + "#87875f", + "#878787", + "#8787af", + "#8787d7", + "#8787ff", + "#87af00", + "#87af5f", + "#87af87", + "#87afaf", + "#87afd7", + "#87afff", + "#87d700", + "#87d75f", + "#87d787", + "#87d7af", + "#87d7d7", + "#87d7ff", + "#87ff00", + "#87ff5f", + "#87ff87", + "#87ffaf", + "#87ffd7", + "#87ffff", + "#af0000", + "#af005f", + "#af0087", + "#af00af", + "#af00d7", + "#af00ff", + "#af5f00", + "#af5f5f", + "#af5f87", + "#af5faf", + "#af5fd7", + "#af5fff", + "#af8700", + "#af875f", + "#af8787", + "#af87af", + "#af87d7", + "#af87ff", + "#afaf00", + "#afaf5f", + "#afaf87", + "#afafaf", + "#afafd7", + "#afafff", + "#afd700", + "#afd75f", + "#afd787", + "#afd7af", + "#afd7d7", + "#afd7ff", + "#afff00", + "#afff5f", + "#afff87", + "#afffaf", + "#afffd7", + "#afffff", + "#d70000", + "#d7005f", + "#d70087", + "#d700af", + "#d700d7", + "#d700ff", + "#d75f00", + "#d75f5f", + "#d75f87", + "#d75faf", + "#d75fd7", + "#d75fff", + "#d78700", + "#d7875f", + "#d78787", + "#d787af", + "#d787d7", + "#d787ff", + "#d7af00", + "#d7af5f", + "#d7af87", + "#d7afaf", + "#d7afd7", + "#d7afff", + "#d7d700", + "#d7d75f", + "#d7d787", + "#d7d7af", + "#d7d7d7", + "#d7d7ff", + "#d7ff00", + "#d7ff5f", + "#d7ff87", + "#d7ffaf", + "#d7ffd7", + "#d7ffff", + "#ff0000", + "#ff005f", + "#ff0087", + "#ff00af", + "#ff00d7", + "#ff00ff", + "#ff5f00", + "#ff5f5f", + "#ff5f87", + "#ff5faf", + "#ff5fd7", + "#ff5fff", + "#ff8700", + "#ff875f", + "#ff8787", + "#ff87af", + "#ff87d7", + "#ff87ff", + "#ffaf00", + "#ffaf5f", + "#ffaf87", + "#ffafaf", + "#ffafd7", + "#ffafff", + "#ffd700", + "#ffd75f", + "#ffd787", + "#ffd7af", + "#ffd7d7", + "#ffd7ff", + "#ffff00", + "#ffff5f", + "#ffff87", + "#ffffaf", + "#ffffd7", + "#ffffff", + "#080808", + "#121212", + "#1c1c1c", + "#262626", + "#303030", + "#3a3a3a", + "#444444", + "#4e4e4e", + "#585858", + "#626262", + "#6c6c6c", + "#767676", + "#808080", 
+ "#8a8a8a", + "#949494", + "#9e9e9e", + "#a8a8a8", + "#b2b2b2", + "#bcbcbc", + "#c6c6c6", + "#d0d0d0", + "#dadada", + "#e4e4e4", + "#eeeeee" +] diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 4df802c4..3085633f 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -5,38 +5,38 @@ permalink: using-parsers # Using Parsers -All of Tree-sitter's parsing functionality is exposed through C APIs. Applications written in higher-level languages can use Tree-sitter via binding libraries like [node-tree-sitter](https://github.com/tree-sitter/node-tree-sitter) or [rust-tree-sitter](https://github.com/tree-sitter/rust-tree-sitter), which have their own documentation. +All of Tree-sitter's parsing functionality is exposed through C APIs. Applications written in higher-level languages can use Tree-sitter via binding libraries like [node-tree-sitter](https://github.com/tree-sitter/node-tree-sitter) or [rust-tree-sitter](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding), which have their own documentation. This document will describes the general concepts of how to use Tree-sitter, which should be relevant regardless of what language you're using. It also goes into some C-specific details that are useful if you're using the C API directly or are building a new binding to a different language. -## Building the Runtime Library +## Building the Library -Building the runtime library requires one git submodule: [`utf8proc`](https://github.com/JuliaStrings/utf8proc). Make sure that `utf8proc` is downloaded by running this command from the Tree-sitter directory: +Building the library requires one git submodule: [`utf8proc`](https://github.com/JuliaStrings/utf8proc). 
Make sure that `utf8proc` is downloaded by running this command from the Tree-sitter directory: ```sh git submodule update --init ``` -To build the runtime library on a POSIX system, run this script, which will create a static library called `libruntime.a` in the Tree-sitter folder: +To build the library on a POSIX system, run this script, which will create a static library called `libtree-sitter.a` in the Tree-sitter folder: ```sh -script/build-runtime +script/build-lib ``` -Alternatively, you can use the runtime library in a larger project by adding one source file to the project. This source file needs three directories to be in the include path when compiled: +Alternatively, you can use the library in a larger project by adding one source file to the project. This source file needs three directories to be in the include path when compiled: **source file:** -* `tree-sitter/src/runtime/runtime.c` +* `tree-sitter/lib/src/lib.c` **include directories:** -* `tree-sitter/src` -* `tree-sitter/include` -* `tree-sitter/externals/utf8proc` +* `tree-sitter/lib/src` +* `tree-sitter/lib/include` +* `tree-sitter/lib/utf8proc` ## The Objects There are four main types of objects involved when using Tree-sitter: languages, parsers, syntax trees, and syntax nodes. In C, these are called `TSParser`, `TSLanguage`, `TSTree`, and `TSNode`. -* A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. Many languages are already available in separate git repositories within the the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next section](/creating-parsers) for how to create new languages. +* A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. 
Many languages are already available in separate git repositories within the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next section](./creating-parsers) for how to create new languages. * A `TSParser` is a stateful object that can be assigned a `TSLanguage` and used to produce a `TSTree` based on some source code. * A `TSTree` represents the syntax tree of an entire source code file. Its contains `TSNode` instances that indicate the structure of the source code. It can also be edited and used to produce a new `TSTree` in the event that the source code changes. * A `TSNode` represents a single node in the syntax tree. It tracks its start and end positions in the source code, as well as its relation to other nodes like its parent, siblings and children. @@ -51,7 +51,7 @@ Here's an example of a simple C program that uses the Tree-sitter [JSON parser]( #include #include #include -#include +#include // Declare the `tree_sitter_json` function, which is // implemented by the `tree-sitter-json` library. @@ -103,14 +103,14 @@ int main() { } ``` -This program uses the Tree-sitter C API, which is declared in the header file `tree_sitter/runtime.h`, so we need to add the `tree_sitter/include` directory to the include path. We also need to link `libruntime.a` into the binary. We compile the source code of the JSON language directly into the binary as well. +This program uses the Tree-sitter C API, which is declared in the header file `tree_sitter/api.h`, so we need to add the `tree_sitter/include` directory to the include path. We also need to link `libtree-sitter.a` into the binary. We compile the source code of the JSON language directly into the binary as well. 
```sh clang \ -I tree-sitter/include \ test-json-parser.c \ tree-sitter-json/src/parser.c \ - tree-sitter/libruntime.a \ + tree-sitter/libtree-sitter.a \ -o test-json-parser ./test-json-parser @@ -303,7 +303,7 @@ Conceptually, it can be represented by three syntax trees with overlapping range ```c #include -#include +#include // These functions are each implemented in their own repo. const TSLanguage *tree_sitter_embedded_template(); diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index f332060c..f66cacdd 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -7,32 +7,56 @@ permalink: creating-parsers Developing Tree-sitter parsers can have a difficult learning curve, but once you get the hang of it, it can be fun and even zen-like. This document should help you to build an effective mental model for parser development. -## Understanding the problem +## Getting Started + +### Understanding the problem Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties: - 1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. +1. 
**An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. - 2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications. +2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications. It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth. 
-## Installing the tools +### Dependencies -The best way to create a Tree-sitter parser is with the [`Tree-sitter CLI`][tree-sitter-cli], which is distributed as [a Node.js module][node-module]. To install it, first install [`node`][node.js] and its package manager [`npm`][npm] on your system. Then use `npm` to create a new node module and add `tree-sitter-cli` and [`nan`][nan] as dependencies: +In order to develop a Tree-sitter parser, there are two dependencies that you need to install: + +* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have. +* **C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform. + +### Installation + +To create a Tree-sitter parser, you need to use the [the `tree-sitter` CLI][tree-sitter-cli]. You can install the CLI in a few different ways: + +* Install the `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. This is the recommended approach, and it is discussed further in the next section. +* Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`. +* Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager. + +### Setting up a Project + +The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language. 
```sh mkdir tree-sitter-${YOUR_LANGUAGE_NAME} cd tree-sitter-${YOUR_LANGUAGE_NAME} +``` +You should create a `package.json` file that describes your project, and allows your parser to be used from Node.js. + +```sh # This will prompt you for input npm init +# This installs a small module that lets your parser be used from Node npm install --save nan + +# This installs the Tree-sitter CLI itself npm install --save-dev tree-sitter-cli ``` -This will install the CLI and its dependencies into the `node_modules` folder in your directory. An executable program called `tree-sitter` will be created at the path `./node_modules/.bin/tree-sitter`. You may want to follow the Node.js convention of adding `./node_modules/.bin` to your `PATH` so that you can easily run this program when working in this directory. +The last command will install the CLI into the `node_modules` folder in your project. An executable program called `tree-sitter` will be created at the path `./node_modules/.bin/tree-sitter`. You may want to follow the Node.js convention of adding `./node_modules/.bin` to your `PATH` so that you can easily run this program when working in this directory. Once you have the CLI installed, create a file called `grammar.js` with the following skeleton: @@ -51,7 +75,6 @@ Then run the the following command: ```sh tree-sitter generate -npm install ``` This will generate the C code required to parse this trivial language, as well as all of the files needed to compile and load this native parser as a Node.js module. You can test this parser by creating a source file with the contents `hello;` and parsing it: @@ -60,14 +83,15 @@ This will generate the C code required to parse this trivial language, as well a tree-sitter parse ./the-file ``` -This should print: +This should print the following: + ``` (source_file [0, 0] - [0, 5]) ``` -When you make changes to the grammar, you can update the parser simply by re-running `tree-sitter generate`. 
The best way to recompile the C-code is to run the command `node-gyp build`. You may have to install the [`node-gyp`][node-gyp] tool separately by running `npm install -g node-gyp`. +You might notice that the first time you run `tree-sitter parse`, it takes a few seconds. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. Whenever you make changes to your grammar, you can update the parser simply by re-running `tree-sitter generate`. When the parser changes, Tree-sitter will recompile it as needed. -## Starting to define the grammar +## Writing the Grammar It's usually a good idea to find a formal specification for the language you're trying to parse. This specification will most likely contain a context-free grammar. As you read through the rules of this CFG, you will probably discover a complex and cyclic graph of relationships. It might be unclear how you should navigate this graph as you define your grammar. @@ -168,7 +192,7 @@ With this structure in place, you can now freely decide what part of the grammar After developing the *type* sublanguage a bit further, you might decide to switch to working on *statements* or *expressions* instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`. -## Writing unit tests +### Writing unit tests For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look when parsing that rule. These tests are written using specially-formatted text files in a `corpus` directory in your parser's root folder. Here is an example of how these tests should look: @@ -209,13 +233,13 @@ tree-sitter test -f 'Return statements' The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `corpus` directory. It's typically a good idea to test all of the permutations of each language construct. 
This increases test coverage, but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language. -## Using the grammar DSL +### The Grammar DSL The following is a complete list of built-in functions you can use to define Tree-sitter grammars. Use-cases for some of these functions will be explained in more detail in later sections. * **Symbols (the `$` object)** - Every grammar rule is written as a JavaScript function that takes a parameter conventionally called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule. * **String and Regex literals** - The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing regular expressions in your grammar. -* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation](enbf). +* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf]. * **Alternatives : `choice(rule1, rule2, ...)`** - This function creates a rule that matches *one* of a set of possible rules. The order of the arguments does not matter. This is analogous to the `|` (pipe) operator in EBNF notation. * **Repetitions : `repeat(rule)`** - This function creates a rule that matches *zero-or-more* occurrences of a given rule. It is analogous to the `{x}` (curly brace) syntax in EBNF notation. 
* **Repetitions : `repeat1(rule)`** - This function creates a rule that matches *one-or-more* occurrences of a given rule. The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used. @@ -229,13 +253,13 @@ The following is a complete list of built-in functions you can use to define Tre In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior of the parser. -* `extras` - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar. -* `inline` - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime. -* `conflicts` - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*. -* `externals` - an array of token names which can be returned by an *external scanner*. External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions. -* `word` - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization. +* **`extras`** - an array of tokens that may appear *anywhere* in the language. 
This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar. +* **`inline`** - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime. +* **`conflicts`** - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*. +* **`externals`** - an array of token names which can be returned by an [*external scanner*](#external-scanners). External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions. +* **`word`** - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization. -## Adjusting existing grammars +### Adjusting existing grammars Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. You might try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. To illustrate the problem with this approach, consider the following line of code: @@ -320,7 +344,7 @@ Possible resolutions: 4: Add a conflict for these rules: `binary_expression` `unary_expression` ``` -For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. 
This is where the `prec` function described above comes into play. By wrapping a rule with `prec`, we can indicate that certain sequence of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`: +For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This is where the `prec` function [described above](#the-grammar-dsl) comes into play. By wrapping a rule with `prec`, we can indicate that certain sequence of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`: ```js { @@ -416,6 +440,8 @@ For example, suppose we added `identifier` as the `word` token in our JavaScript ```js grammar({ + name: 'javascript', + word: $ => $.identifier, rules: { @@ -438,36 +464,175 @@ grammar({ identifier: $ => /[a-z_]+/ } -}) +}); ``` Tree-sitter would identify `typeof` and `instanceof` as keywords. Then, when parsing the invalid code above, rather than scanning for the `instanceof` token individually, it would scan for an `identifier` first, and find `instanceofSomething`. It would then correctly recognize the code as invalid. Aside from improving error detection, keyword extraction also has performance benefits. It allows Tree-sitter to generate a smaller, simpler lexing function, which means that **the parser will compile much more quickly**. -[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis -[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch +### External Scanners + +Many languages have some tokens whose structure is impossible or inconvenient to describe with a regular expression. 
Some examples: +* [Indent and dedent][indent-tokens] tokens in Python +* [Heredocs][heredoc] in Bash and Ruby +* [Percent strings][percent-string] in Ruby + +Tree-sitter allows you to handle these kinds of tokens using *external scanners*. An external scanner is a set of C functions that you, the grammar author, can write by hand in order to add custom logic for recognizing certain tokens. + +To use an external scanner, there are a few steps. First, add an `externals` section to your grammar. This section should list the names of all of your external tokens. These names can then be used elsewhere in your grammar. + +```js +grammar({ + name: 'my_language', + + externals: $ => [ + $.indent, + $.dedent, + $.newline + ], + + // ... +}); +``` + +Then, add another C or C++ source file to your project. Currently, its path must be `src/scanner.c` or `src/scanner.cc` for the CLI to recognize it. Be sure to add this file to the `sources` section of your `binding.gyp` file so that it will be included when your project is compiled by Node.js. + +In this new source file, define an [`enum`][enum] type containing the names of all of your external tokens. The ordering of this enum must match the order in your grammar's `externals` array. + +```c +#include + +enum TokenType { + INDENT, + DEDENT, + NEWLINE +} +``` + +Finally, you must define five functions with specific names, based on your language's name and five actions: *create*, *destroy*, *serialize*, *deserialize*, and *scan*. These functions must all use [C linkage][c-linkage], so if you're writing the scanner in C++, you need to declare them with the `extern "C"` qualifier. + +#### Create + +```c +void * tree_sitter_my_language_external_scanner_create() { + // ... +} +``` + +This function should create your scanner object. It will only be called once anytime your language is set on a parser. Often, you will want to allocate memory on the heap and return a pointer to it. 
If your external scanner doesn't need to maintain any state, it's ok to return `NULL`. + + +#### Destroy + +```c +void tree_sitter_my_language_external_scanner_destroy(void *payload) { + // ... +} +``` + +This function should free any memory used by your scanner. It is called once when a parser is deleted or assigned a different language. It receives as an argument the same pointer that was returned from the *create* function. If your *create* function didn't allocate any memory, this function can be a noop. + +#### Serialize + +```c +unsigned tree_sitter_my_language_external_scanner_serialize( + void *payload, + char *buffer +) { + // ... +} +``` + +This function should copy the complete state of your scanner into a given byte buffer, and return the number of bytes written. The function is called every time the external scanner successfully recognizes a token. It receives a pointer to your scanner and a pointer to a buffer. The maximum number of bytes that you can write is given by the `TREE_SITTER_SERIALIZATION_BUFFER_SIZE` constant, defined in the `tree_sitter/parser.h` header file. + +The data that this function writes will ultimately be stored in the syntax tree so that the scanner can be restored to the right state when handling edits or ambiguities. For your parser to work correctly, the `serialize` function must store its entire state, and `deserialize` must restore the entire state. For good performance, you should design your scanner so that its state can be serialized as quickly and compactly as possible. + +#### Deserialize + +```c +void tree_sitter_my_language_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) { + // ... +} +``` + +This function should *restore* the state of your scanner based on the bytes that were previously written by the `serialize` function. It is called with a pointer to your scanner, a pointer to the buffer of bytes, and the number of bytes that should be read. 
+ +#### Scan + +```c +bool tree_sitter_my_language_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols +) { + // ... +} +``` + +This function is responsible for recognizing external tokens. It should return `true` if a token was recognized, and `false` otherwise. It is called with a "lexer" struct with the following fields: + +* **`uint32_t lookahead`** - The current next character in the input stream, represented as a 32-bit unicode code point. +* **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. +* **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. +* **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. +* **`uint32_t (*get_column)(TSLexer *)`** - **(Experimental)** A function for querying the current column position of the lexer. It returns the number of unicode code points (not bytes) since the start of the current line. +* **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. 
For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. + +The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic. + +```c +if (valid_symbols[INDENT] || valid_symbols[DEDENT]) { + + // ... logic that is common to both `INDENT` and `DEDENT` + + if (valid_symbols[INDENT]) { + + // ... logic that is specific to `INDENT` + + lexer->result_symbol = INDENT; + return true; + } +} +``` + +[ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar +[antlr]: http://www.antlr.org/ +[bison-dprec]: https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html +[bison]: https://en.wikipedia.org/wiki/GNU_bison +[c-linkage]: https://en.cppreference.com/w/cpp/language/language_linkage +[cargo]: https://doc.rust-lang.org/cargo/getting-started/installation.html +[crate]: https://crates.io/crates/tree-sitter-cli [cst]: https://en.wikipedia.org/wiki/Parse_tree [dfa]: https://en.wikipedia.org/wiki/Deterministic_finite_automaton -[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols -[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification -[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser -[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser -[yacc]: https://en.wikipedia.org/wiki/Yacc -[bison]: https://en.wikipedia.org/wiki/GNU_bison -[antlr]: http://www.antlr.org/ -[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar -[ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar -[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript +[ebnf]: 
https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form [ecmascript-spec]: https://www.ecma-international.org/ecma-262/6.0 -[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter-cli +[ejs]: https://ejs.co +[enum]: https://en.wikipedia.org/wiki/Enumerated_type#C +[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser +[heredoc]: https://en.wikipedia.org/wiki/Here_document +[indent-tokens]: https://en.wikipedia.org/wiki/Off-side_rule +[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification +[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis +[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch +[lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables +[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser +[multi-language-section]: ./using-parsers#multi-language-documents +[nan]: https://github.com/nodejs/nan [node-module]: https://www.npmjs.com/package/tree-sitter-cli [node.js]: https://nodejs.org +[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols [npm]: https://docs.npmjs.com -[nan]: https://github.com/nodejs/nan +[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) +[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar +[percent-string]: https://docs.ruby-lang.org/en/2.5.0/syntax/literals_rdoc.html#label-Percent+Strings +[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest [s-exp]: https://en.wikipedia.org/wiki/S-expression -[node-gyp]: https://github.com/nodejs/node-gyp -[ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form -[lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables +[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter/tree/master/cli +[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript [yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html -[bison-dprec]: 
https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html +[yacc]: https://en.wikipedia.org/wiki/Yacc diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml new file mode 100644 index 00000000..688a2f6c --- /dev/null +++ b/highlight/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "tree-sitter-highlight" +description = "Library for performing syntax highlighting with Tree-sitter" +version = "0.1.4" +authors = [ + "Max Brunsfeld ", + "Tim Clem " +] +license = "MIT" +readme = "README.md" +edition = "2018" +keywords = ["incremental", "parsing", "syntax", "highlighting"] +categories = ["parsing", "text-editors"] + +[dependencies] +regex = "1" +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" + +[dependencies.tree-sitter] +version = ">= 0.3.7" +path = "../lib" diff --git a/highlight/README.md b/highlight/README.md new file mode 100644 index 00000000..b6b311cc --- /dev/null +++ b/highlight/README.md @@ -0,0 +1,58 @@ +Tree-sitter Highlighting +========================= + +[![Build Status](https://travis-ci.org/tree-sitter/tree-sitter.svg?branch=master)](https://travis-ci.org/tree-sitter/tree-sitter) +[![Build status](https://ci.appveyor.com/api/projects/status/vtmbd6i92e97l55w/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/tree-sitter/branch/master) +[![Crates.io](https://img.shields.io/crates/v/tree-sitter-highlight.svg)](https://crates.io/crates/tree-sitter-highlight) + +### Usage + +Compile some languages into your app, and declare them: + +```rust +extern "C" tree_sitter_html(); +extern "C" tree_sitter_javascript(); +``` + +Load some *property sheets*: + +```rust +use tree_sitter_highlight::load_property_sheet; + +let javascript_property_sheet = load_property_sheet( + fs::read_to_string("./tree-sitter-javascript/src/highlights.json").unwrap() +).unwrap(); + +let html_property_sheet = load_property_sheet( + fs::read_to_string("./tree-sitter-html/src/highlights.json").unwrap() +).unwrap(); +``` + +Highlight some code: 
+
+```rust
+use tree_sitter_highlight::{highlight, HighlightEvent};
+
+let highlights = highlight(
+    b"const x = new Y();",
+    unsafe { tree_sitter_javascript() },
+    &javascript_property_sheet,
+    &|_| None
+).unwrap();
+
+for event in highlights {
+    match event {
+        HighlightEvent::Source(s) => {
+            eprintln!("source: {:?}", s);
+        },
+        HighlightEvent::ScopeStart(s) => {
+            eprintln!("scope started: {:?}", s);
+        },
+        HighlightEvent::ScopeEnd => {
+            eprintln!("scope ended");
+        },
+    }
+}
+```
+
+The last parameter to `highlight` is a *language injection* callback. This allows other languages to be retrieved when Tree-sitter detects an embedded document (for example, a piece of JavaScript code inside of a `script` tag within HTML).
diff --git a/highlight/src/escape.rs b/highlight/src/escape.rs
new file mode 100644
index 00000000..882f160c
--- /dev/null
+++ b/highlight/src/escape.rs
@@ -0,0 +1,53 @@
+// Copyright 2013 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! HTML Escaping
+//!
+//! This module contains one unit-struct which can be used to HTML-escape a
+//! string of text (for use in a format string).
+
+use std::fmt;
+
+/// Wrapper struct which will emit the HTML-escaped version of the contained
+/// string when passed to a format string.
+pub struct Escape<'a>(pub &'a str);
+
+impl<'a> fmt::Display for Escape<'a> {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        // Because the internet is always right, turns out there's not that many
+        // characters to escape: http://stackoverflow.com/questions/7381974
+        let Escape(s) = *self;
+        let pile_o_bits = s;
+        let mut last = 0;
+        for (i, ch) in s.bytes().enumerate() {
+            match ch as char {
+                '<' | '>' | '&' | '\'' | '"' => {
+                    fmt.write_str(&pile_o_bits[last..i])?;
+                    let s = match ch as char {
+                        '>' => "&gt;",
+                        '<' => "&lt;",
+                        '&' => "&amp;",
+                        '\'' => "&#39;",
+                        '"' => "&quot;",
+                        _ => unreachable!(),
+                    };
+                    fmt.write_str(s)?;
+                    last = i + 1;
+                }
+                _ => {}
+            }
+        }
+
+        if last < s.len() {
+            fmt.write_str(&pile_o_bits[last..])?;
+        }
+        Ok(())
+    }
+}
diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs
new file mode 100644
index 00000000..e5499fbc
--- /dev/null
+++ b/highlight/src/lib.rs
@@ -0,0 +1,881 @@
+mod escape;
+
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use serde_derive::*;
+use std::cmp;
+use std::fmt::Write;
+use std::mem::transmute;
+use std::str;
+use std::usize;
+use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor};
+
+#[derive(Debug)]
+enum TreeStep {
+    Child {
+        index: isize,
+        kinds: Option<Vec<u16>>,
+    },
+    Children {
+        kinds: Option<Vec<u16>>,
+    },
+    Next {
+        kinds: Option<Vec<u16>>,
+    },
+}
+
+#[derive(Debug)]
+enum InjectionLanguage {
+    Literal(String),
+    TreePath(Vec<TreeStep>),
+}
+
+#[derive(Debug)]
+struct Injection {
+    language: InjectionLanguage,
+    content: Vec<TreeStep>,
+}
+
+#[derive(Debug)]
+pub struct Properties {
+    scope: Option<Scope>,
+    injections: Vec<Injection>,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[repr(u16)]
+pub enum Scope {
+    Attribute,
+    Comment,
+    Constant,
+    ConstantBuiltin,
+    Constructor,
+    ConstructorBuiltin,
+    Embedded,
+    Escape,
+    Function,
+    FunctionBuiltin,
+    Keyword,
+    Number,
+    Operator,
+    Property,
+    PropertyBuiltin,
+    Punctuation,
+    PunctuationBracket,
+
PunctuationDelimiter, + PunctuationSpecial, + String, + StringSpecial, + Tag, + Type, + TypeBuiltin, + Variable, + VariableBuiltin, + Unknown, +} + +struct Layer<'a> { + _tree: Tree, + cursor: TreePropertyCursor<'a, Properties>, + ranges: Vec, + at_node_end: bool, +} + +struct Highlighter<'a, T> +where + T: Fn(&str) -> Option<(Language, &'a PropertySheet)>, +{ + injection_callback: T, + source: &'a [u8], + source_offset: usize, + parser: Parser, + layers: Vec>, + utf8_error_len: Option, +} + +#[derive(Copy, Clone, Debug)] +pub enum HighlightEvent<'a> { + Source(&'a str), + ScopeStart(Scope), + ScopeEnd, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum TreePathArgJSON { + TreePath(TreePathJSON), + Number(isize), + String(String), +} + +#[derive(Debug, Deserialize)] +#[serde(tag = "name")] +enum TreePathJSON { + #[serde(rename = "this")] + This, + #[serde(rename = "child")] + Child { args: Vec }, + #[serde(rename = "next")] + Next { args: Vec }, + #[serde(rename = "children")] + Children { args: Vec }, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum InjectionLanguageJSON { + List(Vec), + TreePath(TreePathJSON), + Literal(String), +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum InjectionContentJSON { + List(Vec), + TreePath(TreePathJSON), +} + +#[derive(Debug, Deserialize)] +struct PropertiesJSON { + scope: Option, + #[serde(rename = "injection-language")] + injection_language: Option, + #[serde(rename = "injection-content")] + injection_content: Option, +} + +#[derive(Debug)] +pub enum PropertySheetError { + InvalidJSON(serde_json::Error), + InvalidRegex(regex::Error), + InvalidFormat(String), +} + +pub fn load_property_sheet( + language: Language, + json: &str, +) -> Result, PropertySheetError> { + let sheet = PropertySheet::new(language, json).map_err(|e| match e { + tree_sitter::PropertySheetError::InvalidJSON(e) => PropertySheetError::InvalidJSON(e), + tree_sitter::PropertySheetError::InvalidRegex(e) => 
PropertySheetError::InvalidRegex(e), + })?; + let sheet = sheet + .map(|p| Properties::new(p, language)) + .map_err(PropertySheetError::InvalidFormat)?; + Ok(sheet) +} + +impl Scope { + pub fn from_usize(i: usize) -> Option { + if i <= (Scope::Unknown as usize) { + Some(unsafe { transmute(i as u16) }) + } else { + None + } + } +} + +impl Properties { + fn new(json: PropertiesJSON, language: Language) -> Result { + let injections = match (json.injection_language, json.injection_content) { + (None, None) => Ok(Vec::new()), + (Some(_), None) => Err( + "Must specify an injection-content along with an injection-language".to_string(), + ), + (None, Some(_)) => Err( + "Must specify an injection-language along with an injection-content".to_string(), + ), + (Some(language_json), Some(content_json)) => { + let languages = match language_json { + InjectionLanguageJSON::List(list) => { + let mut result = Vec::with_capacity(list.len()); + for element in list { + result.push(match element { + InjectionLanguageJSON::TreePath(p) => { + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + InjectionLanguage::TreePath(result) + } + InjectionLanguageJSON::Literal(s) => InjectionLanguage::Literal(s), + InjectionLanguageJSON::List(_) => { + panic!("Injection-language cannot be a list of lists") + } + }) + } + result + } + InjectionLanguageJSON::TreePath(p) => vec![{ + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + InjectionLanguage::TreePath(result) + }], + InjectionLanguageJSON::Literal(s) => vec![InjectionLanguage::Literal(s)], + }; + + let contents = match content_json { + InjectionContentJSON::List(l) => { + let mut result = Vec::with_capacity(l.len()); + for element in l { + result.push(match element { + InjectionContentJSON::TreePath(p) => { + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + result + } + InjectionContentJSON::List(_) => { + panic!("Injection-content cannot be 
a list of lists") + } + }) + } + result + } + InjectionContentJSON::TreePath(p) => vec![{ + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + result + }], + }; + + if languages.len() == contents.len() { + Ok(languages + .into_iter() + .zip(contents.into_iter()) + .map(|(language, content)| Injection { language, content }) + .collect()) + } else { + Err(format!( + "Mismatch: got {} injection-language values but {} injection-content values", + languages.len(), + contents.len(), + )) + } + } + }?; + + Ok(Self { + scope: json.scope, + injections, + }) + } + + // Transform a tree path from the format expressed directly in the property sheet + // (nested function calls), to a flat sequence of steps for transforming a list of + // nodes. This way, we can evaluate these tree paths with no recursion and a single + // vector of intermediate storage. + fn flatten_tree_path( + p: TreePathJSON, + steps: &mut Vec, + language: Language, + ) -> Result<(), String> { + match p { + TreePathJSON::This => {} + TreePathJSON::Child { args } => { + let (tree_path, index, kinds) = Self::parse_args("child", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Child { + index: index + .ok_or_else(|| "The `child` function requires an index".to_string())?, + kinds: kinds, + }); + } + TreePathJSON::Children { args } => { + let (tree_path, _, kinds) = Self::parse_args("children", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Children { kinds }); + } + TreePathJSON::Next { args } => { + let (tree_path, _, kinds) = Self::parse_args("next", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Next { kinds }); + } + } + Ok(()) + } + + fn parse_args( + name: &str, + args: Vec, + language: Language, + ) -> Result<(TreePathJSON, Option, Option>), String> { + let tree_path; + let mut index = None; + let mut kinds = Vec::new(); + let 
mut iter = args.into_iter(); + + match iter.next() { + Some(TreePathArgJSON::TreePath(p)) => tree_path = p, + _ => { + return Err(format!( + "First argument to `{}()` must be a tree path", + name + )); + } + } + + for arg in iter { + match arg { + TreePathArgJSON::TreePath(_) => { + return Err(format!( + "Other arguments to `{}()` must be strings or numbers", + name + )); + } + TreePathArgJSON::Number(i) => index = Some(i), + TreePathArgJSON::String(s) => kinds.push(s), + } + } + + if kinds.len() > 0 { + let mut kind_ids = Vec::new(); + for i in 0..(language.node_kind_count() as u16) { + if kinds.iter().any(|s| s == language.node_kind_for_id(i)) + && language.node_kind_is_named(i) + { + kind_ids.push(i); + } + } + if kind_ids.len() == 0 { + return Err(format!("Non-existent node kinds: {:?}", kinds)); + } + + Ok((tree_path, index, Some(kind_ids))) + } else { + Ok((tree_path, index, None)) + } + } +} + +impl<'a, F> Highlighter<'a, F> +where + F: Fn(&str) -> Option<(Language, &'a PropertySheet)>, +{ + fn new( + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, + injection_callback: F, + ) -> Result { + let mut parser = Parser::new(); + parser.set_language(language)?; + let tree = parser + .parse(source, None) + .ok_or_else(|| format!("Tree-sitter: failed to parse"))?; + Ok(Self { + injection_callback, + source, + source_offset: 0, + parser, + layers: vec![Layer::new( + source, + tree, + property_sheet, + vec![Range { + start_byte: 0, + end_byte: usize::MAX, + start_point: Point::new(0, 0), + end_point: Point::new(usize::MAX, usize::MAX), + }], + )], + utf8_error_len: None, + }) + } + + fn emit_source(&mut self, next_offset: usize) -> Option> { + let input = &self.source[self.source_offset..next_offset]; + match str::from_utf8(input) { + Ok(valid) => { + self.source_offset = next_offset; + Some(HighlightEvent::Source(valid)) + } + Err(error) => { + if let Some(error_len) = error.error_len() { + if error.valid_up_to() > 0 { + let prefix = 
&input[0..error.valid_up_to()]; + self.utf8_error_len = Some(error_len); + Some(HighlightEvent::Source(unsafe { + str::from_utf8_unchecked(prefix) + })) + } else { + self.source_offset += error_len; + Some(HighlightEvent::Source("\u{FFFD}")) + } + } else { + None + } + } + } + } + + fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec) { + let len = nodes.len(); + for i in 0..len { + let node = nodes[i]; + match step { + TreeStep::Child { index, kinds } => { + let index = if *index >= 0 { + *index as usize + } else { + (node.child_count() as isize + *index) as usize + }; + if let Some(child) = node.child(index) { + if let Some(kinds) = kinds { + if kinds.contains(&child.kind_id()) { + nodes.push(child); + } + } else { + nodes.push(child); + } + } + } + TreeStep::Children { kinds } => { + for child in node.children() { + if let Some(kinds) = kinds { + if kinds.contains(&child.kind_id()) { + nodes.push(child); + } + } else { + nodes.push(child); + } + } + } + TreeStep::Next { .. } => unimplemented!(), + } + } + nodes.drain(0..len); + } + + fn nodes_for_tree_path(&self, node: Node<'a>, steps: &Vec) -> Vec> { + let mut nodes = vec![node]; + for step in steps.iter() { + self.process_tree_step(step, &mut nodes); + } + nodes + } + + // An injected language name may either be specified as a fixed string, or based + // on the text of some node in the syntax tree. + fn injection_language_string( + &self, + node: &Node<'a>, + language: &InjectionLanguage, + ) -> Option { + match language { + InjectionLanguage::Literal(s) => Some(s.to_string()), + InjectionLanguage::TreePath(steps) => self + .nodes_for_tree_path(*node, steps) + .first() + .and_then(|node| { + str::from_utf8(&self.source[node.start_byte()..node.end_byte()]) + .map(|s| s.to_owned()) + .ok() + }), + } + } + + // Compute the ranges that should be included when parsing an injection. + // This takes into account two things: + // * `nodes` - Every injection takes place within a set of nodes. 
The injection ranges + // are the ranges of those nodes, *minus* the ranges of those nodes' children. + // * `parent_ranges` - The new injection may be nested inside of *another* injection + // (e.g. JavaScript within HTML within ERB). The parent injection's ranges must + // be taken into account. + fn intersect_ranges(parent_ranges: &Vec, nodes: &Vec) -> Vec { + let mut result = Vec::new(); + let mut parent_range_iter = parent_ranges.iter(); + let mut parent_range = parent_range_iter + .next() + .expect("Layers should only be constructed with non-empty ranges vectors"); + for node in nodes.iter() { + let range = node.range(); + let mut preceding_range = Range { + start_byte: 0, + start_point: Point::new(0, 0), + end_byte: range.start_byte, + end_point: range.start_point, + }; + let following_range = Range { + start_byte: node.end_byte(), + start_point: node.end_position(), + end_byte: usize::MAX, + end_point: Point::new(usize::MAX, usize::MAX), + }; + + for child_range in node + .children() + .map(|c| c.range()) + .chain([following_range].iter().cloned()) + { + let mut range = Range { + start_byte: preceding_range.end_byte, + start_point: preceding_range.end_point, + end_byte: child_range.start_byte, + end_point: child_range.start_point, + }; + preceding_range = child_range; + + if range.end_byte < parent_range.start_byte { + continue; + } + + while parent_range.start_byte <= range.end_byte { + if parent_range.end_byte > range.start_byte { + if range.start_byte < parent_range.start_byte { + range.start_byte = parent_range.start_byte; + range.start_point = parent_range.start_point; + } + + if parent_range.end_byte < range.end_byte { + if range.start_byte < parent_range.end_byte { + result.push(Range { + start_byte: range.start_byte, + start_point: range.start_point, + end_byte: parent_range.end_byte, + end_point: parent_range.end_point, + }); + } + range.start_byte = parent_range.end_byte; + range.start_point = parent_range.end_point; + } else { + if 
range.start_byte < range.end_byte { + result.push(range); + } + break; + } + } + + if let Some(next_range) = parent_range_iter.next() { + parent_range = next_range; + } else { + return result; + } + } + } + } + result + } + + fn add_layer(&mut self, language_string: &str, ranges: Vec) { + if let Some((language, property_sheet)) = (self.injection_callback)(language_string) { + self.parser + .set_language(language) + .expect("Failed to set language"); + self.parser.set_included_ranges(&ranges); + let tree = self + .parser + .parse(self.source, None) + .expect("Failed to parse"); + let layer = Layer::new(self.source, tree, property_sheet, ranges); + match self.layers.binary_search_by(|l| l.cmp(&layer)) { + Ok(i) | Err(i) => self.layers.insert(i, layer), + }; + } + } +} + +impl<'a, T: Fn(&str) -> Option<(Language, &'a PropertySheet)>> Iterator + for Highlighter<'a, T> +{ + type Item = HighlightEvent<'a>; + + fn next(&mut self) -> Option { + if let Some(utf8_error_len) = self.utf8_error_len.take() { + self.source_offset += utf8_error_len; + return Some(HighlightEvent::Source("\u{FFFD}")); + } + + while !self.layers.is_empty() { + let first_layer = &self.layers[0]; + let properties = &first_layer.cursor.node_properties(); + + // Add any injections for the current node. + if !first_layer.at_node_end { + let node = first_layer.cursor.node(); + let injections = properties + .injections + .iter() + .filter_map(|Injection { language, content }| { + if let Some(language) = self.injection_language_string(&node, language) { + let nodes = self.nodes_for_tree_path(node, content); + let ranges = Self::intersect_ranges(&first_layer.ranges, &nodes); + if ranges.len() > 0 { + return Some((language, ranges)); + } + } + None + }) + .collect::>(); + + for (language, ranges) in injections { + self.add_layer(&language, ranges); + } + } + + // Determine if any scopes start or end at the current position. 
+ let scope_event; + if let Some(scope) = properties.scope { + let next_offset = cmp::min(self.source.len(), self.layers[0].offset()); + + // Before returning any scope boundaries, return any remaining slice of + // the source code the precedes that scope boundary. + if self.source_offset < next_offset { + return self.emit_source(next_offset); + } + + scope_event = if self.layers[0].at_node_end { + Some(HighlightEvent::ScopeEnd) + } else { + Some(HighlightEvent::ScopeStart(scope)) + }; + } else { + scope_event = None; + }; + + // Advance the current layer's tree cursor. This might cause that cursor to move + // beyond one of the other layers' cursors for a different syntax tree, so we need + // to re-sort the layers. If the cursor is already at the end of its syntax tree, + // remove it. + if self.layers[0].advance() { + self.layers.sort_unstable_by(|a, b| a.cmp(&b)); + } else { + self.layers.remove(0); + } + + if scope_event.is_some() { + return scope_event; + } + } + + if self.source_offset < self.source.len() { + self.emit_source(self.source.len()) + } else { + None + } + } +} + +impl<'a> Layer<'a> { + fn new( + source: &'a [u8], + tree: Tree, + sheet: &'a PropertySheet, + ranges: Vec, + ) -> Self { + // The cursor's lifetime parameter indicates that the tree must outlive the cursor. + // But because the tree is really a pointer to the heap, the cursor can remain + // valid when the tree is moved. There's no way to express this with lifetimes + // right now, so we have to `transmute` the cursor's lifetime. + let cursor = unsafe { transmute(tree.walk_with_properties(sheet, source)) }; + Self { + _tree: tree, + cursor, + ranges, + at_node_end: false, + } + } + + fn cmp(&self, other: &Layer) -> cmp::Ordering { + // Events are ordered primarily by their position in the document. But if + // one scope starts at a given position and another scope ends at that + // same position, return the scope end event before the scope start event. 
+ self.offset() + .cmp(&other.offset()) + .then_with(|| other.at_node_end.cmp(&self.at_node_end)) + } + + fn offset(&self) -> usize { + if self.at_node_end { + self.cursor.node().end_byte() + } else { + self.cursor.node().start_byte() + } + } + + fn advance(&mut self) -> bool { + if self.at_node_end { + if self.cursor.goto_next_sibling() { + self.at_node_end = false; + } else if !self.cursor.goto_parent() { + return false; + } + } else if !self.cursor.goto_first_child() { + self.at_node_end = true; + } + true + } +} + +impl<'de> Deserialize<'de> for Scope { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "attribute" => Ok(Scope::Attribute), + "comment" => Ok(Scope::Comment), + "constant" => Ok(Scope::Constant), + "constant.builtin" => Ok(Scope::ConstantBuiltin), + "constructor" => Ok(Scope::Constructor), + "constructor.builtin" => Ok(Scope::ConstructorBuiltin), + "embedded" => Ok(Scope::Embedded), + "escape" => Ok(Scope::Escape), + "function" => Ok(Scope::Function), + "function.builtin" => Ok(Scope::FunctionBuiltin), + "keyword" => Ok(Scope::Keyword), + "number" => Ok(Scope::Number), + "operator" => Ok(Scope::Operator), + "property" => Ok(Scope::Property), + "property.builtin" => Ok(Scope::PropertyBuiltin), + "punctuation" => Ok(Scope::Punctuation), + "punctuation.bracket" => Ok(Scope::PunctuationBracket), + "punctuation.delimiter" => Ok(Scope::PunctuationDelimiter), + "punctuation.special" => Ok(Scope::PunctuationSpecial), + "string" => Ok(Scope::String), + "string.special" => Ok(Scope::StringSpecial), + "type" => Ok(Scope::Type), + "type.builtin" => Ok(Scope::TypeBuiltin), + "variable" => Ok(Scope::Variable), + "variable.builtin" => Ok(Scope::VariableBuiltin), + "tag" => Ok(Scope::Tag), + _ => Ok(Scope::Unknown), + } + } +} + +impl Serialize for Scope { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + Scope::Attribute => 
serializer.serialize_str("attribute"), + Scope::Comment => serializer.serialize_str("comment"), + Scope::Constant => serializer.serialize_str("constant"), + Scope::ConstantBuiltin => serializer.serialize_str("constant.builtin"), + Scope::Constructor => serializer.serialize_str("constructor"), + Scope::ConstructorBuiltin => serializer.serialize_str("constructor.builtin"), + Scope::Embedded => serializer.serialize_str("embedded"), + Scope::Escape => serializer.serialize_str("escape"), + Scope::Function => serializer.serialize_str("function"), + Scope::FunctionBuiltin => serializer.serialize_str("function.builtin"), + Scope::Keyword => serializer.serialize_str("keyword"), + Scope::Number => serializer.serialize_str("number"), + Scope::Operator => serializer.serialize_str("operator"), + Scope::Property => serializer.serialize_str("property"), + Scope::PropertyBuiltin => serializer.serialize_str("property.builtin"), + Scope::Punctuation => serializer.serialize_str("punctuation"), + Scope::PunctuationBracket => serializer.serialize_str("punctuation.bracket"), + Scope::PunctuationDelimiter => serializer.serialize_str("punctuation.delimiter"), + Scope::PunctuationSpecial => serializer.serialize_str("punctuation.special"), + Scope::String => serializer.serialize_str("string"), + Scope::StringSpecial => serializer.serialize_str("string.special"), + Scope::Type => serializer.serialize_str("type"), + Scope::TypeBuiltin => serializer.serialize_str("type.builtin"), + Scope::Variable => serializer.serialize_str("variable"), + Scope::VariableBuiltin => serializer.serialize_str("variable.builtin"), + Scope::Tag => serializer.serialize_str("tag"), + Scope::Unknown => serializer.serialize_str(""), + } + } +} + +pub trait HTMLAttributeCallback<'a>: Fn(Scope) -> &'a str {} + +pub fn highlight<'a, F>( + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, + injection_callback: F, +) -> Result> + 'a, String> +where + F: Fn(&str) -> Option<(Language, &'a 
PropertySheet)> + 'a, +{ + Highlighter::new(source, language, property_sheet, injection_callback) +} + +pub fn highlight_html<'a, F1, F2>( + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, + injection_callback: F1, + attribute_callback: F2, +) -> Result, String> +where + F1: Fn(&str) -> Option<(Language, &'a PropertySheet)>, + F2: Fn(Scope) -> &'a str, +{ + let highlighter = Highlighter::new(source, language, property_sheet, injection_callback)?; + let mut renderer = HtmlRenderer::new(attribute_callback); + let mut scopes = Vec::new(); + for event in highlighter { + match event { + HighlightEvent::ScopeStart(s) => { + scopes.push(s); + renderer.start_scope(s); + } + HighlightEvent::ScopeEnd => { + scopes.pop(); + renderer.end_scope(); + } + HighlightEvent::Source(src) => { + renderer.add_text(src, &scopes); + } + }; + } + if !renderer.current_line.is_empty() { + renderer.finish_line(); + } + Ok(renderer.result) +} + +struct HtmlRenderer<'a, F: Fn(Scope) -> &'a str> { + result: Vec, + current_line: String, + attribute_callback: F, +} + +impl<'a, F> HtmlRenderer<'a, F> +where + F: Fn(Scope) -> &'a str, +{ + fn new(attribute_callback: F) -> Self { + HtmlRenderer { + result: Vec::new(), + current_line: String::new(), + attribute_callback, + } + } + + fn start_scope(&mut self, s: Scope) { + write!( + &mut self.current_line, + "", + (self.attribute_callback)(s), + ) + .unwrap(); + } + + fn end_scope(&mut self) { + write!(&mut self.current_line, "").unwrap(); + } + + fn finish_line(&mut self) { + self.current_line.push('\n'); + self.result.push(self.current_line.clone()); + self.current_line.clear(); + } + + fn add_text(&mut self, src: &str, scopes: &Vec) { + let mut multiline = false; + for line in src.split('\n') { + let line = line.trim_end_matches('\r'); + if multiline { + scopes.iter().for_each(|_| self.end_scope()); + self.finish_line(); + scopes.iter().for_each(|scope| self.start_scope(*scope)); + } + write!(&mut self.current_line, 
"{}", escape::Escape(line)).unwrap(); + multiline = true; + } + } +} diff --git a/lib/Cargo.toml b/lib/Cargo.toml index f3c3efb3..c6f84ada 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.8" +version = "0.3.9" authors = ["Max Brunsfeld "] license = "MIT" readme = "binding/README.md" diff --git a/script/version b/script/version index e46898f7..c2bbeed3 100755 --- a/script/version +++ b/script/version @@ -24,7 +24,7 @@ const arg = process.argv[2]; if (!arg) { console.log([ - `Usage: script/version major |minor | patch | `, + `Usage: script/version major | minor | patch | `, '', 'Update the CLI version by the given increment or to the given', 'version number, creating a commit and tag for the new version.',