From 4db132ff94a5775f0c06b743ff1c603251222d4f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 15 Feb 2019 19:25:20 -0800 Subject: [PATCH 01/27] In property sheets, avoid converting numeric values to strings --- cli/src/properties.rs | 59 +++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/cli/src/properties.rs b/cli/src/properties.rs index fccfd7ed..0ec4330f 100644 --- a/cli/src/properties.rs +++ b/cli/src/properties.rs @@ -15,6 +15,7 @@ use tree_sitter::{self, PropertyStateJSON, PropertyTransitionJSON}; #[derive(Clone, Debug, PartialEq, Eq, Serialize)] #[serde(untagged)] enum PropertyValue { + Number(isize), String(String), Object(PropertySet), Array(Vec), @@ -621,7 +622,7 @@ fn parse_sass_value(value: &Value) -> Result { Ok(PropertyValue::Array(result)) } Value::Color(_, Some(name)) => Ok(PropertyValue::String(name.clone())), - Value::Numeric(n, _) => Ok(PropertyValue::String(format!("{}", n))), + Value::Numeric(n, _) => Ok(PropertyValue::Number(n.to_integer())), Value::True => Ok(PropertyValue::String("true".to_string())), Value::False => Ok(PropertyValue::String("false".to_string())), _ => Err(Error(format!( @@ -703,71 +704,71 @@ mod tests { // f1 single-element selector assert_eq!( *query_simple(&sheet, vec!["f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f3", "f1"]), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); // f2 single-element selector assert_eq!( *query_simple(&sheet, vec!["f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( 
*query_simple(&sheet, vec!["f1", "f3", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f6", "f2"]), - props(&[("color", "indigo"), ("height", "2")]) + props(&[("color", string("indigo")), ("height", num(2))]) ); // f3 single-element selector assert_eq!( *query_simple(&sheet, vec!["f3"]), - props(&[("color", "violet"), ("height", "3")]) + props(&[("color", string("violet")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f3"]), - props(&[("color", "violet"), ("height", "3")]) + props(&[("color", string("violet")), ("height", num(3))]) ); // f2 child selector assert_eq!( *query_simple(&sheet, vec!["f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f2", "f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); assert_eq!( *query_simple(&sheet, vec!["f3", "f1", "f2"]), - props(&[("color", "green"), ("height", "2")]) + props(&[("color", string("green")), ("height", num(2))]) ); // f3 descendant selector assert_eq!( *query_simple(&sheet, vec!["f1", "f3"]), - props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f2", "f3"]), - props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); assert_eq!( *query_simple(&sheet, vec!["f1", "f6", "f7", "f8", "f3"]), - props(&[("color", "blue"), ("height", "3")]) + props(&[("color", string("blue")), ("height", num(3))]) ); // no match @@ -801,20 +802,20 @@ mod tests { assert_eq!( *query(&sheet, vec![("f1", true, 0)], "abc"), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query(&sheet, vec![("f1", true, 0)], "Abc"), - props(&[("color", 
"green")]) + props(&[("color", string("green"))]) ); assert_eq!( *query(&sheet, vec![("f1", true, 0)], "AB_CD"), - props(&[("color", "blue")]) + props(&[("color", string("blue"))]) ); assert_eq!(*query(&sheet, vec![("f2", true, 0)], "Abc"), props(&[])); assert_eq!( *query(&sheet, vec![("f2", true, 0)], "ABC"), - props(&[("color", "purple")]) + props(&[("color", string("purple"))]) ); } @@ -837,19 +838,19 @@ mod tests { ); assert_eq!( *query(&sheet, vec![("f1", true, 0), ("f2", true, 1)], "x"), - props(&[("color", "red")]) + props(&[("color", string("red"))]) ); assert_eq!( *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "x"), - props(&[("color", "green")]) + props(&[("color", string("green"))]) ); assert_eq!( *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "a"), - props(&[("color", "blue")]) + props(&[("color", string("blue"))]) ); assert_eq!( *query(&sheet, vec![("f1", true, 1), ("f2", true, 1)], "ab"), - props(&[("color", "violet")]) + props(&[("color", string("violet"))]) ); } @@ -883,7 +884,7 @@ mod tests { object(&[("name", string("g")), ("args", array(vec![string("h"),]))]), string("i"), string("j"), - string("10"), + num(10), ]) ), ]) @@ -983,9 +984,13 @@ mod tests { PropertyValue::String(s.to_string()) } - fn props<'a>(s: &'a [(&'a str, &'a str)]) -> PropertySet { + fn num(n: isize) -> PropertyValue { + PropertyValue::Number(n) + } + + fn props<'a>(s: &'a [(&'a str, PropertyValue)]) -> PropertySet { s.into_iter() - .map(|(a, b)| (a.to_string(), PropertyValue::String(b.to_string()))) + .map(|(a, b)| (a.to_string(), b.clone())) .collect() } } From 9185f6c1684a9b7df6074a89ca78386f6f314aa2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Feb 2019 10:35:03 -0800 Subject: [PATCH 02/27] Clarify `parse` command error message when failing to read source file --- cli/src/parse.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cli/src/parse.rs b/cli/src/parse.rs index bd134457..2e8b3e4c 100644 --- a/cli/src/parse.rs +++ 
b/cli/src/parse.rs @@ -1,4 +1,4 @@ -use super::error::Result; +use super::error::{Error, Result}; use super::util; use std::fs; use std::io::{self, Write}; @@ -18,7 +18,8 @@ pub fn parse_file_at_path( let mut _log_session = None; let mut parser = Parser::new(); parser.set_language(language)?; - let source_code = fs::read(path)?; + let source_code = fs::read(path) + .map_err(|e| Error(format!("Error reading source file {:?}: {}", path, e)))?; if debug_graph { _log_session = Some(util::log_graphs(&mut parser, "log.html")?); From 0dd15e2b02da2aa83bdcad8adf24db8c2b6bc70e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Feb 2019 10:35:49 -0800 Subject: [PATCH 03/27] Ensure deterministic order of values in property sheet JSON files --- cli/src/properties.rs | 124 +++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 67 deletions(-) diff --git a/cli/src/properties.rs b/cli/src/properties.rs index 0ec4330f..996c77f5 100644 --- a/cli/src/properties.rs +++ b/cli/src/properties.rs @@ -3,9 +3,8 @@ use log::info; use rsass; use rsass::sass::Value; use serde_derive::Serialize; -use std::cmp::Ordering; use std::collections::hash_map::Entry; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{btree_map, BTreeMap, HashMap, VecDeque}; use std::fmt::{self, Write}; use std::fs::{self, File}; use std::io::BufWriter; @@ -21,7 +20,7 @@ enum PropertyValue { Array(Vec), } -type PropertySet = HashMap; +type PropertySet = BTreeMap; type PropertySheetJSON = tree_sitter::PropertySheetJSON; type StateId = usize; type PropertySetId = usize; @@ -160,7 +159,7 @@ impl Builder { } fn populate_state(&mut self, item_set: ItemSet, state_id: StateId) { - let mut transition_map: HashSet<(PropertyTransitionJSON, u32)> = HashSet::new(); + let mut transitions: HashMap = HashMap::new(); let mut selector_matches = Vec::new(); // First, compute all of the possible state transition predicates for @@ -173,17 +172,20 @@ impl Builder { // If this 
item has more elements remaining in its selector, then // add a state transition based on the next step. if let Some(step) = next_step { - transition_map.insert(( - PropertyTransitionJSON { + transitions + .entry(PropertyTransitionJSON { kind: step.kind.clone(), named: step.is_named, index: step.child_index, text: step.text_pattern.clone(), state_id: 0, - }, - // Include the rule id so that it can be used when sorting transitions. - item.rule_id, - )); + }) + .and_modify(|rule_id| { + if item.rule_id > *rule_id { + *rule_id = item.rule_id; + } + }) + .or_insert(item.rule_id); } // If the item has matched its entire selector, then the item's // properties are applicable to this state. @@ -195,73 +197,61 @@ impl Builder { } } - // For eacy possible state transition, compute the set of items in that transition's - // destination state. - let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = transition_map - .into_iter() - .map(|(mut transition, rule_id)| { - let mut next_item_set = ItemSet::new(); - for item in &item_set { - let rule = &self.rules[item.rule_id as usize]; - let selector = &rule.selectors[item.selector_id as usize]; - let next_step = selector.0.get(item.step_id as usize); - - if let Some(step) = next_step { - // If the next step of the item's selector satisfies this transition, - // advance the item to the next part of its selector and add the - // resulting item to this transition's destination state. - if step_matches_transition(step, &transition) { - next_item_set.insert(Item { - rule_id: item.rule_id, - selector_id: item.selector_id, - step_id: item.step_id + 1, - }); - } - - // If the next step of the item is not an immediate child, then - // include this item in this transition's destination state, because - // the next step of the item might match a descendant node. 
- if !step.is_immediate { - next_item_set.insert(*item); - } - } - } - - transition.state_id = self.add_state(next_item_set); - (transition, rule_id) - }) - .collect(); - // Ensure that for a given node type, more specific transitions are tried // first, and in the event of a tie, transitions corresponding to later rules // in the cascade are tried first. + let mut transition_list: Vec<(PropertyTransitionJSON, u32)> = + transitions.into_iter().collect(); transition_list.sort_by(|a, b| { - let result = a.0.kind.cmp(&b.0.kind); - if result != Ordering::Equal { - return result; - } - let result = a.0.named.cmp(&b.0.named); - if result != Ordering::Equal { - return result; - } - let result = transition_specificity(&b.0).cmp(&transition_specificity(&a.0)); - if result != Ordering::Equal { - return result; - } - b.1.cmp(&a.1) + a.0.kind + .cmp(&b.0.kind) + .then_with(|| a.0.named.cmp(&b.0.named)) + .then_with(|| transition_specificity(&b.0).cmp(&transition_specificity(&a.0))) + .then_with(|| b.1.cmp(&a.1)) }); + // For eacy possible state transition, compute the set of items in that transition's + // destination state. + for (transition, _) in transition_list.iter_mut() { + let mut next_item_set = ItemSet::new(); + for item in &item_set { + let rule = &self.rules[item.rule_id as usize]; + let selector = &rule.selectors[item.selector_id as usize]; + let next_step = selector.0.get(item.step_id as usize); + + if let Some(step) = next_step { + // If the next step of the item's selector satisfies this transition, + // advance the item to the next part of its selector and add the + // resulting item to this transition's destination state. 
+ if step_matches_transition(step, &transition) { + next_item_set.insert(Item { + rule_id: item.rule_id, + selector_id: item.selector_id, + step_id: item.step_id + 1, + }); + } + + // If the next step of the item is not an immediate child, then + // include this item in this transition's destination state, because + // the next step of the item might match a descendant node. + if !step.is_immediate { + next_item_set.insert(*item); + } + } + } + + transition.state_id = self.add_state(next_item_set); + } + // Compute the merged properties that apply in the current state. // Sort the matching property sets by ascending specificity and by // their order in the sheet. This way, more specific selectors and later // rules will override less specific selectors and earlier rules. let mut properties = PropertySet::new(); selector_matches.sort_unstable_by(|a, b| { - let result = a.specificity.cmp(&b.specificity); - if result != Ordering::Equal { - return result; - } - a.rule_id.cmp(&b.rule_id) + a.specificity + .cmp(&b.specificity) + .then_with(|| a.rule_id.cmp(&b.rule_id)) }); selector_matches.dedup(); for selector_match in selector_matches { @@ -505,10 +495,10 @@ fn parse_sass_items( rsass::Item::Property(name, value) => { let value = parse_sass_value(&value)?; match properties.entry(name.to_string()) { - Entry::Vacant(v) => { + btree_map::Entry::Vacant(v) => { v.insert(value); } - Entry::Occupied(mut o) => { + btree_map::Entry::Occupied(mut o) => { let existing_value = o.get_mut(); if let PropertyValue::Array(items) = existing_value { items.push(value); From e89b6b2402acb0d2ebbbd87bc6f3281dc20a3bd3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Feb 2019 11:24:50 -0800 Subject: [PATCH 04/27] Add a `highlight` subcommand --- Cargo.lock | 78 +++ cli/Cargo.toml | 5 + cli/src/error.rs | 11 + cli/src/highlight.rs | 272 ++++++++++ cli/src/lib.rs | 1 + cli/src/loader.rs | 115 ++++- cli/src/main.rs | 48 +- cli/src/tests/helpers/fixtures.rs | 13 +- 
cli/src/tests/highlight_test.rs | 191 +++++++ cli/src/tests/mod.rs | 1 + cli/vendor/xterm-colors.json | 258 ++++++++++ highlight/Cargo.toml | 23 + highlight/src/escape.rs | 53 ++ highlight/src/lib.rs | 823 ++++++++++++++++++++++++++++++ 14 files changed, 1870 insertions(+), 22 deletions(-) create mode 100644 cli/src/highlight.rs create mode 100644 cli/src/tests/highlight_test.rs create mode 100644 cli/vendor/xterm-colors.json create mode 100644 highlight/Cargo.toml create mode 100644 highlight/src/escape.rs create mode 100644 highlight/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 9ca3a70a..50058336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -210,6 +210,14 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "lock_api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "log" version = "0.4.6" @@ -263,6 +271,35 @@ name = "num-traits" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "once_cell" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "parking_lot 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 
(registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "proc-macro2" version = "0.4.24" @@ -502,6 +539,14 @@ name = "smallbitvec" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "smallvec" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "spin" version = "0.5.0" @@ -583,6 +628,7 @@ dependencies = [ "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "once_cell 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", @@ -593,6 +639,18 @@ dependencies = [ "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.8", + "tree-sitter-highlight 0.1.0", +] + +[[package]] +name = "tree-sitter-highlight" +version = "0.1.0" +dependencies = [ + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.3.8", ] [[package]] @@ -610,6 +668,14 @@ name = "unicode-xid" version = "0.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "utf8-ranges" version = "1.0.2" @@ -625,6 +691,11 @@ name = "version_check" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.6" @@ -673,6 +744,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" "checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" "checksum libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3ad660d7cb8c5822cd83d10897b0f1f1526792737a179e73896152f85b88c2" +"checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" "checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" @@ -680,6 +752,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" "checksum 
num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" +"checksum once_cell 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "532c29a261168a45ce28948f9537ddd7a5dd272cc513b3017b1e82a88f962c37" +"checksum parking_lot 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ab41b4aed082705d1056416ae4468b6ea99d52599ecf3169b00088d43113e337" +"checksum parking_lot_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "94c8c7923936b28d546dfd14d4472eaf34c99b14e1c973a32b3e6d4eb04298c9" "checksum proc-macro2 0.4.24 (registry+https://github.com/rust-lang/crates.io-index)" = "77619697826f31a02ae974457af0b29b723e5619e113e9397b8b82c6bd253f09" "checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c" "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" @@ -709,6 +784,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)" = "225de307c6302bec3898c51ca302fc94a7a1697ef0845fcee6448f33c032249c" "checksum serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)" = "c37ccd6be3ed1fdf419ee848f7c758eb31b054d7cd3ae3600e3bae0adf569811" "checksum smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1764fe2b30ee783bfe3b9b37b2649d8d590b3148bb12e0079715d4d5c673562e" +"checksum smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)" = "88aea073965ab29f6edb5493faf96ad662fb18aa9eeb186a3b7057951605ed15" "checksum spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"44363f6f51401c34e7be73db0db371c04705d35efbe9f7d6082e03a921a32c55" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" @@ -719,9 +795,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 54d0eb5e..242ed72b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -27,6 +27,7 @@ clap = "2.32" dirs = "1.0.2" hashbrown = "0.1" libloading = "0.5" +once_cell = "0.1.8" serde = "1.0" serde_derive = "1.0" regex-syntax = "0.6.4" @@ -37,6 +38,10 @@ rsass = "0.9" version = ">= 0.3.7" path = "../lib" +[dependencies.tree-sitter-highlight] +version = ">= 0.1.0" +path = "../highlight" + [dependencies.serde_json] version = "1.0" features = ["preserve_order"] diff --git a/cli/src/error.rs b/cli/src/error.rs index 4769b481..b0e52797 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -1,4 +1,5 @@ use std::io; +use tree_sitter_highlight::PropertySheetError; #[derive(Debug)] pub struct Error(pub String); @@ -42,3 +43,13 @@ impl From for Error { Error(error) } } + +impl From for Error { + fn from(error: PropertySheetError) -> Self { + match error { + PropertySheetError::InvalidFormat(e) => Self::from(e), + PropertySheetError::InvalidRegex(e) => Self::regex(&e.to_string()), + PropertySheetError::InvalidJSON(e) => Self::from(e), + } + } +} diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs new file mode 100644 index 00000000..1651b98d --- /dev/null +++ b/cli/src/highlight.rs @@ -0,0 +1,272 @@ +use crate::error::Result; +use crate::loader::Loader; +use ansi_term::{Color, Style}; +use lazy_static::lazy_static; +use serde_json::Value; +use std::collections::HashMap; +use std::{fmt, fs, io, mem, path}; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; + +lazy_static! 
{ + static ref CSS_STYLES_BY_COLOR_ID: Vec = + serde_json::from_str(include_str!("../vendor/xterm-colors.json")).unwrap(); +} + +pub struct Theme { + ansi_styles: Vec>, + css_styles: Vec>, +} + +impl Theme { + pub fn load(path: &path::Path) -> io::Result { + let json = fs::read_to_string(path)?; + Ok(Self::new(&json)) + } + + pub fn new(json: &str) -> Self { + let mut ansi_styles = vec![None; 30]; + let mut css_styles = vec![None; 30]; + if let Ok(colors) = serde_json::from_str::>(json) { + for (scope, style_value) in colors { + let mut style = Style::default(); + parse_style(&mut style, style_value); + ansi_styles[scope as usize] = Some(style); + css_styles[scope as usize] = Some(style_to_css(style)); + } + } + Self { + ansi_styles, + css_styles, + } + } + + fn ansi_style(&self, scope: Scope) -> Option<&Style> { + self.ansi_styles[scope as usize].as_ref() + } + + fn css_style(&self, scope: Scope) -> Option<&str> { + self.css_styles[scope as usize].as_ref().map(|s| s.as_str()) + } +} + +impl Default for Theme { + fn default() -> Self { + Theme::new( + r#" + { + "attribute": {"color": 124, "italic": true}, + "comment": {"color": 245, "italic": true}, + "constant.builtin": {"color": 94, "bold": true}, + "constant": 94, + "constructor": 136, + "embedded": null, + "function.builtin": {"color": 26, "bold": true}, + "function": 26, + "keyword": 56, + "number": {"color": 94, "bold": true}, + "property": 124, + "operator": {"color": 239, "bold": true}, + "punctuation.bracket": 239, + "punctuation.delimiter": 239, + "string.special": 30, + "string": 28, + "tag": {"color": 18}, + "variable.builtin": {"bold": true} + } + "#, + ) + } +} + +impl fmt::Debug for Theme { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + let mut first = true; + for (i, style) in self.ansi_styles.iter().enumerate() { + if let Some(style) = style { + let scope = Scope::from_usize(i).unwrap(); + if !first { + write!(f, ", ")?; + } + write!(f, "{:?}: {:?}", scope, style)?; 
+ first = false; + } + } + write!(f, "}}")?; + Ok(()) + } +} + +fn parse_style(style: &mut Style, json: Value) { + if let Value::Object(entries) = json { + for (property_name, value) in entries { + match property_name.as_str() { + "italic" => *style = style.italic(), + "bold" => *style = style.bold(), + "dimmed" => *style = style.dimmed(), + "underline" => *style = style.underline(), + "color" => { + if let Some(color) = parse_color(value) { + *style = style.fg(color); + } + } + _ => {} + } + } + } else if let Some(color) = parse_color(json) { + *style = style.fg(color); + } +} + +fn parse_color(json: Value) -> Option { + match json { + Value::Number(n) => match n.as_u64() { + Some(n) => Some(Color::Fixed(n as u8)), + _ => None, + }, + Value::String(s) => match s.to_lowercase().as_str() { + "blue" => Some(Color::Blue), + "cyan" => Some(Color::Cyan), + "green" => Some(Color::Green), + "purple" => Some(Color::Purple), + "red" => Some(Color::Red), + "white" => Some(Color::White), + "yellow" => Some(Color::Yellow), + s => { + if s.starts_with("#") && s.len() >= 7 { + if let (Ok(red), Ok(green), Ok(blue)) = ( + u8::from_str_radix(&s[1..3], 16), + u8::from_str_radix(&s[3..5], 16), + u8::from_str_radix(&s[5..7], 16), + ) { + Some(Color::RGB(red, green, blue)) + } else { + None + } + } else { + None + } + } + }, + _ => None, + } +} + +fn style_to_css(style: Style) -> String { + use std::fmt::Write; + let mut result = "style='".to_string(); + if style.is_bold { + write!(&mut result, "font-weight: bold;").unwrap(); + } + if style.is_italic { + write!(&mut result, "font-style: italic;").unwrap(); + } + if let Some(color) = style.foreground { + write!(&mut result, "color: {};", color_to_css(color)).unwrap(); + } + result.push('\''); + result +} + +fn color_to_css(color: Color) -> &'static str { + match color { + Color::Black => "black", + Color::Blue => "blue", + Color::Red => "red", + Color::Green => "green", + Color::Yellow => "yellow", + Color::Cyan => "cyan", + 
Color::Purple => "purple", + Color::White => "white", + Color::Fixed(n) => CSS_STYLES_BY_COLOR_ID[n as usize].as_str(), + _ => panic!("Unsupported color type"), + } +} + +pub fn ansi( + loader: &Loader, + theme: &Theme, + source: &[u8], + language: Language, + property_sheet: &PropertySheet, +) -> Result<()> { + use std::io::Write; + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + let mut scope_stack = Vec::new(); + for event in highlight(loader, source, language, property_sheet)? { + match event { + HighlightEvent::Source(s) => { + if let Some(style) = scope_stack.last().and_then(|s| theme.ansi_style(*s)) { + write!(&mut stdout, "{}", style.paint(s))?; + } else { + write!(&mut stdout, "{}", s)?; + } + } + HighlightEvent::ScopeStart(s) => { + scope_stack.push(s); + } + HighlightEvent::ScopeEnd(_) => { + scope_stack.pop(); + } + } + } + Ok(()) +} + +pub const HTML_HEADER: &'static str = " + + + Tree-sitter Highlighting + + + +"; + +pub const HTML_FOOTER: &'static str = " + +"; + +pub fn html( + loader: &Loader, + theme: &Theme, + source: &[u8], + language: Language, + property_sheet: &PropertySheet, +) -> Result<()> { + use std::io::Write; + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + write!(&mut stdout, "\n")?; + let lines = highlight_html(loader, source, language, property_sheet, |scope| { + if let Some(css_style) = theme.css_style(scope) { + css_style + } else { + "" + } + })?; + for (i, line) in lines.into_iter().enumerate() { + write!( + &mut stdout, + "\n", + i + 1, + line + )?; + } + write!(&mut stdout, "
{}{}
\n")?; + Ok(()) +} diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 3a15b457..0ece9cac 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -1,5 +1,6 @@ pub mod error; pub mod generate; +pub mod highlight; pub mod loader; pub mod logger; pub mod parse; diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 5c2a19a7..d19acf46 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -1,5 +1,6 @@ use super::error::{Error, Result}; use libloading::{Library, Symbol}; +use once_cell::unsync::OnceCell; use regex::{Regex, RegexBuilder}; use serde_derive::Deserialize; use std::collections::HashMap; @@ -9,6 +10,7 @@ use std::process::Command; use std::time::SystemTime; use std::{fs, mem}; use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{load_property_sheet, LanguageRegistry, Properties}; #[cfg(unix)] const DYLIB_EXTENSION: &'static str = "so"; @@ -20,16 +22,18 @@ const BUILD_TARGET: &'static str = env!("BUILD_TARGET"); struct LanguageRepo { path: PathBuf, - language: Option, + language: OnceCell, configurations: Vec, } pub struct LanguageConfiguration { - _name: String, + pub name: String, _content_regex: Option, _first_line_regex: Option, + injection_regex: Option, file_types: Vec, - _highlight_property_sheet: Option>, + highlight_property_sheet_path: Option, + highlight_property_sheet: OnceCell>>, } pub struct Loader { @@ -76,7 +80,7 @@ impl Loader { } pub fn language_configuration_for_file_name( - &mut self, + &self, path: &Path, ) -> Result> { let ids = path @@ -100,20 +104,43 @@ impl Loader { Ok(None) } + pub fn language_configuration_for_injection_string( + &self, + string: &str, + ) -> Result> { + let mut best_match_length = 0; + let mut best_match_position = None; + for (i, repo) in self.language_repos.iter().enumerate() { + for (j, configuration) in repo.configurations.iter().enumerate() { + if let Some(injection_regex) = &configuration.injection_regex { + if let Some(mat) = injection_regex.find(string) { + let length = mat.end() - 
mat.start(); + if length > best_match_length { + best_match_position = Some((i, j)); + best_match_length = length; + } + } + } + } + } + if let Some((i, j)) = best_match_position { + let (language, configurations) = self.language_configuration_for_id(i)?; + Ok(Some((language, &configurations[j]))) + } else { + Ok(None) + } + } + fn language_configuration_for_id( - &mut self, + &self, id: usize, ) -> Result<(Language, &Vec)> { let repo = &self.language_repos[id]; - let language = if let Some(language) = repo.language { - language - } else { + let language = repo.language.get_or_try_init(|| { let src_path = repo.path.join("src"); - let language = self.load_language_at_path(&src_path, &src_path)?; - self.language_repos[id].language = Some(language); - language - }; - Ok((language, &self.language_repos[id].configurations)) + self.load_language_at_path(&src_path, &src_path) + })?; + Ok((*language, &self.language_repos[id].configurations)) } pub fn load_language_at_path(&self, src_path: &Path, header_path: &Path) -> Result { @@ -238,6 +265,8 @@ impl Loader { content_regex: Option, #[serde(rename = "first-line-regex")] first_line_regex: Option, + #[serde(rename = "injection-regex")] + injection_regex: Option, highlights: Option, } @@ -255,7 +284,7 @@ impl Loader { configurations .into_iter() .map(|conf| LanguageConfiguration { - _name: conf.name, + name: conf.name, file_types: conf.file_types.unwrap_or(Vec::new()), _content_regex: conf .content_regex @@ -263,7 +292,11 @@ impl Loader { _first_line_regex: conf .first_line_regex .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), - _highlight_property_sheet: conf.highlights.map(|d| Err(d.into())), + injection_regex: conf + .injection_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + highlight_property_sheet_path: conf.highlights.map(|h| parser_path.join(h)), + highlight_property_sheet: OnceCell::new(), }) .collect() }); @@ -279,7 +312,7 @@ impl Loader { 
self.language_repos.push(LanguageRepo { path: parser_path.to_owned(), - language: None, + language: OnceCell::new(), configurations, }); @@ -287,6 +320,56 @@ impl Loader { } } +impl LanguageRegistry for Loader { + fn language_for_injection_string<'a>( + &'a self, + string: &str, + ) -> Option<(Language, &'a PropertySheet)> { + match self.language_configuration_for_injection_string(string) { + Err(message) => { + eprintln!( + "Failed to load language for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some((language, configuration))) => { + match configuration.highlight_property_sheet(language) { + Err(message) => { + eprintln!( + "Failed to load property sheet for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some(sheet)) => Some((language, sheet)), + } + } + } + } +} + +impl LanguageConfiguration { + pub fn highlight_property_sheet( + &self, + language: Language, + ) -> Result>> { + self.highlight_property_sheet + .get_or_try_init(|| { + if let Some(path) = &self.highlight_property_sheet_path { + let sheet_json = fs::read_to_string(path)?; + let sheet = load_property_sheet(language, &sheet_json)?; + Ok(Some(sheet)) + } else { + Ok(None) + } + }) + .map(Option::as_ref) + } +} + fn needs_recompile( lib_path: &Path, parser_c_path: &Path, diff --git a/cli/src/main.rs b/cli/src/main.rs index eb848831..9cd4e131 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -4,8 +4,7 @@ use std::fs; use std::path::Path; use std::process::exit; use std::usize; -use tree_sitter_cli::loader::Loader; -use tree_sitter_cli::{error, generate, logger, parse, properties, test}; +use tree_sitter_cli::{error, generate, highlight, loader, logger, parse, properties, test}; fn main() { if let Err(e) = run() { @@ -64,14 +63,30 @@ fn run() -> error::Result<()> { .arg(Arg::with_name("debug").long("debug").short("d")) .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), ) + .subcommand( + 
SubCommand::with_name("highlight") + .about("Highlight a file") + .arg( + Arg::with_name("path") + .index(1) + .multiple(true) + .required(true), + ) + .arg(Arg::with_name("html").long("html").short("h")), + ) .get_matches(); let home_dir = dirs::home_dir().unwrap(); let current_dir = env::current_dir().unwrap(); let config_dir = home_dir.join(".tree-sitter"); + let theme_path = config_dir.join("theme.json"); + let parsers_dir = config_dir.join("parsers"); - fs::create_dir_all(&config_dir).unwrap(); - let mut loader = Loader::new(config_dir); + // TODO - make configurable + let parser_repo_paths = vec![home_dir.join("github")]; + + fs::create_dir_all(&parsers_dir).unwrap(); + let mut loader = loader::Loader::new(config_dir); if let Some(matches) = matches.subcommand_matches("generate") { if matches.is_present("log") { @@ -111,7 +126,7 @@ fn run() -> error::Result<()> { let debug_graph = matches.is_present("debug-graph"); let quiet = matches.is_present("quiet"); let time = matches.is_present("time"); - loader.find_all_languages(&vec![home_dir.join("github")])?; + loader.find_all_languages(&parser_repo_paths)?; let paths = matches .values_of("path") .unwrap() @@ -144,6 +159,29 @@ fn run() -> error::Result<()> { if has_error { return Err(error::Error(String::new())); } + } else if let Some(matches) = matches.subcommand_matches("highlight") { + loader.find_all_languages(&parser_repo_paths)?; + let theme = highlight::Theme::load(&theme_path).unwrap_or_default(); + let paths = matches.values_of("path").unwrap().into_iter(); + let html_mode = matches.is_present("html"); + + if html_mode { + println!("{}", highlight::HTML_HEADER); + } + + for path in paths { + let path = Path::new(path); + if let Some((language, config)) = loader.language_configuration_for_file_name(path)? { + if let Some(sheet) = config.highlight_property_sheet(language)? 
{ + let source = fs::read(path)?; + if html_mode { + highlight::html(&loader, &theme, &source, language, sheet)?; + } else { + highlight::ansi(&loader, &theme, &source, language, sheet)?; + } + } + } + } } Ok(()) diff --git a/cli/src/tests/helpers/fixtures.rs b/cli/src/tests/helpers/fixtures.rs index 8fc00038..e7ba2e55 100644 --- a/cli/src/tests/helpers/fixtures.rs +++ b/cli/src/tests/helpers/fixtures.rs @@ -2,7 +2,8 @@ use crate::loader::Loader; use lazy_static::lazy_static; use std::fs; use std::path::{Path, PathBuf}; -use tree_sitter::Language; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{load_property_sheet, Properties}; include!("./dirs.rs"); @@ -20,6 +21,16 @@ pub fn get_language(name: &str) -> Language { .unwrap() } +pub fn get_property_sheet(language_name: &str, sheet_name: &str) -> PropertySheet { + let path = GRAMMARS_DIR + .join(language_name) + .join("src") + .join(sheet_name); + let json = fs::read_to_string(path).unwrap(); + let language = get_language(language_name); + load_property_sheet(language, &json).unwrap() +} + pub fn get_test_language(name: &str, parser_code: &str, path: Option<&Path>) -> Language { let parser_c_path = SCRATCH_DIR.join(&format!("{}-parser.c", name)); if !fs::read_to_string(&parser_c_path) diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs new file mode 100644 index 00000000..ea14a1c2 --- /dev/null +++ b/cli/src/tests/highlight_test.rs @@ -0,0 +1,191 @@ +use super::helpers::fixtures::{get_language, get_property_sheet}; +use lazy_static::lazy_static; +use tree_sitter::{Language, PropertySheet}; +use tree_sitter_highlight::{ + highlight, highlight_html, HighlightEvent, LanguageRegistry, Properties, Scope, +}; + +lazy_static! 
{ + static ref JS_SHEET: PropertySheet = + get_property_sheet("javascript", "highlights.json"); + static ref HTML_SHEET: PropertySheet = + get_property_sheet("html", "highlights.json"); + static ref SCOPE_CLASS_STRINGS: Vec = { + let mut result = Vec::new(); + let mut i = 0; + while let Some(scope) = Scope::from_usize(i) { + result.push(format!("class={:?}", scope)); + i += 1; + } + result + }; +} + +#[test] +fn test_highlighting_injected_html_in_javascript() { + let source = vec!["const s = html `
${a < b}
`;"].join("\n"); + + assert_eq!( + &to_token_vector(&source, get_language("javascript"), &JS_SHEET).unwrap(), + &[vec![ + ("const", vec![Scope::Keyword]), + (" ", vec![]), + ("s", vec![Scope::Variable]), + (" ", vec![]), + ("=", vec![Scope::Operator]), + (" ", vec![]), + ("html", vec![Scope::Function]), + (" ", vec![]), + ("`<", vec![Scope::String]), + ("div", vec![Scope::String, Scope::Tag]), + (">", vec![Scope::String]), + ( + "${", + vec![Scope::String, Scope::Embedded, Scope::PunctuationSpecial] + ), + ("a", vec![Scope::String, Scope::Embedded, Scope::Variable]), + (" ", vec![Scope::String, Scope::Embedded]), + ("<", vec![Scope::String, Scope::Embedded, Scope::Operator]), + (" ", vec![Scope::String, Scope::Embedded]), + ("b", vec![Scope::String, Scope::Embedded, Scope::Variable]), + ( + "}", + vec![Scope::String, Scope::Embedded, Scope::PunctuationSpecial] + ), + ("`", vec![Scope::String]), + (";", vec![Scope::PunctuationDelimiter]), + ]] + ); +} + +#[test] +fn test_highlighting_injected_javascript_in_html() { + let source = vec![ + "", + " ", + "", + ] + .join("\n"); + + assert_eq!( + &to_token_vector(&source, get_language("html"), &HTML_SHEET).unwrap(), + &[ + vec![("<", vec![]), ("body", vec![Scope::Tag]), (">", vec![]),], + vec![(" <", vec![]), ("script", vec![Scope::Tag]), (">", vec![]),], + vec![ + (" ", vec![]), + ("const", vec![Scope::Keyword]), + (" ", vec![]), + ("x", vec![Scope::Variable]), + (" ", vec![]), + ("=", vec![Scope::Operator]), + (" ", vec![]), + ("new", vec![Scope::Keyword]), + (" ", vec![]), + ("Thing", vec![Scope::Constructor]), + ("(", vec![Scope::PunctuationBracket]), + (")", vec![Scope::PunctuationBracket]), + (";", vec![Scope::PunctuationDelimiter]), + ], + vec![ + (" ", vec![]), + ], + vec![("", vec![]),], + ] + ); +} + +#[test] +fn test_highlighting_multiline_scopes_to_html() { + let source = vec![ + "const SOMETHING = `", + " one ${", + " two()", + " } three", + "`", + ] + .join("\n"); + + assert_eq!( + &to_html(&source, 
get_language("javascript"), &JS_SHEET,).unwrap(), + &[ + "const SOMETHING = `\n".to_string(), + " one ${\n".to_string(), + " two()\n".to_string(), + " } three\n".to_string(), + "`\n".to_string(), + ] + ); +} + +struct TestLanguageRegistry; + +impl LanguageRegistry for TestLanguageRegistry { + fn language_for_injection_string( + &self, + string: &str, + ) -> Option<(Language, &PropertySheet)> { + match string { + "javascript" => Some((get_language("javascript"), &JS_SHEET)), + "html" => Some((get_language("html"), &HTML_SHEET)), + _ => None, + } + } +} + +fn to_html<'a>( + src: &'a str, + language: Language, + property_sheet: &'a PropertySheet, +) -> Result, String> { + highlight_html( + &TestLanguageRegistry, + src.as_bytes(), + language, + property_sheet, + |scope| SCOPE_CLASS_STRINGS[scope as usize].as_str(), + ) +} + +fn to_token_vector<'a>( + src: &'a str, + language: Language, + property_sheet: &'a PropertySheet, +) -> Result)>>, String> { + let mut lines = Vec::new(); + let mut scopes = Vec::new(); + let mut line = Vec::new(); + for event in highlight( + &TestLanguageRegistry, + src.as_bytes(), + language, + property_sheet, + )? 
{ + match event { + HighlightEvent::ScopeStart(s) => scopes.push(s), + HighlightEvent::ScopeEnd(s) => { + assert_eq!(*scopes.last().unwrap(), s); + scopes.pop(); + } + HighlightEvent::Source(s) => { + for (i, l) in s.lines().enumerate() { + if i > 0 { + lines.push(line); + line = Vec::new(); + } + if l.len() > 0 { + line.push((l, scopes.clone())); + } + } + } + } + } + lines.push(line); + Ok(lines) +} diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index af2b4582..143e8297 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -1,5 +1,6 @@ mod corpus_test; mod helpers; +mod highlight_test; mod node_test; mod parser_test; mod properties_test; diff --git a/cli/vendor/xterm-colors.json b/cli/vendor/xterm-colors.json new file mode 100644 index 00000000..47994496 --- /dev/null +++ b/cli/vendor/xterm-colors.json @@ -0,0 +1,258 @@ +[ + "#000000", + "#800000", + "#008000", + "#808000", + "#000080", + "#800080", + "#008080", + "#c0c0c0", + "#808080", + "#ff0000", + "#00ff00", + "#ffff00", + "#0000ff", + "#ff00ff", + "#00ffff", + "#ffffff", + "#000000", + "#00005f", + "#000087", + "#0000af", + "#0000d7", + "#0000ff", + "#005f00", + "#005f5f", + "#005f87", + "#005faf", + "#005fd7", + "#005fff", + "#008700", + "#00875f", + "#008787", + "#0087af", + "#0087d7", + "#0087ff", + "#00af00", + "#00af5f", + "#00af87", + "#00afaf", + "#00afd7", + "#00afff", + "#00d700", + "#00d75f", + "#00d787", + "#00d7af", + "#00d7d7", + "#00d7ff", + "#00ff00", + "#00ff5f", + "#00ff87", + "#00ffaf", + "#00ffd7", + "#00ffff", + "#5f0000", + "#5f005f", + "#5f0087", + "#5f00af", + "#5f00d7", + "#5f00ff", + "#5f5f00", + "#5f5f5f", + "#5f5f87", + "#5f5faf", + "#5f5fd7", + "#5f5fff", + "#5f8700", + "#5f875f", + "#5f8787", + "#5f87af", + "#5f87d7", + "#5f87ff", + "#5faf00", + "#5faf5f", + "#5faf87", + "#5fafaf", + "#5fafd7", + "#5fafff", + "#5fd700", + "#5fd75f", + "#5fd787", + "#5fd7af", + "#5fd7d7", + "#5fd7ff", + "#5fff00", + "#5fff5f", + "#5fff87", + "#5fffaf", + "#5fffd7", + 
"#5fffff", + "#870000", + "#87005f", + "#870087", + "#8700af", + "#8700d7", + "#8700ff", + "#875f00", + "#875f5f", + "#875f87", + "#875faf", + "#875fd7", + "#875fff", + "#878700", + "#87875f", + "#878787", + "#8787af", + "#8787d7", + "#8787ff", + "#87af00", + "#87af5f", + "#87af87", + "#87afaf", + "#87afd7", + "#87afff", + "#87d700", + "#87d75f", + "#87d787", + "#87d7af", + "#87d7d7", + "#87d7ff", + "#87ff00", + "#87ff5f", + "#87ff87", + "#87ffaf", + "#87ffd7", + "#87ffff", + "#af0000", + "#af005f", + "#af0087", + "#af00af", + "#af00d7", + "#af00ff", + "#af5f00", + "#af5f5f", + "#af5f87", + "#af5faf", + "#af5fd7", + "#af5fff", + "#af8700", + "#af875f", + "#af8787", + "#af87af", + "#af87d7", + "#af87ff", + "#afaf00", + "#afaf5f", + "#afaf87", + "#afafaf", + "#afafd7", + "#afafff", + "#afd700", + "#afd75f", + "#afd787", + "#afd7af", + "#afd7d7", + "#afd7ff", + "#afff00", + "#afff5f", + "#afff87", + "#afffaf", + "#afffd7", + "#afffff", + "#d70000", + "#d7005f", + "#d70087", + "#d700af", + "#d700d7", + "#d700ff", + "#d75f00", + "#d75f5f", + "#d75f87", + "#d75faf", + "#d75fd7", + "#d75fff", + "#d78700", + "#d7875f", + "#d78787", + "#d787af", + "#d787d7", + "#d787ff", + "#d7af00", + "#d7af5f", + "#d7af87", + "#d7afaf", + "#d7afd7", + "#d7afff", + "#d7d700", + "#d7d75f", + "#d7d787", + "#d7d7af", + "#d7d7d7", + "#d7d7ff", + "#d7ff00", + "#d7ff5f", + "#d7ff87", + "#d7ffaf", + "#d7ffd7", + "#d7ffff", + "#ff0000", + "#ff005f", + "#ff0087", + "#ff00af", + "#ff00d7", + "#ff00ff", + "#ff5f00", + "#ff5f5f", + "#ff5f87", + "#ff5faf", + "#ff5fd7", + "#ff5fff", + "#ff8700", + "#ff875f", + "#ff8787", + "#ff87af", + "#ff87d7", + "#ff87ff", + "#ffaf00", + "#ffaf5f", + "#ffaf87", + "#ffafaf", + "#ffafd7", + "#ffafff", + "#ffd700", + "#ffd75f", + "#ffd787", + "#ffd7af", + "#ffd7d7", + "#ffd7ff", + "#ffff00", + "#ffff5f", + "#ffff87", + "#ffffaf", + "#ffffd7", + "#ffffff", + "#080808", + "#121212", + "#1c1c1c", + "#262626", + "#303030", + "#3a3a3a", + "#444444", + "#4e4e4e", + "#585858", 
+ "#626262", + "#6c6c6c", + "#767676", + "#808080", + "#8a8a8a", + "#949494", + "#9e9e9e", + "#a8a8a8", + "#b2b2b2", + "#bcbcbc", + "#c6c6c6", + "#d0d0d0", + "#dadada", + "#e4e4e4", + "#eeeeee" +] diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml new file mode 100644 index 00000000..dd33add2 --- /dev/null +++ b/highlight/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "tree-sitter-highlight" +description = "Library for performing syntax highlighting with Tree-sitter" +version = "0.1.0" +authors = [ + "Max Brunsfeld ", + "Tim Clem " +] +license = "MIT" +readme = "README.md" +edition = "2018" +keywords = ["incremental", "parsing", "syntax", "highlighting"] +categories = ["parsing", "text-editors"] + +[dependencies] +regex = "1" +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" + +[dependencies.tree-sitter] +version = ">= 0.3.7" +path = "../lib" diff --git a/highlight/src/escape.rs b/highlight/src/escape.rs new file mode 100644 index 00000000..882f160c --- /dev/null +++ b/highlight/src/escape.rs @@ -0,0 +1,53 @@ +// Copyright 2013 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! HTML Escaping +//! +//! This module contains one unit-struct which can be used to HTML-escape a +//! string of text (for use in a format string). + +use std::fmt; + +/// Wrapper struct which will emit the HTML-escaped version of the contained +/// string when passed to a format string. 
+pub struct Escape<'a>(pub &'a str); + +impl<'a> fmt::Display for Escape<'a> { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + // Because the internet is always right, turns out there's not that many + // characters to escape: http://stackoverflow.com/questions/7381974 + let Escape(s) = *self; + let pile_o_bits = s; + let mut last = 0; + for (i, ch) in s.bytes().enumerate() { + match ch as char { + '<' | '>' | '&' | '\'' | '"' => { + fmt.write_str(&pile_o_bits[last..i])?; + let s = match ch as char { + '>' => ">", + '<' => "<", + '&' => "&", + '\'' => "'", + '"' => """, + _ => unreachable!(), + }; + fmt.write_str(s)?; + last = i + 1; + } + _ => {} + } + } + + if last < s.len() { + fmt.write_str(&pile_o_bits[last..])?; + } + Ok(()) + } +} diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs new file mode 100644 index 00000000..bdf35b9f --- /dev/null +++ b/highlight/src/lib.rs @@ -0,0 +1,823 @@ +mod escape; + +use serde::{Deserialize, Deserializer}; +use serde_derive::*; +use std::cmp; +use std::fmt::Write; +use std::mem::transmute; +use std::str; +use std::usize; +use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor}; + +pub trait LanguageRegistry { + fn language_for_injection_string<'a>( + &'a self, + s: &str, + ) -> Option<(Language, &'a PropertySheet)>; +} + +#[derive(Debug)] +enum TreeStep { + Child { + index: isize, + kinds: Option>, + }, + Children { + kinds: Option>, + }, + Next { + kinds: Option>, + }, +} + +#[derive(Debug)] +enum InjectionLanguage { + Literal(String), + TreePath(Vec), +} + +#[derive(Debug)] +struct Injection { + language: InjectionLanguage, + content: Vec, +} + +#[derive(Debug)] +pub struct Properties { + scope: Option, + injections: Vec, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[repr(u16)] +pub enum Scope { + Attribute, + Comment, + Constant, + ConstantBuiltin, + Constructor, + ConstructorBuiltin, + Embedded, + Escape, + Function, + 
FunctionBuiltin, + Keyword, + Number, + Operator, + Property, + PropertyBuiltin, + Punctuation, + PunctuationBracket, + PunctuationDelimiter, + PunctuationSpecial, + String, + StringSpecial, + Tag, + Type, + TypeBuiltin, + Variable, + VariableBuiltin, + Unknown, +} + +struct Layer<'a> { + _tree: Tree, + cursor: TreePropertyCursor<'a, Properties>, + ranges: Vec, + at_node_end: bool, +} + +struct Highlighter<'a, T: LanguageRegistry> { + language_registry: &'a T, + source: &'a [u8], + source_offset: usize, + parser: Parser, + layers: Vec>, + utf8_error_len: Option, +} + +#[derive(Copy, Clone, Debug)] +pub enum HighlightEvent<'a> { + Source(&'a str), + ScopeStart(Scope), + ScopeEnd(Scope), +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum TreePathArgJSON { + TreePath(TreePathJSON), + Number(isize), + String(String), +} + +#[derive(Debug, Deserialize)] +#[serde(tag = "name")] +enum TreePathJSON { + #[serde(rename = "this")] + This, + #[serde(rename = "child")] + Child { args: Vec }, + #[serde(rename = "next")] + Next { args: Vec }, + #[serde(rename = "children")] + Children { args: Vec }, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum InjectionLanguageJSON { + List(Vec), + TreePath(TreePathJSON), + Literal(String), +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum InjectionContentJSON { + List(Vec), + TreePath(TreePathJSON), +} + +#[derive(Debug, Deserialize)] +struct PropertiesJSON { + scope: Option, + #[serde(rename = "injection-language")] + injection_language: Option, + #[serde(rename = "injection-content")] + injection_content: Option, +} + +#[derive(Debug)] +pub enum PropertySheetError { + InvalidJSON(serde_json::Error), + InvalidRegex(regex::Error), + InvalidFormat(String), +} + +pub fn load_property_sheet( + language: Language, + json: &str, +) -> Result, PropertySheetError> { + let sheet = PropertySheet::new(language, json).map_err(|e| match e { + tree_sitter::PropertySheetError::InvalidJSON(e) => 
PropertySheetError::InvalidJSON(e), + tree_sitter::PropertySheetError::InvalidRegex(e) => PropertySheetError::InvalidRegex(e), + })?; + let sheet = sheet + .map(|p| Properties::new(p, language)) + .map_err(PropertySheetError::InvalidFormat)?; + Ok(sheet) +} + +impl Scope { + pub fn from_usize(i: usize) -> Option { + if i <= (Scope::Unknown as usize) { + Some(unsafe { transmute(i as u16) }) + } else { + None + } + } +} + +impl Properties { + fn new(json: PropertiesJSON, language: Language) -> Result { + let injections = match (json.injection_language, json.injection_content) { + (None, None) => Ok(Vec::new()), + (Some(_), None) => Err( + "Must specify an injection-content along with an injection-language".to_string(), + ), + (None, Some(_)) => Err( + "Must specify an injection-language along with an injection-content".to_string(), + ), + (Some(language_json), Some(content_json)) => { + let languages = match language_json { + InjectionLanguageJSON::List(list) => { + let mut result = Vec::with_capacity(list.len()); + for element in list { + result.push(match element { + InjectionLanguageJSON::TreePath(p) => { + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + InjectionLanguage::TreePath(result) + } + InjectionLanguageJSON::Literal(s) => InjectionLanguage::Literal(s), + InjectionLanguageJSON::List(_) => { + panic!("Injection-language cannot be a list of lists") + } + }) + } + result + } + InjectionLanguageJSON::TreePath(p) => vec![{ + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + InjectionLanguage::TreePath(result) + }], + InjectionLanguageJSON::Literal(s) => vec![InjectionLanguage::Literal(s)], + }; + + let contents = match content_json { + InjectionContentJSON::List(l) => { + let mut result = Vec::with_capacity(l.len()); + for element in l { + result.push(match element { + InjectionContentJSON::TreePath(p) => { + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, 
language)?; + result + } + InjectionContentJSON::List(_) => { + panic!("Injection-content cannot be a list of lists") + } + }) + } + result + } + InjectionContentJSON::TreePath(p) => vec![{ + let mut result = Vec::new(); + Self::flatten_tree_path(p, &mut result, language)?; + result + }], + }; + + if languages.len() == contents.len() { + Ok(languages + .into_iter() + .zip(contents.into_iter()) + .map(|(language, content)| Injection { language, content }) + .collect()) + } else { + Err(format!( + "Mismatch: got {} injection-language values but {} injection-content values", + languages.len(), + contents.len(), + )) + } + } + }?; + + Ok(Self { + scope: json.scope, + injections, + }) + } + + // Transform a tree path from the format expressed directly in the property sheet + // (nested function calls), to a flat sequence of steps for transforming a list of + // nodes. This way, we can evaluate these tree paths with no recursion and a single + // vector of intermediate storage. + fn flatten_tree_path( + p: TreePathJSON, + steps: &mut Vec, + language: Language, + ) -> Result<(), String> { + match p { + TreePathJSON::This => {} + TreePathJSON::Child { args } => { + let (tree_path, index, kinds) = Self::parse_args("child", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Child { + index: index + .ok_or_else(|| "The `child` function requires an index".to_string())?, + kinds: kinds, + }); + } + TreePathJSON::Children { args } => { + let (tree_path, _, kinds) = Self::parse_args("children", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Children { kinds }); + } + TreePathJSON::Next { args } => { + let (tree_path, _, kinds) = Self::parse_args("next", args, language)?; + Self::flatten_tree_path(tree_path, steps, language)?; + steps.push(TreeStep::Next { kinds }); + } + } + Ok(()) + } + + fn parse_args( + name: &str, + args: Vec, + language: Language, + ) -> Result<(TreePathJSON, 
Option, Option>), String> { + let tree_path; + let mut index = None; + let mut kinds = Vec::new(); + let mut iter = args.into_iter(); + + match iter.next() { + Some(TreePathArgJSON::TreePath(p)) => tree_path = p, + _ => { + return Err(format!( + "First argument to `{}()` must be a tree path", + name + )); + } + } + + for arg in iter { + match arg { + TreePathArgJSON::TreePath(_) => { + return Err(format!( + "Other arguments to `{}()` must be strings or numbers", + name + )); + } + TreePathArgJSON::Number(i) => index = Some(i), + TreePathArgJSON::String(s) => kinds.push(s), + } + } + + if kinds.len() > 0 { + let mut kind_ids = Vec::new(); + for i in 0..(language.node_kind_count() as u16) { + if kinds.iter().any(|s| s == language.node_kind_for_id(i)) + && language.node_kind_is_named(i) + { + kind_ids.push(i); + } + } + if kind_ids.len() == 0 { + return Err(format!("Non-existent node kinds: {:?}", kinds)); + } + + Ok((tree_path, index, Some(kind_ids))) + } else { + Ok((tree_path, index, None)) + } + } +} + +impl<'a, T: LanguageRegistry> Highlighter<'a, T> { + fn new( + language_registry: &'a T, + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, + ) -> Result { + let mut parser = Parser::new(); + parser.set_language(language)?; + let tree = parser + .parse(source, None) + .ok_or_else(|| format!("Tree-sitter: failed to parse"))?; + Ok(Self { + language_registry, + source, + source_offset: 0, + parser, + layers: vec![Layer::new( + source, + tree, + property_sheet, + vec![Range { + start_byte: 0, + end_byte: usize::MAX, + start_point: Point::new(0, 0), + end_point: Point::new(usize::MAX, usize::MAX), + }], + )], + utf8_error_len: None, + }) + } + + fn emit_source(&mut self, next_offset: usize) -> Option> { + let input = &self.source[self.source_offset..next_offset]; + match str::from_utf8(input) { + Ok(valid) => { + self.source_offset = next_offset; + Some(HighlightEvent::Source(valid)) + } + Err(error) => { + if let Some(error_len) = 
error.error_len() { + if error.valid_up_to() > 0 { + let prefix = &input[0..error.valid_up_to()]; + self.utf8_error_len = Some(error_len); + Some(HighlightEvent::Source(unsafe { + str::from_utf8_unchecked(prefix) + })) + } else { + self.source_offset += error_len; + Some(HighlightEvent::Source("\u{FFFD}")) + } + } else { + None + } + } + } + } + + fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec) { + let len = nodes.len(); + for i in 0..len { + let node = nodes[i]; + match step { + TreeStep::Child { index, kinds } => { + let index = if *index >= 0 { + *index as usize + } else { + (node.child_count() as isize + *index) as usize + }; + if let Some(child) = node.child(index) { + if let Some(kinds) = kinds { + if kinds.contains(&child.kind_id()) { + nodes.push(child); + } + } else { + nodes.push(child); + } + } + } + TreeStep::Children { kinds } => { + for child in node.children() { + if let Some(kinds) = kinds { + if kinds.contains(&child.kind_id()) { + nodes.push(child); + } + } else { + nodes.push(child); + } + } + } + TreeStep::Next { .. } => unimplemented!(), + } + } + nodes.drain(0..len); + } + + fn nodes_for_tree_path(&self, node: Node<'a>, steps: &Vec) -> Vec> { + let mut nodes = vec![node]; + for step in steps.iter() { + self.process_tree_step(step, &mut nodes); + } + nodes + } + + // An injected language name may either be specified as a fixed string, or based + // on the text of some node in the syntax tree. + fn injection_language_string( + &self, + node: &Node, + language: &InjectionLanguage, + ) -> Option { + match language { + InjectionLanguage::Literal(s) => Some(s.to_string()), + InjectionLanguage::TreePath(steps) => self + .nodes_for_tree_path(*node, steps) + .first() + .and_then(|node| { + str::from_utf8(&self.source[node.start_byte()..node.end_byte()]) + .map(|s| s.to_owned()) + .ok() + }), + } + } + + // Compute the ranges that should be included when parsing an injection. 
+ // This takes into account two things: + // * `nodes` - Every injection takes place within a set of nodes. The injection ranges + // are the ranges of those nodes, *minus* the ranges of those nodes' children. + // * `parent_ranges` - The new injection may be nested inside of *another* injection + // (e.g. JavaScript within HTML within ERB). The parent injection's ranges must + // be taken into account. + fn intersect_ranges(parent_ranges: &Vec, nodes: &Vec) -> Vec { + let mut result = Vec::new(); + let mut parent_range_iter = parent_ranges.iter(); + let mut parent_range = parent_range_iter + .next() + .expect("Layers should only be constructed with non-empty ranges vectors"); + for node in nodes.iter() { + let range = node.range(); + let mut preceding_range = Range { + start_byte: 0, + start_point: Point::new(0, 0), + end_byte: range.start_byte, + end_point: range.start_point, + }; + let following_range = Range { + start_byte: node.end_byte(), + start_point: node.end_position(), + end_byte: usize::MAX, + end_point: Point::new(usize::MAX, usize::MAX), + }; + + for child_range in node + .children() + .map(|c| c.range()) + .chain([following_range].iter().cloned()) + { + let mut range = Range { + start_byte: preceding_range.end_byte, + start_point: preceding_range.end_point, + end_byte: child_range.start_byte, + end_point: child_range.start_point, + }; + preceding_range = child_range; + + if range.end_byte < parent_range.start_byte { + continue; + } + + while parent_range.start_byte <= range.end_byte { + if parent_range.end_byte > range.start_byte { + if range.start_byte < parent_range.start_byte { + range.start_byte = parent_range.start_byte; + range.start_point = parent_range.start_point; + } + + if parent_range.end_byte < range.end_byte { + if range.start_byte < parent_range.end_byte { + result.push(Range { + start_byte: range.start_byte, + start_point: range.start_point, + end_byte: parent_range.end_byte, + end_point: parent_range.end_point, + }); + } + 
range.start_byte = parent_range.end_byte; + range.start_point = parent_range.end_point; + } else { + if range.start_byte < range.end_byte { + result.push(range); + } + break; + } + } + + if let Some(next_range) = parent_range_iter.next() { + parent_range = next_range; + } else { + return result; + } + } + } + } + result + } + + fn add_layer(&mut self, language_string: &str, ranges: Vec) { + if let Some((language, property_sheet)) = self + .language_registry + .language_for_injection_string(language_string) + { + self.parser + .set_language(language) + .expect("Failed to set language"); + self.parser.set_included_ranges(&ranges); + let tree = self + .parser + .parse(self.source, None) + .expect("Failed to parse"); + let layer = Layer::new(self.source, tree, property_sheet, ranges); + match self + .layers + .binary_search_by_key(&(layer.offset(), 1), |l| (l.offset(), 0)) + { + Ok(i) | Err(i) => self.layers.insert(i, layer), + }; + } + } +} + +impl<'a, T: LanguageRegistry> Iterator for Highlighter<'a, T> { + type Item = HighlightEvent<'a>; + + fn next(&mut self) -> Option { + if let Some(utf8_error_len) = self.utf8_error_len.take() { + self.source_offset += utf8_error_len; + return Some(HighlightEvent::Source("\u{FFFD}")); + } + + while !self.layers.is_empty() { + let first_layer = &self.layers[0]; + let properties = &first_layer.cursor.node_properties(); + + // Add any injections for the current node. 
+ if !first_layer.at_node_end { + let node = first_layer.cursor.node(); + let injections = properties + .injections + .iter() + .filter_map(|Injection { language, content }| { + if let Some(language) = self.injection_language_string(&node, language) { + let nodes = self.nodes_for_tree_path(node, content); + let ranges = Self::intersect_ranges(&first_layer.ranges, &nodes); + if ranges.len() > 0 { + return Some((language, ranges)); + } + } + None + }) + .collect::>(); + + for (language, ranges) in injections { + self.add_layer(&language, ranges); + } + } + + // Determine if any scopes start or end at the current position. + let scope_event; + if let Some(scope) = properties.scope { + let next_offset = cmp::min(self.source.len(), self.layers[0].offset()); + + // Before returning any scope boundaries, return any remaining slice of + // the source code the precedes that scope boundary. + if self.source_offset < next_offset { + return self.emit_source(next_offset); + } + + scope_event = if self.layers[0].at_node_end { + Some(HighlightEvent::ScopeEnd(scope)) + } else { + Some(HighlightEvent::ScopeStart(scope)) + }; + } else { + scope_event = None; + }; + + // Advance the current layer's tree cursor. This might cause that cursor to move + // beyond one of the other layers' cursors for a different syntax tree, so we need + // to re-sort the layers. If the cursor is already at the end of its syntax tree, + // remove it. + if self.layers[0].advance() { + self.layers.sort_unstable_by_key(|layer| layer.offset()); + } else { + self.layers.remove(0); + } + + if scope_event.is_some() { + return scope_event; + } + } + + if self.source_offset < self.source.len() { + self.emit_source(self.source.len()) + } else { + None + } + } +} + +impl<'a> Layer<'a> { + fn new( + source: &'a [u8], + tree: Tree, + sheet: &'a PropertySheet, + ranges: Vec, + ) -> Self { + // The cursor's lifetime parameter indicates that the tree must outlive the cursor. 
+ // But because the tree is really a pointer to the heap, the cursor can remain + // valid when the tree is moved. There's no way to express this with lifetimes + // right now, so we have to `transmute` the cursor's lifetime. + let cursor = unsafe { transmute(tree.walk_with_properties(sheet, source)) }; + Self { + _tree: tree, + cursor, + ranges, + at_node_end: false, + } + } + + fn offset(&self) -> usize { + if self.at_node_end { + self.cursor.node().end_byte() + } else { + self.cursor.node().start_byte() + } + } + + fn advance(&mut self) -> bool { + if self.at_node_end { + if self.cursor.goto_next_sibling() { + self.at_node_end = false; + } else if !self.cursor.goto_parent() { + return false; + } + } else if !self.cursor.goto_first_child() { + self.at_node_end = true; + } + true + } +} + +impl<'de> Deserialize<'de> for Scope { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "attribute" => Ok(Scope::Attribute), + "comment" => Ok(Scope::Comment), + "constant" => Ok(Scope::Constant), + "constant.builtin" => Ok(Scope::ConstantBuiltin), + "constructor" => Ok(Scope::Constructor), + "constructor.builtin" => Ok(Scope::ConstructorBuiltin), + "embedded" => Ok(Scope::Embedded), + "escape" => Ok(Scope::Escape), + "function" => Ok(Scope::Function), + "function.builtin" => Ok(Scope::FunctionBuiltin), + "keyword" => Ok(Scope::Keyword), + "number" => Ok(Scope::Number), + "operator" => Ok(Scope::Operator), + "property" => Ok(Scope::Property), + "property.builtin" => Ok(Scope::PropertyBuiltin), + "punctuation" => Ok(Scope::Punctuation), + "punctuation.bracket" => Ok(Scope::PunctuationBracket), + "punctuation.delimiter" => Ok(Scope::PunctuationDelimiter), + "punctuation.special" => Ok(Scope::PunctuationSpecial), + "string" => Ok(Scope::String), + "string.special" => Ok(Scope::StringSpecial), + "type" => Ok(Scope::Type), + "type.builtin" => Ok(Scope::TypeBuiltin), + "variable" => 
Ok(Scope::Variable), + "variable.builtin" => Ok(Scope::VariableBuiltin), + "tag" => Ok(Scope::Tag), + _ => Ok(Scope::Unknown), + } + } +} + +pub fn highlight<'a, T: LanguageRegistry>( + language_registry: &'a T, + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, +) -> Result> + 'a, String> { + Highlighter::new(language_registry, source, language, property_sheet) +} + +pub fn highlight_html<'a, T: LanguageRegistry, F: Fn(Scope) -> &'a str>( + language_registry: &'a T, + source: &'a [u8], + language: Language, + property_sheet: &'a PropertySheet, + attribute_callback: F, +) -> Result, String> { + let highlighter = Highlighter::new(language_registry, source, language, property_sheet)?; + let mut renderer = HtmlRenderer::new(attribute_callback); + let mut scopes = Vec::new(); + for event in highlighter { + match event { + HighlightEvent::ScopeStart(s) => { + scopes.push(s); + renderer.start_scope(s); + } + HighlightEvent::ScopeEnd(s) => { + assert_eq!(scopes.pop(), Some(s)); + renderer.end_scope(); + } + HighlightEvent::Source(src) => { + renderer.render_line(src, &scopes); + } + }; + } + renderer.flush(); + Ok(renderer.result) +} + +struct HtmlRenderer<'a, F: Fn(Scope) -> &'a str> { + result: Vec, + buffer: String, + attribute_callback: F, +} + +impl<'a, F: Fn(Scope) -> &'a str> HtmlRenderer<'a, F> { + fn new(attribute_callback: F) -> Self { + HtmlRenderer { + result: Vec::new(), + buffer: String::new(), + attribute_callback, + } + } + + fn start_scope(&mut self, s: Scope) { + write!(&mut self.buffer, "", (self.attribute_callback)(s),).unwrap(); + } + + fn end_scope(&mut self) { + write!(&mut self.buffer, "").unwrap(); + } + + fn flush(&mut self) { + if !self.buffer.is_empty() { + self.buffer.push('\n'); + self.result.push(self.buffer.clone()); + self.buffer.clear(); + } + } + + fn render_line(&mut self, src: &str, scopes: &Vec) { + let mut multiline = false; + for line in src.split('\n') { + let line = line.trim_end_matches('\r'); + if 
multiline { + scopes.iter().for_each(|_| self.end_scope()); + self.flush(); + scopes.iter().for_each(|scope| self.start_scope(*scope)); + } + write!(&mut self.buffer, "{}", escape::Escape(line)).unwrap(); + multiline = true; + } + } +} From a46515b80f18d9be80d0ae7351c6eebdd2a9b303 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 19 Feb 2019 17:07:12 -0800 Subject: [PATCH 05/27] Replace LanguageRegistry trait with a simple callback --- cli/src/highlight.rs | 55 ++++++++++++++++++++++----- cli/src/loader.rs | 33 +---------------- cli/src/tests/highlight_test.rs | 29 ++++++--------- highlight/src/lib.rs | 66 +++++++++++++++++++-------------- 4 files changed, 96 insertions(+), 87 deletions(-) diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs index 1651b98d..0f88149a 100644 --- a/cli/src/highlight.rs +++ b/cli/src/highlight.rs @@ -4,7 +4,7 @@ use ansi_term::{Color, Style}; use lazy_static::lazy_static; use serde_json::Value; use std::collections::HashMap; -use std::{fmt, fs, io, mem, path}; +use std::{fmt, fs, io, path}; use tree_sitter::{Language, PropertySheet}; use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; @@ -195,7 +195,9 @@ pub fn ansi( let stdout = io::stdout(); let mut stdout = stdout.lock(); let mut scope_stack = Vec::new(); - for event in highlight(loader, source, language, property_sheet)? { + for event in highlight(source, language, property_sheet, &|s| { + language_for_injection_string(loader, s) + })? 
{ match event { HighlightEvent::Source(s) => { if let Some(style) = scope_stack.last().and_then(|s| theme.ansi_style(*s)) { @@ -252,13 +254,19 @@ pub fn html( let stdout = io::stdout(); let mut stdout = stdout.lock(); write!(&mut stdout, "\n")?; - let lines = highlight_html(loader, source, language, property_sheet, |scope| { - if let Some(css_style) = theme.css_style(scope) { - css_style - } else { - "" - } - })?; + let lines = highlight_html( + source, + language, + property_sheet, + &|s| language_for_injection_string(loader, s), + &|scope| { + if let Some(css_style) = theme.css_style(scope) { + css_style + } else { + "" + } + }, + )?; for (i, line) in lines.into_iter().enumerate() { write!( &mut stdout, @@ -270,3 +278,32 @@ pub fn html( write!(&mut stdout, "
\n")?; Ok(()) } + +fn language_for_injection_string<'a>( + loader: &'a Loader, + string: &str, +) -> Option<(Language, &'a PropertySheet)> { + match loader.language_configuration_for_injection_string(string) { + Err(message) => { + eprintln!( + "Failed to load language for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some((language, configuration))) => { + match configuration.highlight_property_sheet(language) { + Err(message) => { + eprintln!( + "Failed to load property sheet for injection string '{}': {}", + string, message.0 + ); + None + } + Ok(None) => None, + Ok(Some(sheet)) => Some((language, sheet)), + } + } + } +} diff --git a/cli/src/loader.rs b/cli/src/loader.rs index d19acf46..49bab4b4 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -10,7 +10,7 @@ use std::process::Command; use std::time::SystemTime; use std::{fs, mem}; use tree_sitter::{Language, PropertySheet}; -use tree_sitter_highlight::{load_property_sheet, LanguageRegistry, Properties}; +use tree_sitter_highlight::{load_property_sheet, Properties}; #[cfg(unix)] const DYLIB_EXTENSION: &'static str = "so"; @@ -320,37 +320,6 @@ impl Loader { } } -impl LanguageRegistry for Loader { - fn language_for_injection_string<'a>( - &'a self, - string: &str, - ) -> Option<(Language, &'a PropertySheet)> { - match self.language_configuration_for_injection_string(string) { - Err(message) => { - eprintln!( - "Failed to load language for injection string '{}': {}", - string, message.0 - ); - None - } - Ok(None) => None, - Ok(Some((language, configuration))) => { - match configuration.highlight_property_sheet(language) { - Err(message) => { - eprintln!( - "Failed to load property sheet for injection string '{}': {}", - string, message.0 - ); - None - } - Ok(None) => None, - Ok(Some(sheet)) => Some((language, sheet)), - } - } - } - } -} - impl LanguageConfiguration { pub fn highlight_property_sheet( &self, diff --git a/cli/src/tests/highlight_test.rs 
b/cli/src/tests/highlight_test.rs index ea14a1c2..6e07ab4a 100644 --- a/cli/src/tests/highlight_test.rs +++ b/cli/src/tests/highlight_test.rs @@ -1,9 +1,7 @@ use super::helpers::fixtures::{get_language, get_property_sheet}; use lazy_static::lazy_static; use tree_sitter::{Language, PropertySheet}; -use tree_sitter_highlight::{ - highlight, highlight_html, HighlightEvent, LanguageRegistry, Properties, Scope, -}; +use tree_sitter_highlight::{highlight, highlight_html, HighlightEvent, Properties, Scope}; lazy_static! { static ref JS_SHEET: PropertySheet = @@ -124,18 +122,13 @@ fn test_highlighting_multiline_scopes_to_html() { ); } -struct TestLanguageRegistry; - -impl LanguageRegistry for TestLanguageRegistry { - fn language_for_injection_string( - &self, - string: &str, - ) -> Option<(Language, &PropertySheet)> { - match string { - "javascript" => Some((get_language("javascript"), &JS_SHEET)), - "html" => Some((get_language("html"), &HTML_SHEET)), - _ => None, - } +fn test_language_for_injection_string<'a>( + string: &str, +) -> Option<(Language, &'a PropertySheet)> { + match string { + "javascript" => Some((get_language("javascript"), &JS_SHEET)), + "html" => Some((get_language("html"), &HTML_SHEET)), + _ => None, } } @@ -145,11 +138,11 @@ fn to_html<'a>( property_sheet: &'a PropertySheet, ) -> Result, String> { highlight_html( - &TestLanguageRegistry, src.as_bytes(), language, property_sheet, - |scope| SCOPE_CLASS_STRINGS[scope as usize].as_str(), + &test_language_for_injection_string, + &|scope| SCOPE_CLASS_STRINGS[scope as usize].as_str(), ) } @@ -162,10 +155,10 @@ fn to_token_vector<'a>( let mut scopes = Vec::new(); let mut line = Vec::new(); for event in highlight( - &TestLanguageRegistry, src.as_bytes(), language, property_sheet, + &test_language_for_injection_string, )? 
{ match event { HighlightEvent::ScopeStart(s) => scopes.push(s), diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index bdf35b9f..453685f4 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -9,13 +9,6 @@ use std::str; use std::usize; use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor}; -pub trait LanguageRegistry { - fn language_for_injection_string<'a>( - &'a self, - s: &str, - ) -> Option<(Language, &'a PropertySheet)>; -} - #[derive(Debug)] enum TreeStep { Child { @@ -87,8 +80,11 @@ struct Layer<'a> { at_node_end: bool, } -struct Highlighter<'a, T: LanguageRegistry> { - language_registry: &'a T, +struct Highlighter<'a, T> +where + T: Fn(&str) -> Option<(Language, &'a PropertySheet)>, +{ + injection_callback: &'a T, source: &'a [u8], source_offset: usize, parser: Parser, @@ -349,12 +345,15 @@ impl Properties { } } -impl<'a, T: LanguageRegistry> Highlighter<'a, T> { +impl<'a, F> Highlighter<'a, F> +where + F: Fn(&str) -> Option<(Language, &'a PropertySheet)>, +{ fn new( - language_registry: &'a T, source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, + injection_callback: &'a F, ) -> Result { let mut parser = Parser::new(); parser.set_language(language)?; @@ -362,7 +361,7 @@ impl<'a, T: LanguageRegistry> Highlighter<'a, T> { .parse(source, None) .ok_or_else(|| format!("Tree-sitter: failed to parse"))?; Ok(Self { - language_registry, + injection_callback, source, source_offset: 0, parser, @@ -457,7 +456,7 @@ impl<'a, T: LanguageRegistry> Highlighter<'a, T> { // on the text of some node in the syntax tree. 
fn injection_language_string( &self, - node: &Node, + node: &Node<'a>, language: &InjectionLanguage, ) -> Option { match language { @@ -556,10 +555,7 @@ impl<'a, T: LanguageRegistry> Highlighter<'a, T> { } fn add_layer(&mut self, language_string: &str, ranges: Vec) { - if let Some((language, property_sheet)) = self - .language_registry - .language_for_injection_string(language_string) - { + if let Some((language, property_sheet)) = (self.injection_callback)(language_string) { self.parser .set_language(language) .expect("Failed to set language"); @@ -579,7 +575,9 @@ impl<'a, T: LanguageRegistry> Highlighter<'a, T> { } } -impl<'a, T: LanguageRegistry> Iterator for Highlighter<'a, T> { +impl<'a, T: Fn(&str) -> Option<(Language, &'a PropertySheet)>> Iterator + for Highlighter<'a, T> +{ type Item = HighlightEvent<'a>; fn next(&mut self) -> Option { @@ -738,23 +736,32 @@ impl<'de> Deserialize<'de> for Scope { } } -pub fn highlight<'a, T: LanguageRegistry>( - language_registry: &'a T, +pub trait HTMLAttributeCallback<'a>: Fn(Scope) -> &'a str {} + +pub fn highlight<'a, F>( source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, -) -> Result> + 'a, String> { - Highlighter::new(language_registry, source, language, property_sheet) + injection_callback: &'a F, +) -> Result> + 'a, String> +where + F: Fn(&str) -> Option<(Language, &'a PropertySheet)>, +{ + Highlighter::new(source, language, property_sheet, injection_callback) } -pub fn highlight_html<'a, T: LanguageRegistry, F: Fn(Scope) -> &'a str>( - language_registry: &'a T, +pub fn highlight_html<'a, F1, F2>( source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, - attribute_callback: F, -) -> Result, String> { - let highlighter = Highlighter::new(language_registry, source, language, property_sheet)?; + injection_callback: &'a F1, + attribute_callback: &'a F2, +) -> Result, String> +where + F1: Fn(&str) -> Option<(Language, &'a PropertySheet)>, + F2: Fn(Scope) -> &'a str, +{ + let 
highlighter = Highlighter::new(source, language, property_sheet, injection_callback)?;
     let mut renderer = HtmlRenderer::new(attribute_callback);
     let mut scopes = Vec::new();
     for event in highlighter {
@@ -782,7 +789,10 @@ struct HtmlRenderer<'a, F: Fn(Scope) -> &'a str> {
     attribute_callback: F,
 }
 
-impl<'a, F: Fn(Scope) -> &'a str> HtmlRenderer<'a, F> {
+impl<'a, F> HtmlRenderer<'a, F>
+where
+    F: Fn(Scope) -> &'a str,
+{
     fn new(attribute_callback: F) -> Self {
         HtmlRenderer {
             result: Vec::new(),

From c20a330fa5e03dc6e8915972062cef07fbeff92f Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Tue, 19 Feb 2019 17:56:46 -0800
Subject: [PATCH 06/27] highlight: Add a README

---
 highlight/README.md | 58 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 highlight/README.md

diff --git a/highlight/README.md b/highlight/README.md
new file mode 100644
index 00000000..b6b311cc
--- /dev/null
+++ b/highlight/README.md
@@ -0,0 +1,58 @@
+Tree-sitter Highlighting
+=========================
+
+[![Build Status](https://travis-ci.org/tree-sitter/tree-sitter.svg?branch=master)](https://travis-ci.org/tree-sitter/tree-sitter)
+[![Build status](https://ci.appveyor.com/api/projects/status/vtmbd6i92e97l55w/branch/master?svg=true)](https://ci.appveyor.com/project/maxbrunsfeld/tree-sitter/branch/master)
+[![Crates.io](https://img.shields.io/crates/v/tree-sitter-highlight.svg)](https://crates.io/crates/tree-sitter-highlight)
+
+### Usage
+
+Compile some languages into your app, and declare them:
+
+```rust
+extern "C" { fn tree_sitter_html() -> Language; }
+extern "C" { fn tree_sitter_javascript() -> Language; }
+```
+
+Load some *property sheets*:
+
+```rust
+use tree_sitter_highlight::load_property_sheet;
+
+let javascript_property_sheet = load_property_sheet(
+    fs::read_to_string("./tree-sitter-javascript/src/highlights.json").unwrap()
+).unwrap();
+
+let html_property_sheet = load_property_sheet(
+    fs::read_to_string("./tree-sitter-html/src/highlights.json").unwrap()
+).unwrap();
+```
+
+Highlight some code:
+
+```rust
+use tree_sitter_highlight::{highlight, HighlightEvent};
+
+let highlights = highlight(
+    b"const x = new Y();",
+    unsafe { tree_sitter_javascript() },
+    &javascript_property_sheet,
+    &|_| None
+).unwrap();
+
+for event in highlights {
+    match event {
+        HighlightEvent::Source(s) => {
+            eprintln!("source: {:?}", s);
+        },
+        HighlightEvent::ScopeStart(s) => {
+            eprintln!("scope started: {:?}", s);
+        },
+        HighlightEvent::ScopeEnd(s) => {
+            eprintln!("scope ended: {:?}", s);
+        },
+    }
+}
+```
+
+The last parameter to `highlight` is a *language injection* callback. This allows other languages to be retrieved when Tree-sitter detects an embedded document (for example, a piece of JavaScript code inside of a `script` tag within HTML).

From 2ee5cbbc1dedb9238c08e7167ca50701d217699f Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Wed, 20 Feb 2019 10:27:08 -0800
Subject: [PATCH 07/27] highlight: take callback parameters by value

---
 cli/src/highlight.rs |  6 +++---
 highlight/src/lib.rs | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs
index 0f88149a..6cd19392 100644
--- a/cli/src/highlight.rs
+++ b/cli/src/highlight.rs
@@ -195,7 +195,7 @@ pub fn ansi(
     let stdout = io::stdout();
     let mut stdout = stdout.lock();
     let mut scope_stack = Vec::new();
-    for event in highlight(source, language, property_sheet, &|s| {
+    for event in highlight(source, language, property_sheet, |s| {
         language_for_injection_string(loader, s)
     })?
{ match event { @@ -258,8 +258,8 @@ pub fn html( source, language, property_sheet, - &|s| language_for_injection_string(loader, s), - &|scope| { + |s| language_for_injection_string(loader, s), + |scope| { if let Some(css_style) = theme.css_style(scope) { css_style } else { diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index 453685f4..bbe0b424 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -84,7 +84,7 @@ struct Highlighter<'a, T> where T: Fn(&str) -> Option<(Language, &'a PropertySheet)>, { - injection_callback: &'a T, + injection_callback: T, source: &'a [u8], source_offset: usize, parser: Parser, @@ -353,7 +353,7 @@ where source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, - injection_callback: &'a F, + injection_callback: F, ) -> Result { let mut parser = Parser::new(); parser.set_language(language)?; @@ -742,10 +742,10 @@ pub fn highlight<'a, F>( source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, - injection_callback: &'a F, + injection_callback: F, ) -> Result> + 'a, String> where - F: Fn(&str) -> Option<(Language, &'a PropertySheet)>, + F: Fn(&str) -> Option<(Language, &'a PropertySheet)> + 'a, { Highlighter::new(source, language, property_sheet, injection_callback) } @@ -754,8 +754,8 @@ pub fn highlight_html<'a, F1, F2>( source: &'a [u8], language: Language, property_sheet: &'a PropertySheet, - injection_callback: &'a F1, - attribute_callback: &'a F2, + injection_callback: F1, + attribute_callback: F2, ) -> Result, String> where F1: Fn(&str) -> Option<(Language, &'a PropertySheet)>, From d63368552a37823afe12234be50a7d12e674d090 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 20 Feb 2019 10:42:56 -0800 Subject: [PATCH 08/27] highlight: 0.1.1 --- Cargo.lock | 4 ++-- highlight/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50058336..3c6825d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -639,12 +639,12 @@ dependencies = [ 
"smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.8", - "tree-sitter-highlight 0.1.0", + "tree-sitter-highlight 0.1.1", ] [[package]] name = "tree-sitter-highlight" -version = "0.1.0" +version = "0.1.1" dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index dd33add2..5f8aa7ac 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-highlight" description = "Library for performing syntax highlighting with Tree-sitter" -version = "0.1.0" +version = "0.1.1" authors = [ "Max Brunsfeld ", "Tim Clem " From d2264d597f9a14302e5d1415c1394b0723e26f2d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 20 Feb 2019 14:38:19 -0800 Subject: [PATCH 09/27] cli: Add --scope flag to highlight command --- cli/src/loader.rs | 21 ++++++++++++++++++--- cli/src/main.rs | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 49bab4b4..23a55cc6 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -27,7 +27,7 @@ struct LanguageRepo { } pub struct LanguageConfiguration { - pub name: String, + scope: Option, _content_regex: Option, _first_line_regex: Option, injection_regex: Option, @@ -79,6 +79,21 @@ impl Loader { } } + pub fn language_configuration_for_scope( + &self, + scope: &str, + ) -> Result> { + for (i, repo) in self.language_repos.iter().enumerate() { + for configuration in &repo.configurations { + if configuration.scope.as_ref().map_or(false, |s| s == scope) { + let (language, _) = self.language_configuration_for_id(i)?; + return Ok(Some((language, &configuration))); + } + } + } + Ok(None) + } + pub fn language_configuration_for_file_name( &self, path: 
&Path, @@ -258,7 +273,7 @@ impl Loader { fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> Result { #[derive(Deserialize)] struct LanguageConfigurationJSON { - name: String, + scope: Option, #[serde(rename = "file-types")] file_types: Option>, #[serde(rename = "content-regex")] @@ -284,7 +299,7 @@ impl Loader { configurations .into_iter() .map(|conf| LanguageConfiguration { - name: conf.name, + scope: conf.scope, file_types: conf.file_types.unwrap_or(Vec::new()), _content_regex: conf .content_regex diff --git a/cli/src/main.rs b/cli/src/main.rs index 9cd4e131..255f680b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -72,6 +72,7 @@ fn run() -> error::Result<()> { .multiple(true) .required(true), ) + .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("html").long("html").short("h")), ) .get_matches(); @@ -169,17 +170,40 @@ fn run() -> error::Result<()> { println!("{}", highlight::HTML_HEADER); } + let language_config; + if let Some(scope) = matches.value_of("scope") { + language_config = loader.language_configuration_for_scope(scope)?; + if language_config.is_none() { + return Err(error::Error(format!("Unknown scope '{}'", scope))); + } + } else { + language_config = None; + } + for path in paths { let path = Path::new(path); - if let Some((language, config)) = loader.language_configuration_for_file_name(path)? { - if let Some(sheet) = config.highlight_property_sheet(language)? { - let source = fs::read(path)?; - if html_mode { - highlight::html(&loader, &theme, &source, language, sheet)?; - } else { - highlight::ansi(&loader, &theme, &source, language, sheet)?; + let (language, config) = match language_config { + Some(v) => v, + None => match loader.language_configuration_for_file_name(path)? { + Some(v) => v, + None => { + eprintln!("No language found for path {:?}", path); + continue; } + }, + }; + + if let Some(sheet) = config.highlight_property_sheet(language)? 
{ + let source = fs::read(path)?; + if html_mode { + highlight::html(&loader, &theme, &source, language, sheet)?; + } else { + highlight::ansi(&loader, &theme, &source, language, sheet)?; } + } else { + return Err(error::Error(format!( + "No syntax highlighting property sheet specified" + ))); } } } From e239aa82295762622069ca300b38560da47b8a3b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 20 Feb 2019 16:45:51 -0800 Subject: [PATCH 10/27] highlight: don't include scope in ScopeEnd events When there are embedded documents, multiple scopes can start or end at the same position. Previously, there was no guarantee that the ScopeEnd events would always occur in the reverse order of the ScopeStart events. The easiest way to avoid exposing inconsistency is to not surface the scopes being ended. --- cli/src/highlight.rs | 2 +- cli/src/tests/highlight_test.rs | 3 +-- highlight/src/lib.rs | 24 +++++++++++++++--------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs index 6cd19392..55ef4bc2 100644 --- a/cli/src/highlight.rs +++ b/cli/src/highlight.rs @@ -209,7 +209,7 @@ pub fn ansi( HighlightEvent::ScopeStart(s) => { scope_stack.push(s); } - HighlightEvent::ScopeEnd(_) => { + HighlightEvent::ScopeEnd => { scope_stack.pop(); } } diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs index 6e07ab4a..57f61e16 100644 --- a/cli/src/tests/highlight_test.rs +++ b/cli/src/tests/highlight_test.rs @@ -162,8 +162,7 @@ fn to_token_vector<'a>( )? 
{ match event { HighlightEvent::ScopeStart(s) => scopes.push(s), - HighlightEvent::ScopeEnd(s) => { - assert_eq!(*scopes.last().unwrap(), s); + HighlightEvent::ScopeEnd => { scopes.pop(); } HighlightEvent::Source(s) => { diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index bbe0b424..7ec186d8 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -96,7 +96,7 @@ where pub enum HighlightEvent<'a> { Source(&'a str), ScopeStart(Scope), - ScopeEnd(Scope), + ScopeEnd, } #[derive(Debug, Deserialize)] @@ -565,10 +565,7 @@ where .parse(self.source, None) .expect("Failed to parse"); let layer = Layer::new(self.source, tree, property_sheet, ranges); - match self - .layers - .binary_search_by_key(&(layer.offset(), 1), |l| (l.offset(), 0)) - { + match self.layers.binary_search_by(|l| l.cmp(&layer)) { Ok(i) | Err(i) => self.layers.insert(i, layer), }; } @@ -625,7 +622,7 @@ impl<'a, T: Fn(&str) -> Option<(Language, &'a PropertySheet)>> Itera } scope_event = if self.layers[0].at_node_end { - Some(HighlightEvent::ScopeEnd(scope)) + Some(HighlightEvent::ScopeEnd) } else { Some(HighlightEvent::ScopeStart(scope)) }; @@ -638,7 +635,7 @@ impl<'a, T: Fn(&str) -> Option<(Language, &'a PropertySheet)>> Itera // to re-sort the layers. If the cursor is already at the end of its syntax tree, // remove it. if self.layers[0].advance() { - self.layers.sort_unstable_by_key(|layer| layer.offset()); + self.layers.sort_unstable_by(|a, b| a.cmp(&b)); } else { self.layers.remove(0); } @@ -676,6 +673,15 @@ impl<'a> Layer<'a> { } } + fn cmp(&self, other: &Layer) -> cmp::Ordering { + // Events are ordered primarily by their position in the document. But if + // one scope starts at a given position and another scope ends at that + // same position, return the scope end event before the scope start event. 
+ self.offset() + .cmp(&other.offset()) + .then_with(|| other.at_node_end.cmp(&self.at_node_end)) + } + fn offset(&self) -> usize { if self.at_node_end { self.cursor.node().end_byte() @@ -770,8 +776,8 @@ where scopes.push(s); renderer.start_scope(s); } - HighlightEvent::ScopeEnd(s) => { - assert_eq!(scopes.pop(), Some(s)); + HighlightEvent::ScopeEnd => { + scopes.pop(); renderer.end_scope(); } HighlightEvent::Source(src) => { From 27d4f0d69dd52516aef626a04d017fc3fd7d1395 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 20 Feb 2019 16:47:27 -0800 Subject: [PATCH 11/27] highlight: 0.1.2 --- Cargo.lock | 4 ++-- highlight/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3c6825d9..1810fb4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -639,12 +639,12 @@ dependencies = [ "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.8", - "tree-sitter-highlight 0.1.1", + "tree-sitter-highlight 0.1.2", ] [[package]] name = "tree-sitter-highlight" -version = "0.1.1" +version = "0.1.2" dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index 5f8aa7ac..ee2dd80e 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-highlight" description = "Library for performing syntax highlighting with Tree-sitter" -version = "0.1.1" +version = "0.1.2" authors = [ "Max Brunsfeld ", "Tim Clem " From 743d18d956e0e8f2909f7fb0123fc17d8f8097fb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 21 Feb 2019 16:18:22 -0800 Subject: [PATCH 12/27] CI: move binary instead of copying to preserve executable permission --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml 
b/.travis.yml index 06c71b34..cacd2f27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ branches: - /\d+\.\d+\.\d+/ before_deploy: - - cp target/release/tree-sitter . + - mv target/release/tree-sitter . - gzip --suffix "-${TRAVIS_OS_NAME}-x64.gz" tree-sitter deploy: From af4414715777cbf5d15f6ad78593f9c28b2e7840 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 21 Feb 2019 16:41:22 -0800 Subject: [PATCH 13/27] docs: Fix references to runtime.h, libruntime.a --- docs/section-2-using-parsers.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 4df802c4..879b2dd2 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -5,33 +5,33 @@ permalink: using-parsers # Using Parsers -All of Tree-sitter's parsing functionality is exposed through C APIs. Applications written in higher-level languages can use Tree-sitter via binding libraries like [node-tree-sitter](https://github.com/tree-sitter/node-tree-sitter) or [rust-tree-sitter](https://github.com/tree-sitter/rust-tree-sitter), which have their own documentation. +All of Tree-sitter's parsing functionality is exposed through C APIs. Applications written in higher-level languages can use Tree-sitter via binding libraries like [node-tree-sitter](https://github.com/tree-sitter/node-tree-sitter) or [rust-tree-sitter](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding), which have their own documentation. This document will describes the general concepts of how to use Tree-sitter, which should be relevant regardless of what language you're using. It also goes into some C-specific details that are useful if you're using the C API directly or are building a new binding to a different language. 
-## Building the Runtime Library +## Building the Library -Building the runtime library requires one git submodule: [`utf8proc`](https://github.com/JuliaStrings/utf8proc). Make sure that `utf8proc` is downloaded by running this command from the Tree-sitter directory: +Building the library requires one git submodule: [`utf8proc`](https://github.com/JuliaStrings/utf8proc). Make sure that `utf8proc` is downloaded by running this command from the Tree-sitter directory: ```sh git submodule update --init ``` -To build the runtime library on a POSIX system, run this script, which will create a static library called `libruntime.a` in the Tree-sitter folder: +To build the library on a POSIX system, run this script, which will create a static library called `libtree-sitter.a` in the Tree-sitter folder: ```sh -script/build-runtime +script/build-lib ``` -Alternatively, you can use the runtime library in a larger project by adding one source file to the project. This source file needs three directories to be in the include path when compiled: +Alternatively, you can use the library in a larger project by adding one source file to the project. This source file needs three directories to be in the include path when compiled: **source file:** -* `tree-sitter/src/runtime/runtime.c` +* `tree-sitter/lib/src/lib.c` **include directories:** -* `tree-sitter/src` -* `tree-sitter/include` -* `tree-sitter/externals/utf8proc` +* `tree-sitter/lib/src` +* `tree-sitter/lib/include` +* `tree-sitter/lib/utf8proc` ## The Objects @@ -51,7 +51,7 @@ Here's an example of a simple C program that uses the Tree-sitter [JSON parser]( #include #include #include -#include +#include // Declare the `tree_sitter_json` function, which is // implemented by the `tree-sitter-json` library. @@ -103,14 +103,14 @@ int main() { } ``` -This program uses the Tree-sitter C API, which is declared in the header file `tree_sitter/runtime.h`, so we need to add the `tree_sitter/include` directory to the include path. 
We also need to link `libruntime.a` into the binary. We compile the source code of the JSON language directly into the binary as well. +This program uses the Tree-sitter C API, which is declared in the header file `tree_sitter/api.h`, so we need to add the `tree_sitter/include` directory to the include path. We also need to link `libtree-sitter.a` into the binary. We compile the source code of the JSON language directly into the binary as well. ```sh clang \ -I tree-sitter/include \ test-json-parser.c \ tree-sitter-json/src/parser.c \ - tree-sitter/libruntime.a \ + tree-sitter/libtree-sitter.a \ -o test-json-parser ./test-json-parser @@ -303,7 +303,7 @@ Conceptually, it can be represented by three syntax trees with overlapping range ```c #include -#include +#include // These functions are each implemented in their own repo. const TSLanguage *tree_sitter_embedded_template(); From 2249ddb687279885cc89be2c45e11ec27b18004a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 21 Feb 2019 16:41:52 -0800 Subject: [PATCH 14/27] docs: Adjust intro, remove mentions of node-gyp, npm install --- docs/section-3-creating-parsers.md | 45 +++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index f332060c..262de2cd 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -11,28 +11,50 @@ Developing Tree-sitter parsers can have a difficult learning curve, but once you Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties: - 1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. 
So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. +1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers. - 2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications. +2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. 
In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications. It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth. -## Installing the tools +## Dependencies -The best way to create a Tree-sitter parser is with the [`Tree-sitter CLI`][tree-sitter-cli], which is distributed as [a Node.js module][node-module]. To install it, first install [`node`][node.js] and its package manager [`npm`][npm] on your system. Then use `npm` to create a new node module and add `tree-sitter-cli` and [`nan`][nan] as dependencies: +In order to develop a Tree-sitter parser, there are two dependencies that you need to install: + +* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have. +* **C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform. + +## Installation + +To create a Tree-sitter parser, you need to use the [the `tree-sitter` CLI][tree-sitter-cli]. You can install the CLI in a few different ways: + +* Install the pre-built `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. 
This is the recommended approach, and it is discussed further in the next section. +* Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`. +* Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager. + +## Setting up a Project + +The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language. ```sh mkdir tree-sitter-${YOUR_LANGUAGE_NAME} cd tree-sitter-${YOUR_LANGUAGE_NAME} +``` +You should create a `package.json` file that describes your project, and allows your parser to be used from Node.js. + +```sh # This will prompt you for input npm init +# This allows your parser to be built as a native node module. npm install --save nan + +# This installs the Tree-sitter CLI itself npm install --save-dev tree-sitter-cli ``` -This will install the CLI and its dependencies into the `node_modules` folder in your directory. An executable program called `tree-sitter` will be created at the path `./node_modules/.bin/tree-sitter`. You may want to follow the Node.js convention of adding `./node_modules/.bin` to your `PATH` so that you can easily run this program when working in this directory. +The last command will install the CLI into the `node_modules` folder in your project. An executable program called `tree-sitter` will be created at the path `./node_modules/.bin/tree-sitter`. You may want to follow the Node.js convention of adding `./node_modules/.bin` to your `PATH` so that you can easily run this program when working in this directory. Once you have the CLI installed, create a file called `grammar.js` with the following skeleton: @@ -51,7 +73,6 @@ Then run the the following command: ```sh tree-sitter generate -npm install ``` This will generate the C code required to parse this trivial language, as well as all of the files needed to compile and load this native parser as a Node.js module. 
You can test this parser by creating a source file with the contents `hello;` and parsing it: @@ -60,12 +81,13 @@ This will generate the C code required to parse this trivial language, as well a tree-sitter parse ./the-file ``` -This should print: +This should print the following: + ``` (source_file [0, 0] - [0, 5]) ``` -When you make changes to the grammar, you can update the parser simply by re-running `tree-sitter generate`. The best way to recompile the C-code is to run the command `node-gyp build`. You may have to install the [`node-gyp`][node-gyp] tool separately by running `npm install -g node-gyp`. +You might notice that the first time you run `tree-sitter parse`, it takes a few seconds. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. Whenever you make changes to your grammar, you can update the parser simply by re-running `tree-sitter generate`. When the parser changes, Tree-sitter will recompile it as needed. ## Starting to define the grammar @@ -449,6 +471,7 @@ Aside from improving error detection, keyword extraction also has performance be [longest-match]: https://en.wikipedia.org/wiki/Maximal_munch [cst]: https://en.wikipedia.org/wiki/Parse_tree [dfa]: https://en.wikipedia.org/wiki/Deterministic_finite_automaton +[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) [non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols [language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification [glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser @@ -460,13 +483,15 @@ Aside from improving error detection, keyword extraction also has performance be [ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar [tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript [ecmascript-spec]: https://www.ecma-international.org/ecma-262/6.0 -[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter-cli +[tree-sitter-cli]: 
https://github.com/tree-sitter/tree-sitter/tree/master/cli +[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest [node-module]: https://www.npmjs.com/package/tree-sitter-cli [node.js]: https://nodejs.org [npm]: https://docs.npmjs.com +[cargo]: https://doc.rust-lang.org/cargo/getting-started/installation.html +[crate]: https://crates.io/crates/tree-sitter-cli [nan]: https://github.com/nodejs/nan [s-exp]: https://en.wikipedia.org/wiki/S-expression -[node-gyp]: https://github.com/nodejs/node-gyp [ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form [lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables [yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html From 190b219be5368a04200668ea469bc4fc5bf98357 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 22 Feb 2019 09:35:29 -0800 Subject: [PATCH 15/27] Document external scanners Fixes #281 --- docs/section-3-creating-parsers.md | 224 +++++++++++++++++++++++------ 1 file changed, 183 insertions(+), 41 deletions(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 262de2cd..7e280077 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -7,7 +7,9 @@ permalink: creating-parsers Developing Tree-sitter parsers can have a difficult learning curve, but once you get the hang of it, it can be fun and even zen-like. This document should help you to build an effective mental model for parser development. -## Understanding the problem +## Getting Started + +### Understanding the problem Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties: @@ -17,22 +19,22 @@ Writing a grammar requires creativity. 
There are an infinite number of CFGs (con It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth. -## Dependencies +### Dependencies In order to develop a Tree-sitter parser, there are two dependencies that you need to install: * **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have. * **C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform. -## Installation +### Installation To create a Tree-sitter parser, you need to use the [the `tree-sitter` CLI][tree-sitter-cli]. You can install the CLI in a few different ways: -* Install the pre-built `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. This is the recommended approach, and it is discussed further in the next section. +* Install the `tree-sitter-cli` [Node.js module][node-module] using [`npm`][npm], the Node package manager. This is the recommended approach, and it is discussed further in the next section. * Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`. * Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager. 
-## Setting up a Project +### Setting up a Project The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language. @@ -47,7 +49,7 @@ You should create a `package.json` file that describes your project, and allows # This will prompt you for input npm init -# This allows your parser to be built as a native node module. +# This installs a small module that lets your parser be used from Node npm install --save nan # This installs the Tree-sitter CLI itself @@ -89,7 +91,7 @@ This should print the following: You might notice that the first time you run `tree-sitter parse`, it takes a few seconds. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. Whenever you make changes to your grammar, you can update the parser simply by re-running `tree-sitter generate`. When the parser changes, Tree-sitter will recompile it as needed. -## Starting to define the grammar +## Writing the Grammar It's usually a good idea to find a formal specification for the language you're trying to parse. This specification will most likely contain a context-free grammar. As you read through the rules of this CFG, you will probably discover a complex and cyclic graph of relationships. It might be unclear how you should navigate this graph as you define your grammar. @@ -190,7 +192,7 @@ With this structure in place, you can now freely decide what part of the grammar After developing the *type* sublanguage a bit further, you might decide to switch to working on *statements* or *expressions* instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`. -## Writing unit tests +### Writing unit tests For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look when parsing that rule. These tests are written using specially-formatted text files in a `corpus` directory in your parser's root folder. 
Here is an example of how these tests should look: @@ -231,13 +233,13 @@ tree-sitter test -f 'Return statements' The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `corpus` directory. It's typically a good idea to test all of the permutations of each language construct. This increases test coverage, but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language. -## Using the grammar DSL +### The Grammar DSL The following is a complete list of built-in functions you can use to define Tree-sitter grammars. Use-cases for some of these functions will be explained in more detail in later sections. * **Symbols (the `$` object)** - Every grammar rule is written as a JavaScript function that takes a parameter conventionally called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule. * **String and Regex literals** - The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing regular expressions in your grammar. -* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation](enbf). +* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf]. * **Alternatives : `choice(rule1, rule2, ...)`** - This function creates a rule that matches *one* of a set of possible rules. The order of the arguments does not matter. 
This is analogous to the `|` (pipe) operator in EBNF notation. * **Repetitions : `repeat(rule)`** - This function creates a rule that matches *zero-or-more* occurrences of a given rule. It is analogous to the `{x}` (curly brace) syntax in EBNF notation. * **Repetitions : `repeat1(rule)`** - This function creates a rule that matches *one-or-more* occurrences of a given rule. The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used. @@ -251,13 +253,13 @@ The following is a complete list of built-in functions you can use to define Tre In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior of the parser. -* `extras` - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar. -* `inline` - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime. -* `conflicts` - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*. -* `externals` - an array of token names which can be returned by an *external scanner*. External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. 
Python's indentation tokens) that cannot be described by regular expressions. -* `word` - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization. +* **`extras`** - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar. +* **`inline`** - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime. +* **`conflicts`** - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*. +* **`externals`** - an array of token names which can be returned by an [*external scanner*](#external-scanners). External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions. +* **`word`** - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization. -## Adjusting existing grammars +### Adjusting existing grammars Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. You might try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. 
To illustrate the problem with this approach, consider the following line of code: @@ -342,7 +344,7 @@ Possible resolutions: 4: Add a conflict for these rules: `binary_expression` `unary_expression` ``` -For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This is where the `prec` function described above comes into play. By wrapping a rule with `prec`, we can indicate that certain sequence of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`: +For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This is where the `prec` function [described above](#the-grammar-dsl) comes into play. By wrapping a rule with `prec`, we can indicate that certain sequence of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`: ```js { @@ -438,6 +440,8 @@ For example, suppose we added `identifier` as the `word` token in our JavaScript ```js grammar({ + name: 'javascript', + word: $ => $.identifier, rules: { @@ -460,39 +464,177 @@ grammar({ identifier: $ => /[a-z_]+/ } -}) +}); ``` Tree-sitter would identify `typeof` and `instanceof` as keywords. Then, when parsing the invalid code above, rather than scanning for the `instanceof` token individually, it would scan for an `identifier` first, and find `instanceofSomething`. It would then correctly recognize the code as invalid. Aside from improving error detection, keyword extraction also has performance benefits. It allows Tree-sitter to generate a smaller, simpler lexing function, which means that **the parser will compile much more quickly**. 
-[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis -[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch -[cst]: https://en.wikipedia.org/wiki/Parse_tree -[dfa]: https://en.wikipedia.org/wiki/Deterministic_finite_automaton -[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) -[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols -[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification -[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser -[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser -[yacc]: https://en.wikipedia.org/wiki/Yacc -[bison]: https://en.wikipedia.org/wiki/GNU_bison -[antlr]: http://www.antlr.org/ -[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar +### External Scanners + +Many languages have some tokens whose structure is impossible or inconvenient to describe with a regular expression. Some examples: +* [Indent and dedent][indent-tokens] tokens in Python +* [Heredocs][heredoc] in Bash and Ruby +* [Percent strings][percent-string] in Ruby + +Tree-sitter allows you to handle these kinds of tokens using *external scanners*. An external scanner is a set of C functions that you, the grammar author, can write by hand in order to add custom logic for recognizing certain tokens. + +To use an external scanner, there are a few steps. First, add an `externals` section to your grammar. This section should list the names of all of your external tokens. These names can then be used elsewhere in your grammar. + +```js +grammar({ + name: 'my_language', + + externals: $ => [ + $.indent, + $.dedent, + $.newline + ], + + // ... +}); +``` + +Then, add another C or C++ source file to your project. Currently, its path must be `src/scanner.c` or `src/scanner.cc` for the CLI to recognize it. Be sure to add this file to the `sources` section of your `binding.gyp` file so that it will be included when your project is compiled by Node.js. 
+ +In this new source file, define an [`enum`][enum] type containing the names of all of your external tokens. The ordering of this enum must match the order in your grammar's `externals` array. + +```c +#include + +enum TokenType { + INDENT, + DEDENT, + NEWLINE +} +``` + +Finally, you must define five functions with specific names, based on your language's name and five actions: *create*, *destroy*, *serialize*, *deserialize*, and *scan*. These functions must all use [C linkage][c-linkage], so if you're writing the scanner in C++, you need to declare them with the `extern "C"` qualifier. + +#### Create + +```c +void * tree_sitter_my_language_external_scanner_create() { + // ... +} +``` + +This function should create your scanner object. It will only be called once anytime your language is set on a parser. Often, you will want to allocate memory on the heap and return a pointer to it. If your external scanner doesn't need to maintain any state, it's ok to return `NULL`. + + +#### Destroy + +```c +void tree_sitter_my_language_external_scanner_destroy(void *payload) { + // ... +} +``` + +This function should free any memory used by your scanner. It is called once when a parser is deleted or assigned a different language. It receives as an argument the same pointer that was returned from the *create* function. If your *create* function didn't allocate any memory, this function can be a noop. + +#### Serialize + +```c +unsigned tree_sitter_my_language_external_scanner_serialize( + void *payload, + char *buffer +) { + // ... +} +``` + +This function should copy the complete state of your scanner into a given byte buffer, and return the number of bytes written. The function is called every time the external scanner successfully recognizes a token. It receives a pointer to your scanner and a pointer to a buffer. 
The maximum number of bytes that you can write is given by the `TREE_SITTER_SERIALIZATION_BUFFER_SIZE` constant, defined in the `tree_sitter/parser.h` header file. + +The data that this function writes will ultimately be stored in the syntax tree so that the scanner can be restored to the right state when handling edits or ambiguities. For your parser to work correctly, the `serialize` function must store its entire state, and `deserialize` must restore the entire state. For good performance, you should design your scanner so that its state can be serialized as quickly and compactly as possible. + +#### Deserialize + +```c +void tree_sitter_my_language_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) { + // ... +} +``` + +This function should *restore* the state of your scanner based on the bytes that were previously written by the `serialize` function. It is called with a pointer to your scanner, a pointer to the buffer of bytes, and the number of bytes that should be read. + +#### Scan + +```c +bool tree_sitter_my_language_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols +) { + // ... +} +``` + +This function is responsible for recognizing external tokens. It should return `true` if a token was recognized, and `false` otherwise. It is called with a "lexer" struct with the following fields: + +* **`uint32_t lookahead`** - The current next character in the input stream, represented as a 32-bit unicode code point. +* **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. +* **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. +* **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token.
This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. +* **`uint32_t (*get_column)(TSLexer *)`** - **(Experimental)** A function for querying the current column position of the lexer. It returns the number of unicode code points (not bytes) since the start of the current line. +* **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. + +The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic. + +```c +if (valid_symbols[INDENT] || valid_symbols[DEDENT]) { + + // ... logic that is common to both `INDENT` and `DEDENT` + + if (valid_symbols[INDENT]) { + + // ...
logic that is specific to `INDENT` + + lexer->result_symbol = INDENT; + return true; + } +} +``` + + + [ambiguous-grammar]: https://en.wikipedia.org/wiki/Ambiguous_grammar -[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript -[ecmascript-spec]: https://www.ecma-international.org/ecma-262/6.0 -[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter/tree/master/cli -[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest -[node-module]: https://www.npmjs.com/package/tree-sitter-cli -[node.js]: https://nodejs.org -[npm]: https://docs.npmjs.com +[antlr]: http://www.antlr.org/ +[bison-dprec]: https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html +[bison]: https://en.wikipedia.org/wiki/GNU_bison +[c-linkage]: https://en.cppreference.com/w/cpp/language/language_linkage [cargo]: https://doc.rust-lang.org/cargo/getting-started/installation.html [crate]: https://crates.io/crates/tree-sitter-cli -[nan]: https://github.com/nodejs/nan -[s-exp]: https://en.wikipedia.org/wiki/S-expression +[cst]: https://en.wikipedia.org/wiki/Parse_tree +[dfa]: https://en.wikipedia.org/wiki/Deterministic_finite_automaton [ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form +[ecmascript-spec]: https://www.ecma-international.org/ecma-262/6.0 +[ejs]: https://ejs.co +[enum]: https://en.wikipedia.org/wiki/Enumerated_type#C +[glr-parsing]: https://en.wikipedia.org/wiki/GLR_parser +[heredoc]: https://en.wikipedia.org/wiki/Here_document +[indent-tokens]: https://en.wikipedia.org/wiki/Off-side_rule +[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification +[lexing]: https://en.wikipedia.org/wiki/Lexical_analysis +[longest-match]: https://en.wikipedia.org/wiki/Maximal_munch [lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables +[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser +[multi-language-section]: /using-parsers#multi-language-documents 
+[nan]: https://github.com/nodejs/nan +[node-module]: https://www.npmjs.com/package/tree-sitter-cli +[node.js]: https://nodejs.org +[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols +[npm]: https://docs.npmjs.com +[path-env]: https://en.wikipedia.org/wiki/PATH_(variable) +[peg]: https://en.wikipedia.org/wiki/Parsing_expression_grammar +[percent-string]: https://docs.ruby-lang.org/en/2.5.0/syntax/literals_rdoc.html#label-Percent+Strings +[releases]: https://github.com/tree-sitter/tree-sitter/releases/latest +[s-exp]: https://en.wikipedia.org/wiki/S-expression +[tree-sitter-cli]: https://github.com/tree-sitter/tree-sitter/tree/master/cli +[tree-sitter-javascript]: https://github.com/tree-sitter/tree-sitter-javascript [yacc-prec]: https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html -[bison-dprec]: https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html +[yacc]: https://en.wikipedia.org/wiki/Yacc From 92e9f984edae9efb74c90f3b600ed753505e184a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 22 Feb 2019 09:44:25 -0800 Subject: [PATCH 16/27] Fix small docs errors --- docs/section-2-using-parsers.md | 2 +- docs/section-3-creating-parsers.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 879b2dd2..3085633f 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -36,7 +36,7 @@ Alternatively, you can use the library in a larger project by adding one source ## The Objects There are four main types of objects involved when using Tree-sitter: languages, parsers, syntax trees, and syntax nodes. In C, these are called `TSParser`, `TSLanguage`, `TSTree`, and `TSNode`. -* A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. 
Many languages are already available in separate git repositories within the the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next section](/creating-parsers) for how to create new languages. +* A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. Many languages are already available in separate git repositories within the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next section](./creating-parsers) for how to create new languages. * A `TSParser` is a stateful object that can be assigned a `TSLanguage` and used to produce a `TSTree` based on some source code. * A `TSTree` represents the syntax tree of an entire source code file. Its contains `TSNode` instances that indicate the structure of the source code. It can also be edited and used to produce a new `TSTree` in the event that the source code changes. * A `TSNode` represents a single node in the syntax tree. It tracks its start and end positions in the source code, as well as its relation to other nodes like its parent, siblings and children. diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 7e280077..1e7989fa 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -578,7 +578,7 @@ This function is responsible for recognizing external tokens. It should return ` * **`uint32_t lookahead`** - The current next character in the input stream, represented as a 32-bit unicode code point. * **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. -* **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. Pass `true` for the second argument, the current character will be treated as whitespace.
+* **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. * **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. * **`uint32_t (*get_column)(TSLexer *)`** - **(Experimental)** A function for querying the current column position of the lexer. It returns the number of unicode code points (not bytes) since the start of the current line. * **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. 
@@ -623,7 +623,7 @@ if (valid_symbols[INDENT] || valid_symbol[DEDENT]) { [longest-match]: https://en.wikipedia.org/wiki/Maximal_munch [lr-conflict]: https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables [lr-grammars]: https://en.wikipedia.org/wiki/LR_parser -[multi-language-section]: /using-parsers#multi-language-documents +[multi-language-section]: ./using-parsers#multi-language-documents [nan]: https://github.com/nodejs/nan [node-module]: https://www.npmjs.com/package/tree-sitter-cli [node.js]: https://nodejs.org From 14b7af34360dedbb7a1ce6b00541890c4c551df1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 22 Feb 2019 11:48:29 -0800 Subject: [PATCH 17/27] highlight: Fix HTML rendering of empty lines --- cli/src/tests/highlight_test.rs | 31 +++++++++++++++++++++++++++++ highlight/src/lib.rs | 35 +++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/cli/src/tests/highlight_test.rs b/cli/src/tests/highlight_test.rs index 57f61e16..accca617 100644 --- a/cli/src/tests/highlight_test.rs +++ b/cli/src/tests/highlight_test.rs @@ -122,6 +122,37 @@ fn test_highlighting_multiline_scopes_to_html() { ); } +#[test] +fn test_highlighting_empty_lines() { + let source = vec![ + "class A {", + "", + " b(c) {", + "", + " d(e)", + "", + " }", + "", + "}", + ] + .join("\n"); + + assert_eq!( + &to_html(&source, get_language("javascript"), &JS_SHEET,).unwrap(), + &[ + "class A {\n".to_string(), + "\n".to_string(), + " b(c) {\n".to_string(), + "\n".to_string(), + " d(e)\n".to_string(), + "\n".to_string(), + " }\n".to_string(), + "\n".to_string(), + "}\n".to_string(), + ] + ); +} + fn test_language_for_injection_string<'a>( string: &str, ) -> Option<(Language, &'a PropertySheet)> { diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index 7ec186d8..647064bb 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -781,17 +781,19 @@ where renderer.end_scope(); } HighlightEvent::Source(src) => { - 
renderer.render_line(src, &scopes); + renderer.add_text(src, &scopes); } }; } - renderer.flush(); + if !renderer.current_line.is_empty() { + renderer.finish_line(); + } Ok(renderer.result) } struct HtmlRenderer<'a, F: Fn(Scope) -> &'a str> { result: Vec, - buffer: String, + current_line: String, attribute_callback: F, } @@ -802,37 +804,40 @@ where fn new(attribute_callback: F) -> Self { HtmlRenderer { result: Vec::new(), - buffer: String::new(), + current_line: String::new(), attribute_callback, } } fn start_scope(&mut self, s: Scope) { - write!(&mut self.buffer, "", (self.attribute_callback)(s),).unwrap(); + write!( + &mut self.current_line, + "", + (self.attribute_callback)(s), + ) + .unwrap(); } fn end_scope(&mut self) { - write!(&mut self.buffer, "").unwrap(); + write!(&mut self.current_line, "").unwrap(); } - fn flush(&mut self) { - if !self.buffer.is_empty() { - self.buffer.push('\n'); - self.result.push(self.buffer.clone()); - self.buffer.clear(); - } + fn finish_line(&mut self) { + self.current_line.push('\n'); + self.result.push(self.current_line.clone()); + self.current_line.clear(); } - fn render_line(&mut self, src: &str, scopes: &Vec) { + fn add_text(&mut self, src: &str, scopes: &Vec) { let mut multiline = false; for line in src.split('\n') { let line = line.trim_end_matches('\r'); if multiline { scopes.iter().for_each(|_| self.end_scope()); - self.flush(); + self.finish_line(); scopes.iter().for_each(|scope| self.start_scope(*scope)); } - write!(&mut self.buffer, "{}", escape::Escape(line)).unwrap(); + write!(&mut self.current_line, "{}", escape::Escape(line)).unwrap(); multiline = true; } } From d3628f3c0cc3260c37507313f1f1d1f2669b6c92 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 22 Feb 2019 11:50:27 -0800 Subject: [PATCH 18/27] highlight: 0.1.3 --- Cargo.lock | 4 ++-- highlight/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1810fb4d..c7dad351 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -639,12 +639,12 @@ dependencies = [ "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.8", - "tree-sitter-highlight 0.1.2", + "tree-sitter-highlight 0.1.3", ] [[package]] name = "tree-sitter-highlight" -version = "0.1.2" +version = "0.1.3" dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index ee2dd80e..8351c8bd 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-highlight" description = "Library for performing syntax highlighting with Tree-sitter" -version = "0.1.2" +version = "0.1.3" authors = [ "Max Brunsfeld ", "Tim Clem " From 858b4ba8accc648edbe414576038b116b0c0f805 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 22 Feb 2019 16:15:36 -0800 Subject: [PATCH 19/27] cli: Fix handling of import chains in property sheets --- Cargo.lock | 24 +++++++ cli/Cargo.toml | 1 + cli/src/properties.rs | 154 ++++++++++++++++++++++++++++++------------ 3 files changed, 135 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7dad351..4f5263f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -455,6 +455,14 @@ dependencies = [ "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "remove_dir_all" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rsass" version = "0.9.6" @@ -578,6 +586,19 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tempfile" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + 
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.43 (registry+https://github.com/rust-lang/crates.io-index)", + "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "termion" version = "1.5.1" @@ -638,6 +659,7 @@ dependencies = [ "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.8", "tree-sitter-highlight 0.1.3", ] @@ -772,6 +794,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum redox_users 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "214a97e49be64fd2c86f568dd0cb2c757d2cc53de95b273b6ad0a1c908482f26" "checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" "checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" "checksum rsass 0.9.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7a5dde55023a6c19470f7aeb59f75f897d8b80cbe00d61dfcaf7bbbe3de4c0a6" "checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = 
"138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" @@ -789,6 +812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7" "checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" +"checksum tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b86c784c88d98c801132806dadd3819ed29d8600836c4088e855cdf3e178ed8a" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" "checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 242ed72b..cd2e44e0 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -53,3 +53,4 @@ features = ["std"] [dev-dependencies] rand = "0.6.4" spin = "0.5" +tempfile = "3" diff --git a/cli/src/properties.rs b/cli/src/properties.rs index 996c77f5..a7dc9ec1 100644 --- a/cli/src/properties.rs +++ b/cli/src/properties.rs @@ -445,38 +445,9 @@ fn generate_property_sheet(path: impl AsRef, css: &str) -> Result Result> { - let mut i = 0; + let mut schema_paths = Vec::new(); let mut items = rsass::parse_scss_data(css.as_bytes())?; - while i < items.len() { - match &items[i] { - rsass::Item::Import(arg) => { - if let Some(s) = get_sass_string(arg) { - let import_path = resolve_path(path, s)?; - let imported_items = 
rsass::parse_scss_file(&import_path)?; - items.splice(i..(i + 1), imported_items); - continue; - } else { - return Err(Error("@import arguments must be strings".to_string())); - } - } - rsass::Item::AtRule { name, args, .. } => match name.as_str() { - "schema" => { - if let Some(s) = get_sass_string(args) { - // TODO - use schema - let _schema_path = resolve_path(path, s)?; - items.remove(i); - continue; - } else { - return Err(Error("@schema arguments must be strings".to_string())); - } - } - _ => return Err(Error(format!("Unsupported at-rule '{}'", name))), - }, - _ => {} - } - i += 1; - } - + process_at_rules(&mut items, &mut schema_paths, path)?; let mut result = Vec::new(); let selector_prefixes = vec![Vec::new()]; parse_sass_items(items, &selector_prefixes, &mut result)?; @@ -581,6 +552,45 @@ fn parse_sass_items( Ok(()) } +fn process_at_rules( + items: &mut Vec, + schema_paths: &mut Vec, + path: &Path, +) -> Result<()> { + let mut i = 0; + while i < items.len() { + match &items[i] { + rsass::Item::Import(arg) => { + if let Some(s) = get_sass_string(arg) { + let import_path = resolve_path(path, s)?; + let mut imported_items = rsass::parse_scss_file(&import_path)?; + process_at_rules(&mut imported_items, schema_paths, &import_path)?; + items.splice(i..(i + 1), imported_items); + continue; + } else { + return Err(Error("@import arguments must be strings".to_string())); + } + } + rsass::Item::AtRule { name, args, .. 
} => match name.as_str() { + "schema" => { + if let Some(s) = get_sass_string(args) { + let schema_path = resolve_path(path, s)?; + schema_paths.push(schema_path); + items.remove(i); + continue; + } else { + return Err(Error("@schema arguments must be strings".to_string())); + } + } + _ => return Err(Error(format!("Unsupported at-rule '{}'", name))), + }, + _ => {} + } + i += 1; + } + Ok(()) +} + fn parse_sass_value(value: &Value) -> Result { match value { Value::Literal(s) => { @@ -632,23 +642,22 @@ fn get_sass_string(value: &Value) -> Option<&str> { fn resolve_path(base: &Path, p: &str) -> Result { let path = Path::new(p); - let mut result = base.to_owned(); - result.pop(); + let mut base = base.to_owned(); + base.pop(); if path.starts_with(".") { - result.push(path); - if result.exists() { - return Ok(result); + base.push(path); + if base.exists() { + return Ok(base); } } else { loop { + let mut result = base.clone(); result.push("node_modules"); result.push(path); if result.exists() { return Ok(result); } - result.pop(); - result.pop(); - if !result.pop() { + if !base.pop() { break; } } @@ -660,9 +669,10 @@ fn resolve_path(base: &Path, p: &str) -> Result { mod tests { use super::*; use regex::Regex; + use tempfile::TempDir; #[test] - fn test_immediate_child_and_descendant_selectors() { + fn test_property_sheet_with_immediate_child_and_descendant_selectors() { let sheet = generate_property_sheet( "foo.css", " @@ -767,7 +777,7 @@ mod tests { } #[test] - fn test_text_attribute() { + fn test_property_sheet_with_text_attribute() { let sheet = generate_property_sheet( "foo.css", " @@ -810,7 +820,7 @@ mod tests { } #[test] - fn test_cascade_ordering_as_tie_breaker() { + fn test_property_sheet_with_cascade_ordering_as_tie_breaker() { let sheet = generate_property_sheet( "foo.css", " @@ -845,7 +855,7 @@ mod tests { } #[test] - fn test_css_function_calls() { + fn test_property_sheet_with_function_calls() { let sheet = generate_property_sheet( "foo.css", " @@ -882,7 
+892,7 @@ mod tests { } #[test] - fn test_array_by_declaring_property_multiple_times() { + fn test_property_sheet_with_array_by_declaring_property_multiple_times() { let sheet = generate_property_sheet( "foo.css", " @@ -922,6 +932,62 @@ mod tests { ); } + #[test] + fn test_property_sheet_with_imports() { + let repo_dir = TempDir::new().unwrap(); + let properties_dir = repo_dir.path().join("properties"); + let dependency_properties_dir = repo_dir + .path() + .join("node_modules") + .join("the-dependency") + .join("properties"); + fs::create_dir_all(&properties_dir).unwrap(); + fs::create_dir_all(&dependency_properties_dir).unwrap(); + let sheet_path1 = properties_dir.join("sheet1.css"); + let sheet_path2 = properties_dir.join("sheet2.css"); + let dependency_sheet_path1 = dependency_properties_dir.join("dependency-sheet1.css"); + let dependency_sheet_path2 = dependency_properties_dir.join("dependency-sheet2.css"); + + fs::write( + sheet_path2, + r#" + a { x: '1'; } + "#, + ) + .unwrap(); + fs::write( + dependency_sheet_path1, + r#" + @import "./dependency-sheet2.css"; + a { y: '2'; } + "#, + ) + .unwrap(); + fs::write( + dependency_sheet_path2, + r#" + b { x: '3'; } + "#, + ) + .unwrap(); + let sheet = generate_property_sheet( + sheet_path1, + r#" + @import "./sheet2.css"; + @import "the-dependency/properties/dependency-sheet1.css"; + b { y: '4'; } + "#, + ) + .unwrap(); + + let a = query_simple(&sheet, vec!["a"]); + assert_eq!(a["x"], string("1"),); + assert_eq!(a["y"], string("2"),); + let b = query_simple(&sheet, vec!["b"]); + assert_eq!(b["x"], string("3"),); + assert_eq!(b["y"], string("4"),); + } + fn query_simple<'a>( sheet: &'a PropertySheetJSON, node_stack: Vec<&'static str>, From 1bad6dc41eb322737cab727a1ef0d914a534162a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 25 Feb 2019 12:33:24 -0800 Subject: [PATCH 20/27] Add `~/.tree-sitter/config.json` file, `init-config` command Right now this is just used for two things: * Specifying folders for 
locating parsers to use with `tree-sitter parse` and `tree-sitter highlight` * Specifying colors to use for `tree-sitter-highlight` --- cli/src/config.rs | 69 +++++++++++++++++++++++++++ cli/src/highlight.rs | 110 ++++++++++++++++++++++++++++++++++--------- cli/src/lib.rs | 1 + cli/src/main.rs | 36 +++++++------- highlight/src/lib.rs | 39 ++++++++++++++- 5 files changed, 212 insertions(+), 43 deletions(-) create mode 100644 cli/src/config.rs diff --git a/cli/src/config.rs b/cli/src/config.rs new file mode 100644 index 00000000..1c9cc8f6 --- /dev/null +++ b/cli/src/config.rs @@ -0,0 +1,69 @@ +use super::highlight::Theme; +use serde_derive::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::{env, fs, io}; + +#[derive(Default, Deserialize, Serialize)] +pub struct Config { + #[serde(skip)] + pub binary_directory: PathBuf, + + #[serde(default)] + #[serde(rename = "parser-directories")] + pub parser_directories: Vec, + + #[serde(default)] + pub theme: Theme, +} + +impl Config { + pub fn get_path(home_dir: &Path) -> PathBuf { + env::var("TREE_SITTER_DIR") + .map(|p| p.into()) + .unwrap_or_else(|_| home_dir.join(".tree-sitter")) + } + + pub fn load(home_dir: &Path) -> Self { + let tree_sitter_dir = Self::get_path(home_dir); + let config_path = tree_sitter_dir.join("config.json"); + let mut result = fs::read_to_string(&config_path) + .map_err(drop) + .and_then(|json| serde_json::from_str(&json).map_err(drop)) + .unwrap_or_else(|_| Self::default()); + result.init(home_dir, &tree_sitter_dir); + result + } + + pub fn save(&self, home_dir: &Path) -> io::Result<()> { + let tree_sitter_dir = Self::get_path(home_dir); + let config_path = tree_sitter_dir.join("config.json"); + let json = serde_json::to_string_pretty(self).expect("Failed to serialize config"); + fs::write(config_path, json) + } + + pub fn new(home_dir: &Path) -> Self { + let tree_sitter_dir = Self::get_path(home_dir); + let mut result = Self::default(); + result.init(home_dir, &tree_sitter_dir); +
result + } + + fn init(&mut self, home_dir: &Path, tree_sitter_dir: &Path) { + if self.parser_directories.is_empty() { + self.parser_directories = vec![ + home_dir.join("github"), + home_dir.join("src"), + home_dir.join("source"), + ] + } + + let binary_path = tree_sitter_dir.join("bin"); + self.binary_directory = binary_path; + fs::create_dir_all(&self.binary_directory).unwrap_or_else(|error| { + panic!( + "Could not find or create parser binary directory {:?}. Error: {}", + self.binary_directory, error + ) + }); + } +} diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs index 55ef4bc2..703c4053 100644 --- a/cli/src/highlight.rs +++ b/cli/src/highlight.rs @@ -2,7 +2,9 @@ use crate::error::Result; use crate::loader::Loader; use ansi_term::{Color, Style}; use lazy_static::lazy_static; -use serde_json::Value; +use serde::ser::SerializeMap; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_json::{json, Value}; use std::collections::HashMap; use std::{fmt, fs, io, path}; use tree_sitter::{Language, PropertySheet}; @@ -21,24 +23,7 @@ pub struct Theme { impl Theme { pub fn load(path: &path::Path) -> io::Result { let json = fs::read_to_string(path)?; - Ok(Self::new(&json)) - } - - pub fn new(json: &str) -> Self { - let mut ansi_styles = vec![None; 30]; - let mut css_styles = vec![None; 30]; - if let Ok(colors) = serde_json::from_str::>(json) { - for (scope, style_value) in colors { - let mut style = Style::default(); - parse_style(&mut style, style_value); - ansi_styles[scope as usize] = Some(style); - css_styles[scope as usize] = Some(style_to_css(style)); - } - } - Self { - ansi_styles, - css_styles, - } + Ok(serde_json::from_str(&json).unwrap_or_default()) } fn ansi_style(&self, scope: Scope) -> Option<&Style> { @@ -50,9 +35,85 @@ impl Theme { } } +impl<'de> Deserialize<'de> for Theme { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + let scope_count = Scope::Unknown as usize + 1; + let mut 
ansi_styles = vec![None; scope_count]; + let mut css_styles = vec![None; scope_count]; + if let Ok(colors) = HashMap::::deserialize(deserializer) { + for (scope, style_value) in colors { + let mut style = Style::default(); + parse_style(&mut style, style_value); + ansi_styles[scope as usize] = Some(style); + css_styles[scope as usize] = Some(style_to_css(style)); + } + } + Ok(Self { + ansi_styles, + css_styles, + }) + } +} + +impl Serialize for Theme { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + let entry_count = self.ansi_styles.iter().filter(|i| i.is_some()).count(); + let mut map = serializer.serialize_map(Some(entry_count))?; + for (i, style) in self.ansi_styles.iter().enumerate() { + let scope = Scope::from_usize(i).unwrap(); + if scope == Scope::Unknown { + break; + } + if let Some(style) = style { + let color = style.foreground.map(|color| match color { + Color::Black => json!("black"), + Color::Blue => json!("blue"), + Color::Cyan => json!("cyan"), + Color::Green => json!("green"), + Color::Purple => json!("purple"), + Color::Red => json!("red"), + Color::White => json!("white"), + Color::Yellow => json!("yellow"), + Color::RGB(r, g, b) => json!(format!("#{:x?}{:x?}{:x?}", r, g, b)), + Color::Fixed(n) => json!(n), + }); + if style.is_bold || style.is_italic || style.is_underline { + let mut entry = HashMap::new(); + if let Some(color) = color { + entry.insert("color", color); + } + if style.is_bold { + entry.insert("bold", Value::Bool(true)); + } + if style.is_italic { + entry.insert("italic", Value::Bool(true)); + } + if style.is_underline { + entry.insert("underline", Value::Bool(true)); + } + map.serialize_entry(&scope, &entry)?; + } else if let Some(color) = color { + map.serialize_entry(&scope, &color)?; + } else { + map.serialize_entry(&scope, &Value::Null)?; + } + } else { + map.serialize_entry(&scope, &Value::Null)?; + } + } + map.end() + } +} + impl Default for Theme { fn default() -> Self { - 
Theme::new( + serde_json::from_str( r#" { "attribute": {"color": 124, "italic": true}, @@ -71,11 +132,14 @@ impl Default for Theme { "punctuation.delimiter": 239, "string.special": 30, "string": 28, - "tag": {"color": 18}, + "tag": 18, + "type": 23, + "type.builtin": {"color": 23, "bold": true}, "variable.builtin": {"bold": true} } "#, ) + .unwrap() } } @@ -102,9 +166,8 @@ fn parse_style(style: &mut Style, json: Value) { if let Value::Object(entries) = json { for (property_name, value) in entries { match property_name.as_str() { - "italic" => *style = style.italic(), "bold" => *style = style.bold(), - "dimmed" => *style = style.dimmed(), + "italic" => *style = style.italic(), "underline" => *style = style.underline(), "color" => { if let Some(color) = parse_color(value) { @@ -126,6 +189,7 @@ fn parse_color(json: Value) -> Option { _ => None, }, Value::String(s) => match s.to_lowercase().as_str() { + "black" => Some(Color::Black), "blue" => Some(Color::Blue), "cyan" => Some(Color::Cyan), "green" => Some(Color::Green), diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 0ece9cac..19b82194 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -1,3 +1,4 @@ +pub mod config; pub mod error; pub mod generate; pub mod highlight; diff --git a/cli/src/main.rs b/cli/src/main.rs index 255f680b..147f7fad 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -4,7 +4,9 @@ use std::fs; use std::path::Path; use std::process::exit; use std::usize; -use tree_sitter_cli::{error, generate, highlight, loader, logger, parse, properties, test}; +use tree_sitter_cli::{ + config, error, generate, highlight, loader, logger, parse, properties, test, +}; fn main() { if let Err(e) = run() { @@ -24,6 +26,7 @@ fn run() -> error::Result<()> { .setting(AppSettings::SubcommandRequiredElseHelp) .author("Max Brunsfeld ") .about("Generates and tests parsers") + .subcommand(SubCommand::with_name("init-config").about("Generate a default config file")) .subcommand( SubCommand::with_name("generate") 
.about("Generate a parser") @@ -77,19 +80,15 @@ fn run() -> error::Result<()> { ) .get_matches(); - let home_dir = dirs::home_dir().unwrap(); + let home_dir = dirs::home_dir().expect("Failed to read home directory"); let current_dir = env::current_dir().unwrap(); - let config_dir = home_dir.join(".tree-sitter"); - let theme_path = config_dir.join("theme.json"); - let parsers_dir = config_dir.join("parsers"); + let config = config::Config::load(&home_dir); + let mut loader = loader::Loader::new(config.binary_directory.clone()); - // TODO - make configurable - let parser_repo_paths = vec![home_dir.join("github")]; - - fs::create_dir_all(&parsers_dir).unwrap(); - let mut loader = loader::Loader::new(config_dir); - - if let Some(matches) = matches.subcommand_matches("generate") { + if matches.subcommand_matches("init-config").is_some() { + let config = config::Config::new(&home_dir); + config.save(&home_dir)?; + } else if let Some(matches) = matches.subcommand_matches("generate") { if matches.is_present("log") { logger::init(); } @@ -127,7 +126,7 @@ fn run() -> error::Result<()> { let debug_graph = matches.is_present("debug-graph"); let quiet = matches.is_present("quiet"); let time = matches.is_present("time"); - loader.find_all_languages(&parser_repo_paths)?; + loader.find_all_languages(&config.parser_directories)?; let paths = matches .values_of("path") .unwrap() @@ -161,10 +160,9 @@ fn run() -> error::Result<()> { return Err(error::Error(String::new())); } } else if let Some(matches) = matches.subcommand_matches("highlight") { - loader.find_all_languages(&parser_repo_paths)?; - let theme = highlight::Theme::load(&theme_path).unwrap_or_default(); let paths = matches.values_of("path").unwrap().into_iter(); let html_mode = matches.is_present("html"); + loader.find_all_languages(&config.parser_directories)?; if html_mode { println!("{}", highlight::HTML_HEADER); @@ -182,7 +180,7 @@ fn run() -> error::Result<()> { for path in paths { let path = Path::new(path); - let 
(language, config) = match language_config { + let (language, language_config) = match language_config { Some(v) => v, None => match loader.language_configuration_for_file_name(path)? { Some(v) => v, @@ -193,12 +191,12 @@ fn run() -> error::Result<()> { }, }; - if let Some(sheet) = config.highlight_property_sheet(language)? { + if let Some(sheet) = language_config.highlight_property_sheet(language)? { let source = fs::read(path)?; if html_mode { - highlight::html(&loader, &theme, &source, language, sheet)?; + highlight::html(&loader, &config.theme, &source, language, sheet)?; } else { - highlight::ansi(&loader, &theme, &source, language, sheet)?; + highlight::ansi(&loader, &config.theme, &source, language, sheet)?; } } else { return Err(error::Error(format!( diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index 647064bb..e5499fbc 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -1,6 +1,6 @@ mod escape; -use serde::{Deserialize, Deserializer}; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_derive::*; use std::cmp; use std::fmt::Write; @@ -742,6 +742,43 @@ impl<'de> Deserialize<'de> for Scope { } } +impl Serialize for Scope { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + Scope::Attribute => serializer.serialize_str("attribute"), + Scope::Comment => serializer.serialize_str("comment"), + Scope::Constant => serializer.serialize_str("constant"), + Scope::ConstantBuiltin => serializer.serialize_str("constant.builtin"), + Scope::Constructor => serializer.serialize_str("constructor"), + Scope::ConstructorBuiltin => serializer.serialize_str("constructor.builtin"), + Scope::Embedded => serializer.serialize_str("embedded"), + Scope::Escape => serializer.serialize_str("escape"), + Scope::Function => serializer.serialize_str("function"), + Scope::FunctionBuiltin => serializer.serialize_str("function.builtin"), + Scope::Keyword => serializer.serialize_str("keyword"), + Scope::Number 
=> serializer.serialize_str("number"), + Scope::Operator => serializer.serialize_str("operator"), + Scope::Property => serializer.serialize_str("property"), + Scope::PropertyBuiltin => serializer.serialize_str("property.builtin"), + Scope::Punctuation => serializer.serialize_str("punctuation"), + Scope::PunctuationBracket => serializer.serialize_str("punctuation.bracket"), + Scope::PunctuationDelimiter => serializer.serialize_str("punctuation.delimiter"), + Scope::PunctuationSpecial => serializer.serialize_str("punctuation.special"), + Scope::String => serializer.serialize_str("string"), + Scope::StringSpecial => serializer.serialize_str("string.special"), + Scope::Type => serializer.serialize_str("type"), + Scope::TypeBuiltin => serializer.serialize_str("type.builtin"), + Scope::Variable => serializer.serialize_str("variable"), + Scope::VariableBuiltin => serializer.serialize_str("variable.builtin"), + Scope::Tag => serializer.serialize_str("tag"), + Scope::Unknown => serializer.serialize_str(""), + } + } +} + pub trait HTMLAttributeCallback<'a>: Fn(Scope) -> &'a str {} pub fn highlight<'a, F>( From 4e059e18cbdec02d2e82d9dcff096f810ed22b2a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 25 Feb 2019 12:55:05 -0800 Subject: [PATCH 21/27] 0.14.5 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package-lock.json | 2 +- cli/npm/package.json | 2 +- script/version | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f5263f7..74ab59a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -638,7 +638,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.14.4" +version = "0.14.5" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index cd2e44e0..b06f5c9d 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-cli" 
description = "CLI tool for developing, testing, and using Tree-sitter parsers" -version = "0.14.4" +version = "0.14.5" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" diff --git a/cli/npm/package-lock.json b/cli/npm/package-lock.json index 65076390..06ff40b3 100644 --- a/cli/npm/package-lock.json +++ b/cli/npm/package-lock.json @@ -1,5 +1,5 @@ { "name": "tree-sitter-cli", - "version": "0.14.4", + "version": "0.14.5", "lockfileVersion": 1 } diff --git a/cli/npm/package.json b/cli/npm/package.json index e463d6f4..862d89dd 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.14.4", + "version": "0.14.5", "author": "Max Brunsfeld", "license": "MIT", "repository": { diff --git a/script/version b/script/version index e46898f7..c2bbeed3 100755 --- a/script/version +++ b/script/version @@ -24,7 +24,7 @@ const arg = process.argv[2]; if (!arg) { console.log([ - `Usage: script/version major |minor | patch | `, + `Usage: script/version major | minor | patch | `, '', 'Update the CLI version by the given increment or to the given', 'version number, creating a commit and tag for the new version.', From ae0e89c29755da19669ec06e5d66a8b3c05f4d62 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 25 Feb 2019 12:55:05 -0800 Subject: [PATCH 22/27] lib: 0.3.9 --- Cargo.lock | 6 +++--- lib/Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74ab59a9..78978ca5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -627,7 +627,7 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.3.8" +version = "0.3.9" dependencies = [ "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -660,7 +660,7 @@ dependencies = [ "smallbitvec 2.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", 
"tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", - "tree-sitter 0.3.8", + "tree-sitter 0.3.9", "tree-sitter-highlight 0.1.3", ] @@ -672,7 +672,7 @@ dependencies = [ "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", - "tree-sitter 0.3.8", + "tree-sitter 0.3.9", ] [[package]] diff --git a/lib/Cargo.toml b/lib/Cargo.toml index f3c3efb3..c6f84ada 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter" description = "Rust bindings to the Tree-sitter parsing library" -version = "0.3.8" +version = "0.3.9" authors = ["Max Brunsfeld "] license = "MIT" readme = "binding/README.md" From ba165c2909c93cc2733c0fe29f58d15143815264 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 25 Feb 2019 12:55:05 -0800 Subject: [PATCH 23/27] highlight: 0.1.4 --- Cargo.lock | 4 ++-- highlight/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 78978ca5..8215ca3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -661,12 +661,12 @@ dependencies = [ "spin 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.3.9", - "tree-sitter-highlight 0.1.3", + "tree-sitter-highlight 0.1.4", ] [[package]] name = "tree-sitter-highlight" -version = "0.1.3" +version = "0.1.4" dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index 8351c8bd..688a2f6c 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tree-sitter-highlight" description = "Library for performing syntax highlighting with Tree-sitter" -version = 
"0.1.3" +version = "0.1.4" authors = [ "Max Brunsfeld ", "Tim Clem " From e6d6b4a70034f1433155649bf2ee9705eaf241d4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 25 Feb 2019 13:41:57 -0800 Subject: [PATCH 24/27] Revert "CI: move binary instead of copying to preserve executable permission" This reverts commit 743d18d956e0e8f2909f7fb0123fc17d8f8097fb. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cacd2f27..06c71b34 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ branches: - /\d+\.\d+\.\d+/ before_deploy: - - mv target/release/tree-sitter . + - cp target/release/tree-sitter . - gzip --suffix "-${TRAVIS_OS_NAME}-x64.gz" tree-sitter deploy: From 363079bf3e7fb3af63489a95a72324e910c61218 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 3 Mar 2019 18:23:01 -0800 Subject: [PATCH 25/27] Re-enable HTML language in benchmarks Made possible by this error recovery performance fix: https://github.com/tree-sitter/tree-sitter-html/commit/4c7d5fe26ce3668458d7871600debea993734757 --- cli/benches/benchmark.rs | 5 ----- cli/src/loader.rs | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cli/benches/benchmark.rs b/cli/benches/benchmark.rs index 7983bde5..9cb4f275 100644 --- a/cli/benches/benchmark.rs +++ b/cli/benches/benchmark.rs @@ -61,11 +61,6 @@ fn main() { let mut all_error_speeds = Vec::new(); for (language_name, example_paths) in EXAMPLE_PATHS_BY_LANGUAGE_NAME.iter() { - // TODO - remove after fixing slow error parsing HTML. 
- if language_name == "html" { - continue; - } - if let Some(filter) = LANGUAGE_FILTER.as_ref() { if language_name != filter.as_str() { continue; diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 23a55cc6..b6e23a3a 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -233,7 +233,8 @@ impl Loader { .arg("-I") .arg(header_path) .arg("-o") - .arg(&library_path); + .arg(&library_path) + .arg("-O2"); if let Some(scanner_path) = scanner_path.as_ref() { if scanner_path.extension() == Some("c".as_ref()) { command.arg("-xc").arg("-std=c99").arg(scanner_path); From 08ac66a656cff612ac77767c0209908023847769 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 3 Mar 2019 18:53:24 -0800 Subject: [PATCH 26/27] Don't generate prop sheets when a specific grammar path is passed --- cli/src/main.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 147f7fad..3769efa0 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -96,12 +96,14 @@ fn run() -> error::Result<()> { let grammar_path = matches.value_of("grammar-path"); let minimize = !matches.is_present("no-minimize"); let properties_only = matches.is_present("properties-only"); + let parser_only = grammar_path.is_some(); let state_ids_to_log = matches .values_of("state-ids-to-log") .map_or(Vec::new(), |ids| { ids.filter_map(|id| usize::from_str_radix(id, 10).ok()) .collect() }); + if !properties_only { generate::generate_parser_in_directory( ¤t_dir, @@ -110,7 +112,10 @@ fn run() -> error::Result<()> { state_ids_to_log, )?; } - properties::generate_property_sheets_in_directory(¤t_dir)?; + + if !parser_only { + properties::generate_property_sheets_in_directory(¤t_dir)?; + } } else if let Some(matches) = matches.subcommand_matches("test") { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); From a20fc3c111e380e9dd077f3d4e35e1b6b235dd1c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 4 Mar 2019 10:10:04 -0800 
Subject: [PATCH 27/27] Remove unnecessary character escape processing for regexes Fixes #289 --- cli/src/generate/dsl.js | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cli/src/generate/dsl.js b/cli/src/generate/dsl.js index c18ac530..4cc31060 100644 --- a/cli/src/generate/dsl.js +++ b/cli/src/generate/dsl.js @@ -1,5 +1,3 @@ -const UNICODE_ESCAPE_PATTERN = /\\u([0-9a-f]{4})/gi; - function alias(rule, value) { const result = { type: "ALIAS", @@ -172,12 +170,8 @@ function normalize(value) { }; case RegExp: return { - type: 'PATTERN', - value: value.source - .replace( - UNICODE_ESCAPE_PATTERN, - (match, group) => String.fromCharCode(parseInt(group, 16)) - ) + type: 'PATTERN', + value: value.source }; case ReferenceError: throw value