From feac368a3030c5f87e60c53513835ab3fe9f8a96 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 4 Mar 2020 14:27:31 -0800 Subject: [PATCH 01/42] Start work on new tree-sitter-tags crate Co-Authored-By: Patrick Thomson --- Cargo.lock | 15 +++++++ cli/Cargo.toml | 4 ++ cli/src/lib.rs | 1 + cli/src/loader.rs | 76 ++++++++++++++++++++----------- cli/src/main.rs | 76 ++++++++++++++++++++++++++----- cli/src/tags.rs | 16 +++++++ tags/Cargo.toml | 30 +++++++++++++ tags/src/lib.rs | 111 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 291 insertions(+), 38 deletions(-) create mode 100644 cli/src/tags.rs create mode 100644 tags/Cargo.toml create mode 100644 tags/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 172e2d78..76a9973a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -608,6 +608,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" name = "serde" version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "serde_derive" @@ -772,6 +775,7 @@ dependencies = [ "tiny_http 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.6.3", "tree-sitter-highlight 0.1.6", + "tree-sitter-tags 0.1.6", "webbrowser 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -786,6 +790,17 @@ dependencies = [ "tree-sitter 0.6.3", ] +[[package]] +name = "tree-sitter-tags" +version = "0.1.6" +dependencies = [ + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter 0.6.3", +] + [[package]] name = "ucd-util" version = "0.1.3" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 4304d1b6..27706945 100644 --- 
a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -44,6 +44,10 @@ path = "../lib" version = ">= 0.1.0" path = "../highlight" +[dependencies.tree-sitter-tags] +version = ">= 0.1.0" +path = "../tags" + [dependencies.serde_json] version = "1.0" features = ["preserve_order"] diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 945fe339..97c288a1 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -6,6 +6,7 @@ pub mod loader; pub mod logger; pub mod parse; pub mod query; +pub mod tags; pub mod test; pub mod test_highlight; pub mod util; diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 1f9a1978..b761c137 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -12,6 +12,7 @@ use std::time::SystemTime; use std::{fs, mem}; use tree_sitter::Language; use tree_sitter_highlight::HighlightConfiguration; +use tree_sitter_tags::TagsConfiguration; #[cfg(unix)] const DYLIB_EXTENSION: &'static str = "so"; @@ -33,6 +34,7 @@ pub struct LanguageConfiguration<'a> { pub locals_filenames: Option>, language_id: usize, highlight_config: OnceCell>, + tags_config: OnceCell>, highlight_names: &'a Mutex>, use_all_highlight_names: bool, } @@ -481,6 +483,7 @@ impl Loader { locals_filenames: config_json.locals.into_vec(), highlights_filenames: config_json.highlights.into_vec(), highlight_config: OnceCell::new(), + tags_config: OnceCell::new(), highlight_names: &*self.highlight_names, use_all_highlight_names: self.use_all_highlight_names, }; @@ -513,6 +516,7 @@ impl Loader { locals_filenames: None, highlights_filenames: None, highlight_config: OnceCell::new(), + tags_config: OnceCell::new(), highlight_names: &*self.highlight_names, use_all_highlight_names: self.use_all_highlight_names, }; @@ -534,32 +538,11 @@ impl<'a> LanguageConfiguration<'a> { pub fn highlight_config(&self, language: Language) -> Result> { self.highlight_config .get_or_try_init(|| { - let queries_path = self.root_path.join("queries"); - let read_queries = |paths: &Option>, default_path: &str| { - if let Some(paths) = 
paths.as_ref() { - let mut query = String::new(); - for path in paths { - let path = self.root_path.join(path); - query += &fs::read_to_string(&path).map_err(Error::wrap(|| { - format!("Failed to read query file {:?}", path) - }))?; - } - Ok(query) - } else { - let path = queries_path.join(default_path); - if path.exists() { - fs::read_to_string(&path).map_err(Error::wrap(|| { - format!("Failed to read query file {:?}", path) - })) - } else { - Ok(String::new()) - } - } - }; - - let highlights_query = read_queries(&self.highlights_filenames, "highlights.scm")?; - let injections_query = read_queries(&self.injections_filenames, "injections.scm")?; - let locals_query = read_queries(&self.locals_filenames, "locals.scm")?; + let highlights_query = + self.read_queries(&self.highlights_filenames, "highlights.scm")?; + let injections_query = + self.read_queries(&self.injections_filenames, "injections.scm")?; + let locals_query = self.read_queries(&self.locals_filenames, "locals.scm")?; if highlights_query.is_empty() { Ok(None) @@ -587,6 +570,47 @@ impl<'a> LanguageConfiguration<'a> { }) .map(Option::as_ref) } + + pub fn tags_config(&self, language: Language) -> Result> { + self.tags_config + .get_or_try_init(|| { + let tags_query = self.read_queries(&self.highlights_filenames, "tags.scm")?; + let locals_query = self.read_queries(&self.locals_filenames, "locals.scm")?; + if tags_query.is_empty() { + Ok(None) + } else { + TagsConfiguration::new(language, &tags_query, &locals_query) + .map_err(Error::wrap(|| { + format!("Failed to load queries in {:?}", self.root_path) + })) + .map(|config| Some(config)) + } + }) + .map(Option::as_ref) + } + + fn read_queries(&self, paths: &Option>, default_path: &str) -> Result { + if let Some(paths) = paths.as_ref() { + let mut query = String::new(); + for path in paths { + let path = self.root_path.join(path); + query += &fs::read_to_string(&path).map_err(Error::wrap(|| { + format!("Failed to read query file {:?}", path) + }))?; + } + 
Ok(query) + } else { + let queries_path = self.root_path.join("queries"); + let path = queries_path.join(default_path); + if path.exists() { + fs::read_to_string(&path).map_err(Error::wrap(|| { + format!("Failed to read query file {:?}", path) + })) + } else { + Ok(String::new()) + } + } + } } fn needs_recompile( diff --git a/cli/src/main.rs b/cli/src/main.rs index 79d310fe..0bbf6b25 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -6,8 +6,8 @@ use std::process::exit; use std::{env, fs, u64}; use tree_sitter::Language; use tree_sitter_cli::{ - config, error, generate, highlight, loader, logger, parse, query, test, test_highlight, wasm, - web_ui, + config, error, generate, highlight, loader, logger, parse, query, tags, test, test_highlight, + wasm, web_ui, }; const BUILD_VERSION: &'static str = env!("CARGO_PKG_VERSION"); @@ -88,6 +88,30 @@ fn run() -> error::Result<()> { .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("captures").long("captures").short("c")), ) + .subcommand( + SubCommand::with_name("tags") + .arg( + Arg::with_name("format") + .short("f") + .long("format") + .value_name("json|protobuf") + .help("Determine output format (default: json)"), + ) + .arg(Arg::with_name("scope").long("scope").takes_value(true)) + .arg( + Arg::with_name("inputs") + .help("The source file to use") + .index(1) + .required(true) + .multiple(true), + ) + .arg( + Arg::with_name("v") + .short("v") + .multiple(true) + .help("Sets the level of verbosity"), + ), + ) .subcommand( SubCommand::with_name("test") .about("Run a parser's tests") @@ -240,6 +264,38 @@ fn run() -> error::Result<()> { )?; let query_path = Path::new(matches.value_of("query-path").unwrap()); query::query_files_at_paths(language, paths, query_path, ordered_captures)?; + } else if let Some(matches) = matches.subcommand_matches("tags") { + loader.find_all_languages(&config.parser_directories)?; + let paths = collect_paths(matches.values_of("inputs").unwrap())?; + + let mut 
lang = None; + if let Some(scope) = matches.value_of("scope") { + lang = loader.language_configuration_for_scope(scope)?; + if lang.is_none() { + return Error::err(format!("Unknown scope '{}'", scope)); + } + } + + for path in paths { + let path = Path::new(&path); + let (language, language_config) = match lang { + Some(v) => v, + None => match loader.language_configuration_for_file_name(path)? { + Some(v) => v, + None => { + eprintln!("No language found for path {:?}", path); + continue; + } + }, + }; + + if let Some(tags_config) = language_config.tags_config(language)? { + let source = fs::read(path)?; + tags::generate_tags(tags_config, &source)?; + } else { + eprintln!("No tags config found for path {:?}", path); + } + } } else if let Some(matches) = matches.subcommand_matches("highlight") { loader.configure_highlights(&config.theme.highlight_names); loader.find_all_languages(&config.parser_directories)?; @@ -251,19 +307,17 @@ fn run() -> error::Result<()> { println!("{}", highlight::HTML_HEADER); } - let language_config; + let mut lang = None; if let Some(scope) = matches.value_of("scope") { - language_config = loader.language_configuration_for_scope(scope)?; - if language_config.is_none() { + lang = loader.language_configuration_for_scope(scope)?; + if lang.is_none() { return Error::err(format!("Unknown scope '{}'", scope)); } - } else { - language_config = None; } for path in paths { let path = Path::new(&path); - let (language, language_config) = match language_config { + let (language, language_config) = match lang { Some(v) => v, None => match loader.language_configuration_for_file_name(path)? { Some(v) => v, @@ -274,23 +328,21 @@ fn run() -> error::Result<()> { }, }; - let source = fs::read(path)?; - if let Some(highlight_config) = language_config.highlight_config(language)? 
{ + let source = fs::read(path)?; if html_mode { highlight::html(&loader, &config.theme, &source, highlight_config, time)?; } else { highlight::ansi(&loader, &config.theme, &source, highlight_config, time)?; } } else { - return Error::err(format!("No syntax highlighting query found")); + eprintln!("No syntax highlighting config found for path {:?}", path); } } if html_mode { println!("{}", highlight::HTML_FOOTER); } - } else if let Some(matches) = matches.subcommand_matches("build-wasm") { let grammar_path = current_dir.join(matches.value_of("path").unwrap_or("")); wasm::compile_language_to_wasm(&grammar_path, matches.is_present("docker"))?; diff --git a/cli/src/tags.rs b/cli/src/tags.rs new file mode 100644 index 00000000..23d448fc --- /dev/null +++ b/cli/src/tags.rs @@ -0,0 +1,16 @@ +use crate::error::Result; +use std::io; +use tree_sitter_tags::{TagsConfiguration, TagsContext}; + +pub fn generate_tags(config: &TagsConfiguration, source: &[u8]) -> Result<()> { + let mut context = TagsContext::new(); + + let stdout = io::stdout(); + let mut stdout = stdout.lock(); + + for tag in context.generate_tags(config, source) { + serde_json::to_writer(&mut stdout, &tag)?; + } + + Ok(()) +} diff --git a/tags/Cargo.toml b/tags/Cargo.toml new file mode 100644 index 00000000..1d47c951 --- /dev/null +++ b/tags/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "tree-sitter-tags" +description = "Library for extracting tag information" +version = "0.1.6" +authors = [ + "Max Brunsfeld ", + "Patrick Thomson " +] +license = "MIT" +readme = "README.md" +edition = "2018" +keywords = ["incremental", "parsing", "syntax", "highlighting"] +categories = ["parsing", "text-editors"] +repository = "https://github.com/tree-sitter/tree-sitter" + +[lib] +crate-type = ["lib", "staticlib"] + +[dependencies] +regex = "1" +serde_json = "1.0" +serde_derive = "1.0" + +[dependencies.serde] +version = "1.0" +features = ["derive"] + +[dependencies.tree-sitter] +version = ">= 0.3.7" +path = "../lib" diff --git 
a/tags/src/lib.rs b/tags/src/lib.rs new file mode 100644 index 00000000..3cad20b5 --- /dev/null +++ b/tags/src/lib.rs @@ -0,0 +1,111 @@ +use serde::{Serialize, Serializer}; +use tree_sitter::{Language, Parser, Query, QueryCursor, QueryError}; + +/// Contains the data neeeded to compute tags for code written in a +/// particular language. +pub struct TagsConfiguration { + pub language: Language, + pub query: Query, + locals_pattern_index: usize, +} + +pub struct TagsContext { + parser: Parser, + cursor: QueryCursor, +} + +#[derive(Serialize)] +pub struct Range { + pub start: i64, + pub end: i64, +} + +#[derive(Serialize)] +pub struct Loc { + pub byte_range: Range, + pub span: Span, +} + +#[derive(Serialize)] +pub struct Span { + pub start: Pos, + pub end: Pos, +} + +#[derive(Serialize)] +pub struct Pos { + pub line: i64, + pub column: i64, +} + +pub enum TagKind { + Function, + Method, + Class, + Module, + Call, +} + +#[derive(Serialize)] +pub struct Tag<'a> { + pub kind: TagKind, + pub loc: Loc, + pub name: &'a str, + pub line: &'a str, + pub docs: Option<&'a str>, +} + +impl Serialize for TagKind { + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + match self { + TagKind::Call => "Call".serialize(s), + TagKind::Module => "Module".serialize(s), + TagKind::Class => "Class".serialize(s), + TagKind::Method => "Method".serialize(s), + TagKind::Function => "Function".serialize(s), + } + } +} + +impl TagsConfiguration { + pub fn new( + language: Language, + tags_query: &str, + locals_query: &str, + ) -> Result { + let query = Query::new(language, &format!("{}{}", tags_query, locals_query))?; + + let locals_query_offset = tags_query.len(); + let mut locals_pattern_index = 0; + for i in 0..(query.pattern_count()) { + let pattern_offset = query.start_byte_for_pattern(i); + if pattern_offset < locals_query_offset { + locals_pattern_index += 1; + } + } + + query.pattern_count(); + query.start_byte_for_pattern(5); + Ok(TagsConfiguration { + language, + query, 
+ locals_pattern_index, + }) + } +} + +impl TagsContext { + pub fn new() -> Self { + TagsContext { + parser: Parser::new(), + cursor: QueryCursor::new(), + } + } + + pub fn generate_tags(&mut self, config: &TagsConfiguration, source: &[u8]) -> Vec { + Vec::new() + } +} From 38a9f33d9e830275b5bf4abeeef1cacd0442b1be Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Thu, 5 Mar 2020 09:57:57 -0500 Subject: [PATCH 02/42] Simplify Serialize implementation for TagKind. --- tags/src/lib.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 3cad20b5..64e79020 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -61,12 +61,13 @@ impl Serialize for TagKind { S: Serializer, { match self { - TagKind::Call => "Call".serialize(s), - TagKind::Module => "Module".serialize(s), - TagKind::Class => "Class".serialize(s), - TagKind::Method => "Method".serialize(s), - TagKind::Function => "Function".serialize(s), + TagKind::Call => "Call", + TagKind::Module => "Module", + TagKind::Class => "Class", + TagKind::Method => "Method", + TagKind::Function => "Function", } + .serialize(s) } } From a3f0087b11d6ab5faebbb68f37e8dc1231c8f1e4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 5 Mar 2020 13:04:49 -0800 Subject: [PATCH 03/42] Start work on tagging unit test Co-Authored-By: Patrick Thomson --- cli/src/tests/mod.rs | 1 + cli/src/tests/tags_test.rs | 56 ++++++++++++++++++++++++++++++++++++++ tags/src/lib.rs | 32 ++++++++++++++++------ 3 files changed, 81 insertions(+), 8 deletions(-) create mode 100644 cli/src/tests/tags_test.rs diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index 0ccb0ae0..ac54db00 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -4,5 +4,6 @@ mod highlight_test; mod node_test; mod parser_test; mod query_test; +mod tags_test; mod test_highlight_test; mod tree_test; diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs new file mode 100644 index 
00000000..e9ad908c --- /dev/null +++ b/cli/src/tests/tags_test.rs @@ -0,0 +1,56 @@ +use super::helpers::fixtures::get_language; +use tree_sitter_tags::{TagKind, TagsConfiguration, TagsContext}; + +#[test] +fn test_tags_javascript() { + let language = get_language("python"); + let tags_config = TagsConfiguration::new( + language, + r#" + ((function_definition + name: (identifier) @name + body: (block + . (string) @doc)) @function + (set! replace @doc "(^['\s]*)|(['\s]*$)")) + + (function_definition + name: (identifier) @name) @function + (class_definition + name: (identifier) @name) @class + (call + function: (identifier) @name) @call + "#, + "", + ) + .unwrap(); + + let mut tag_context = TagsContext::new(); + let tags = tag_context.generate_tags( + &tags_config, + br#" + class Customer: + """ + Data about a customer + """ + + def age(self): + """ + Get the customer's age + """ + compute_age(self.id); + } + "#, + ); + + assert_eq!( + tags.iter().map(|t| (t.name, t.kind)).collect::>(), + &[ + ("Customer", TagKind::Class), + ("age", TagKind::Function), + ("compute_age", TagKind::Call), + ] + ); + + assert_eq!(tags[0].docs, Some("Data about a customer")); + assert_eq!(tags[1].docs, Some("Get the customer's age")); +} diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 64e79020..bc4f8e9a 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -14,30 +14,31 @@ pub struct TagsContext { cursor: QueryCursor, } -#[derive(Serialize)] +#[derive(Debug, Serialize)] pub struct Range { pub start: i64, pub end: i64, } -#[derive(Serialize)] +#[derive(Debug, Serialize)] pub struct Loc { pub byte_range: Range, pub span: Span, } -#[derive(Serialize)] +#[derive(Debug, Serialize)] pub struct Span { pub start: Pos, pub end: Pos, } -#[derive(Serialize)] +#[derive(Debug, Serialize)] pub struct Pos { pub line: i64, pub column: i64, } +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum TagKind { Function, Method, @@ -46,7 +47,7 @@ pub enum TagKind { Call, } -#[derive(Serialize)] 
+#[derive(Debug, Serialize)] pub struct Tag<'a> { pub kind: TagKind, pub loc: Loc, @@ -88,8 +89,6 @@ impl TagsConfiguration { } } - query.pattern_count(); - query.start_byte_for_pattern(5); Ok(TagsConfiguration { language, query, @@ -107,6 +106,23 @@ impl TagsContext { } pub fn generate_tags(&mut self, config: &TagsConfiguration, source: &[u8]) -> Vec { - Vec::new() + self.parser + .set_language(config.language) + .expect("Incompatible language"); + let tree = self + .parser + .parse(source, None) + .expect("Parsing failed unexpectedly"); + let matches = self + .cursor + .matches(&config.query, tree.root_node(), |node| { + &source[node.byte_range()] + }); + matches + .map(|mat| { + for capture in mat.captures {} + unimplemented!(); + }) + .collect() } } From 680a9e053101dbaf4fdc56b30469ade80e4104f1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 6 Mar 2020 13:24:03 -0800 Subject: [PATCH 04/42] wip --- cli/src/tests/tags_test.rs | 4 +- tags/src/lib.rs | 138 +++++++++++++++++++++++++++---------- 2 files changed, 104 insertions(+), 38 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index e9ad908c..d293806b 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -9,10 +9,8 @@ fn test_tags_javascript() { r#" ((function_definition name: (identifier) @name - body: (block - . (string) @doc)) @function + body: (block . (string) @doc)) @function (set! replace @doc "(^['\s]*)|(['\s]*$)")) - (function_definition name: (identifier) @name) @function (class_definition diff --git a/tags/src/lib.rs b/tags/src/lib.rs index bc4f8e9a..8e48cea0 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,12 +1,19 @@ use serde::{Serialize, Serializer}; -use tree_sitter::{Language, Parser, Query, QueryCursor, QueryError}; +use std::{ops, str}; +use tree_sitter::{Language, Node, Parser, Point, Query, QueryCursor, QueryError}; /// Contains the data neeeded to compute tags for code written in a /// particular language. 
pub struct TagsConfiguration { pub language: Language, pub query: Query, + call_capture_index: Option, + class_capture_index: Option, + doc_capture_index: Option, + function_capture_index: Option, locals_pattern_index: usize, + module_capture_index: Option, + name_capture_index: Option, } pub struct TagsContext { @@ -14,22 +21,10 @@ pub struct TagsContext { cursor: QueryCursor, } -#[derive(Debug, Serialize)] -pub struct Range { - pub start: i64, - pub end: i64, -} - #[derive(Debug, Serialize)] pub struct Loc { - pub byte_range: Range, - pub span: Span, -} - -#[derive(Debug, Serialize)] -pub struct Span { - pub start: Pos, - pub end: Pos, + pub byte_range: ops::Range, + pub span: ops::Range, } #[derive(Debug, Serialize)] @@ -56,22 +51,6 @@ pub struct Tag<'a> { pub docs: Option<&'a str>, } -impl Serialize for TagKind { - fn serialize(&self, s: S) -> Result - where - S: Serializer, - { - match self { - TagKind::Call => "Call", - TagKind::Module => "Module", - TagKind::Class => "Class", - TagKind::Method => "Method", - TagKind::Function => "Function", - } - .serialize(s) - } -} - impl TagsConfiguration { pub fn new( language: Language, @@ -89,10 +68,35 @@ impl TagsConfiguration { } } + let mut call_capture_index = None; + let mut class_capture_index = None; + let mut doc_capture_index = None; + let mut function_capture_index = None; + let mut module_capture_index = None; + let mut name_capture_index = None; + for (i, name) in query.capture_names().iter().enumerate() { + let index = match name.as_str() { + "call" => &mut call_capture_index, + "class" => &mut class_capture_index, + "doc" => &mut doc_capture_index, + "function" => &mut function_capture_index, + "module" => &mut module_capture_index, + "name" => &mut name_capture_index, + _ => continue, + }; + *index = Some(i as u32); + } + Ok(TagsConfiguration { language, query, locals_pattern_index, + function_capture_index, + class_capture_index, + module_capture_index, + doc_capture_index, + call_capture_index, + 
name_capture_index, }) } } @@ -105,7 +109,11 @@ impl TagsContext { } } - pub fn generate_tags(&mut self, config: &TagsConfiguration, source: &[u8]) -> Vec { + pub fn generate_tags<'a>( + &mut self, + config: &TagsConfiguration, + source: &'a [u8], + ) -> Vec> { self.parser .set_language(config.language) .expect("Incompatible language"); @@ -119,10 +127,70 @@ impl TagsContext { &source[node.byte_range()] }); matches - .map(|mat| { - for capture in mat.captures {} - unimplemented!(); + .filter_map(|mat| { + let mut call_node = None; + let mut doc_node = None; + let mut class_node = None; + let mut function_node = None; + let mut module_node = None; + let mut name_node = None; + + for capture in mat.captures { + let index = Some(capture.index); + let node = Some(capture.node); + if index == config.call_capture_index { + call_node = node; + } else if index == config.class_capture_index { + class_node = node; + } else if index == config.doc_capture_index { + doc_node = node; + } else if index == config.function_capture_index { + function_node = node; + } else if index == config.module_capture_index { + module_node = node; + } else if index == config.name_capture_index { + name_node = node; + } + } + + if let (Some(function), Some(name)) = (function_node, name_node) { + if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { + return Some(Tag { + name, + line: "", + loc: loc_for_node(function), + kind: TagKind::Function, + docs: doc_node + .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()), + }); + } + } + + None }) .collect() } } + +impl Serialize for TagKind { + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + match self { + TagKind::Call => "Call", + TagKind::Module => "Module", + TagKind::Class => "Class", + TagKind::Method => "Method", + TagKind::Function => "Function", + } + .serialize(s) + } +} + +fn loc_for_node(node: Node) -> Loc { + Loc { + byte_range: node.byte_range(), + span: node.start_position()..node.start_position(), + } 
+} From 8546a71c31438fcaac6ca297a172b3226942ab3d Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Fri, 6 Mar 2020 16:48:15 -0500 Subject: [PATCH 05/42] Implement class and call tagging. --- tags/src/lib.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 8e48cea0..3e582b84 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -153,17 +153,26 @@ impl TagsContext { } } - if let (Some(function), Some(name)) = (function_node, name_node) { + let tag_from_node = |kind, name: Node, node| { if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { return Some(Tag { name, - line: "", - loc: loc_for_node(function), - kind: TagKind::Function, + line: "TODO", + loc: loc_for_node(node), + kind: kind, docs: doc_node .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()), }); - } + }; + return None; + }; + + if let (Some(function), Some(name)) = (function_node, name_node) { + return tag_from_node(TagKind::Function, name, function); + } else if let (Some(call), Some(name)) = (call_node, name_node) { + return tag_from_node(TagKind::Call, name, call); + } else if let (Some(class), Some(name)) = (class_node, name_node) { + return tag_from_node(TagKind::Class, name, class); } None @@ -191,6 +200,15 @@ impl Serialize for TagKind { fn loc_for_node(node: Node) -> Loc { Loc { byte_range: node.byte_range(), - span: node.start_position()..node.start_position(), + span: node.start_position().into()..node.start_position().into(), + } +} + +impl From for Pos { + fn from(point: tree_sitter::Point) -> Self { + return Pos { + line: point.row as i64, + column: point.column as i64, + }; } } From 00dcc1eaa6a0ea5c291eb32dabea58935c5d829c Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Fri, 6 Mar 2020 17:48:55 -0500 Subject: [PATCH 06/42] Need to use expression_statement here. 
--- cli/src/tests/tags_test.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index d293806b..f8ad2527 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -9,10 +9,14 @@ fn test_tags_javascript() { r#" ((function_definition name: (identifier) @name - body: (block . (string) @doc)) @function + body: (block . (expression_statement (string) @doc))) @function (set! replace @doc "(^['\s]*)|(['\s]*$)")) (function_definition name: (identifier) @name) @function + ((class_definition + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @class + (set! replace @doc "(^['\s]*)|(['\s]*$)")) (class_definition name: (identifier) @name) @class (call From 5d8e288b36bbbbee6ad1fc87a196debdbc0f93bb Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Fri, 6 Mar 2020 17:57:24 -0500 Subject: [PATCH 07/42] Keep track of the last-matched kind to simulate alternating choice. --- tags/src/lib.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 3e582b84..b54eaff3 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -126,6 +126,7 @@ impl TagsContext { .matches(&config.query, tree.root_node(), |node| { &source[node.byte_range()] }); + let mut last_matched_kind = None; matches .filter_map(|mat| { let mut call_node = None; @@ -153,8 +154,15 @@ impl TagsContext { } } - let tag_from_node = |kind, name: Node, node| { + let mut tag_from_node = |kind, name: Node, node: Node| { if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { + let current_kind = Some(node.kind()); + if last_matched_kind == current_kind { + return None; + } else { + last_matched_kind = current_kind; + } + return Some(Tag { name, line: "TODO", @@ -167,7 +175,9 @@ impl TagsContext { return None; }; - if let (Some(function), Some(name)) = (function_node, name_node) { + if let (Some(function), Some(name), Some(_doc)) = + 
(function_node, name_node, doc_node) + { return tag_from_node(TagKind::Function, name, function); } else if let (Some(call), Some(name)) = (call_node, name_node) { return tag_from_node(TagKind::Call, name, call); From dd181073149bd6fc63fb1b1982776a541f6ae157 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Mon, 9 Mar 2020 15:28:29 -0400 Subject: [PATCH 08/42] WIP --- tags/src/lib.rs | 150 +++++++++++++++++++++++++++++------------------- 1 file changed, 92 insertions(+), 58 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index b54eaff3..1e682b15 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,4 +1,5 @@ use serde::{Serialize, Serializer}; +use std::collections::HashMap; use std::{ops, str}; use tree_sitter::{Language, Node, Parser, Point, Query, QueryCursor, QueryError}; @@ -21,13 +22,13 @@ pub struct TagsContext { cursor: QueryCursor, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct Loc { pub byte_range: ops::Range, pub span: ops::Range, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct Pos { pub line: i64, pub column: i64, @@ -42,7 +43,7 @@ pub enum TagKind { Call, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct Tag<'a> { pub kind: TagKind, pub loc: Loc, @@ -109,6 +110,7 @@ impl TagsContext { } } + // TODO: This should return an iterator rather than build up a vector pub fn generate_tags<'a>( &mut self, config: &TagsConfiguration, @@ -126,68 +128,100 @@ impl TagsContext { .matches(&config.query, tree.root_node(), |node| { &source[node.byte_range()] }); - let mut last_matched_kind = None; - matches - .filter_map(|mat| { - let mut call_node = None; - let mut doc_node = None; - let mut class_node = None; - let mut function_node = None; - let mut module_node = None; - let mut name_node = None; + let mut neighbor_map: HashMap, usize)> = HashMap::new(); - for capture in mat.captures { - let index = Some(capture.index); - let node = Some(capture.node); - if 
index == config.call_capture_index { - call_node = node; - } else if index == config.class_capture_index { - class_node = node; - } else if index == config.doc_capture_index { - doc_node = node; - } else if index == config.function_capture_index { - function_node = node; - } else if index == config.module_capture_index { - module_node = node; - } else if index == config.name_capture_index { - name_node = node; - } + for mat in matches { + let mut call_node = None; + let mut doc_node = None; + let mut class_node = None; + let mut function_node = None; + let mut module_node = None; + let mut name_node = None; + + for capture in mat.captures { + let index = Some(capture.index); + let node = Some(capture.node); + if index == config.call_capture_index { + call_node = node; + } else if index == config.class_capture_index { + class_node = node; + } else if index == config.doc_capture_index { + doc_node = node; + } else if index == config.function_capture_index { + function_node = node; + } else if index == config.module_capture_index { + module_node = node; + } else if index == config.name_capture_index { + name_node = node; } + } - let mut tag_from_node = |kind, name: Node, node: Node| { - if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { - let current_kind = Some(node.kind()); - if last_matched_kind == current_kind { - return None; - } else { - last_matched_kind = current_kind; + let tag_from_node = |node: Node| -> Option { + return None; + } + + for (optFound, theKind) in [ + (call_node, TagKind::Call), + (class_node, TagKind::Class), + (function_node, TagKind::Function), + (module_node, TagKind::Module), + ] { + if let (Some(found), Some(name)) = (optFound, name_node) { + match neighbor_map.entry(found) { + hash_map::Entry::Occupied(entry) => { + let (tag, old_idx) = entry.get_mut(); + if old_idx > mat.pattern_index { + *tag = + } } + } + } + } + } - return Some(Tag { - name, - line: "TODO", - loc: loc_for_node(node), - kind: kind, - docs: doc_node - 
.and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()), - }); - }; - return None; - }; + // some computation + return neighbor_map.into_iter().map(|t| (t.1).0).collect(); - if let (Some(function), Some(name), Some(_doc)) = - (function_node, name_node, doc_node) - { - return tag_from_node(TagKind::Function, name, function); - } else if let (Some(call), Some(name)) = (call_node, name_node) { - return tag_from_node(TagKind::Call, name, call); - } else if let (Some(class), Some(name)) = (class_node, name_node) { - return tag_from_node(TagKind::Class, name, class); - } + // matches + // .filter_map(|mat| { - None - }) - .collect() + // for capture in mat.captures { + // + // + + // } + + // let tag_from_node = |kind, name: Node, node| -> Option { + // if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { + // if let Some((tag, index)) = neighbor_map.get(&node) { + // if index > &mat.pattern_index { + // return Some(tag.clone()); + // } + // } + + // return Some(Tag { + // name, + // line: "TODO", + // loc: loc_for_node(node), + // kind: kind, + // docs: doc_node + // .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()), + // }); + // }; + // return None; + // }; + + // if let (Some(function), Some(name)) = (function_node, name_node) { + // return tag_from_node(TagKind::Function, name, function); + // } else if let (Some(call), Some(name)) = (call_node, name_node) { + // return tag_from_node(TagKind::Call, name, call); + // } else if let (Some(class), Some(name)) = (class_node, name_node) { + // return tag_from_node(TagKind::Class, name, class); + // } + + // None + // }) + // .collect() } } From 3c79a10c85de2302ffadcffad4f92d942712bda4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 09:43:01 -0700 Subject: [PATCH 09/42] Use a hash map to dedup tags Co-Authored-By: Patrick Thomson --- tags/src/lib.rs | 83 ++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 53 deletions(-) diff --git 
a/tags/src/lib.rs b/tags/src/lib.rs index 1e682b15..b79a22df 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,5 +1,5 @@ use serde::{Serialize, Serializer}; -use std::collections::HashMap; +use std::collections::{hash_map, HashMap}; use std::{ops, str}; use tree_sitter::{Language, Node, Parser, Point, Query, QueryCursor, QueryError}; @@ -156,72 +156,49 @@ impl TagsContext { } } - let tag_from_node = |node: Node| -> Option { - return None; - } + let tag_from_node = |node: Node, kind: TagKind| -> Option { + let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; + let docs = doc_node.and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()); + Some(Tag { + name, + line: "TODO", + loc: loc_for_node(node), + kind: kind, + docs, + }) + }; - for (optFound, theKind) in [ + for (tag_node, tag_kind) in [ (call_node, TagKind::Call), (class_node, TagKind::Class), (function_node, TagKind::Function), (module_node, TagKind::Module), - ] { - if let (Some(found), Some(name)) = (optFound, name_node) { + ] + .iter() + .cloned() + { + if let Some(found) = tag_node { match neighbor_map.entry(found) { - hash_map::Entry::Occupied(entry) => { + hash_map::Entry::Occupied(mut entry) => { let (tag, old_idx) = entry.get_mut(); - if old_idx > mat.pattern_index { - *tag = + if *old_idx > mat.pattern_index { + if let Some(new_tag) = tag_from_node(found, tag_kind) { + *tag = new_tag; + *old_idx = mat.pattern_index; + } + } + } + hash_map::Entry::Vacant(entry) => { + if let Some(tag) = tag_from_node(found, tag_kind) { + entry.insert((tag, mat.pattern_index)); } } } - } + } } } - // some computation return neighbor_map.into_iter().map(|t| (t.1).0).collect(); - - // matches - // .filter_map(|mat| { - - // for capture in mat.captures { - // - // - - // } - - // let tag_from_node = |kind, name: Node, node| -> Option { - // if let Ok(name) = str::from_utf8(&source[name.byte_range()]) { - // if let Some((tag, index)) = neighbor_map.get(&node) { - // if index > &mat.pattern_index { - 
// return Some(tag.clone()); - // } - // } - - // return Some(Tag { - // name, - // line: "TODO", - // loc: loc_for_node(node), - // kind: kind, - // docs: doc_node - // .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()), - // }); - // }; - // return None; - // }; - - // if let (Some(function), Some(name)) = (function_node, name_node) { - // return tag_from_node(TagKind::Function, name, function); - // } else if let (Some(call), Some(name)) = (call_node, name_node) { - // return tag_from_node(TagKind::Call, name, call); - // } else if let (Some(class), Some(name)) = (class_node, name_node) { - // return tag_from_node(TagKind::Class, name, class); - // } - - // None - // }) - // .collect() } } From 90cacca04046fe91282660da72fda93b599a3fc1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 10:42:22 -0700 Subject: [PATCH 10/42] rust: Simplify and generalize handling of set! and is? predicates --- lib/binding_rust/lib.rs | 51 ++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index bf732faa..4a04ff13 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -1425,46 +1425,39 @@ impl Query { ))); } - let mut i = 0; let mut capture_id = None; - if args[i].type_ == ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture { - capture_id = Some(args[i].value_id as usize); - i += 1; - - if i == args.len() { - return Err(QueryError::Predicate(format!( - "No key specified for {} predicate.", - function_name, - ))); - } - if args[i].type_ == ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture { - return Err(QueryError::Predicate(format!( - "Invalid arguments to {} predicate. 
Expected string, got @{}", - function_name, capture_names[args[i].value_id as usize] - ))); - } - } - - let key = &string_values[args[i].value_id as usize]; - i += 1; - + let mut key = None; let mut value = None; - if i < args.len() { - if args[i].type_ == ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture { + + for arg in args { + if arg.type_ == ffi::TSQueryPredicateStepType_TSQueryPredicateStepTypeCapture { if capture_id.is_some() { return Err(QueryError::Predicate(format!( "Invalid arguments to {} predicate. Unexpected second capture name @{}", - function_name, capture_names[args[i].value_id as usize] + function_name, capture_names[arg.value_id as usize] ))); - } else { - capture_id = Some(args[i].value_id as usize); } + capture_id = Some(arg.value_id as usize); + } else if key.is_none() { + key = Some(&string_values[arg.value_id as usize]); + } else if value.is_none() { + value = Some(string_values[arg.value_id as usize].as_str()); } else { - value = Some(string_values[args[i].value_id as usize].as_str()); + return Err(QueryError::Predicate(format!( + "Invalid arguments to {} predicate. Unexpected third argument @{}", + function_name, string_values[arg.value_id as usize] + ))); } } - Ok(QueryProperty::new(key, value, capture_id)) + if let Some(key) = key { + Ok(QueryProperty::new(key, value, capture_id)) + } else { + return Err(QueryError::Predicate(format!( + "Invalid arguments to {} predicate. 
Missing key argument", + function_name, + ))); + } } } From 157258d8819707d42fa9c8d1c30b2ab129f9ae7c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 10:43:23 -0700 Subject: [PATCH 11/42] tags: Implement strip regex for docs processing Co-Authored-By: Patrick Thomson --- cli/src/error.rs | 6 ++++ cli/src/tests/tags_test.rs | 16 +++++------ tags/src/lib.rs | 58 ++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 16 deletions(-) diff --git a/cli/src/error.rs b/cli/src/error.rs index 73dcb732..824bd92f 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -81,6 +81,12 @@ impl<'a> From for Error { } } +impl<'a> From for Error { + fn from(error: tree_sitter_tags::Error) -> Self { + Error::new(format!("{:?}", error)) + } +} + impl From for Error { fn from(error: serde_json::Error) -> Self { Error::new(error.to_string()) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index f8ad2527..2cc79749 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -10,13 +10,13 @@ fn test_tags_javascript() { ((function_definition name: (identifier) @name body: (block . (expression_statement (string) @doc))) @function - (set! replace @doc "(^['\s]*)|(['\s]*$)")) + (set! strip @doc "(^['\s]*)|(['\s]*$)")) (function_definition name: (identifier) @name) @function ((class_definition name: (identifier) @name body: (block . (expression_statement (string) @doc))) @class - (set! replace @doc "(^['\s]*)|(['\s]*$)")) + (set! 
strip @doc "(^['\s]*)|(['\s]*$)")) (class_definition name: (identifier) @name) @class (call @@ -31,14 +31,14 @@ fn test_tags_javascript() { &tags_config, br#" class Customer: - """ + ''' Data about a customer - """ + ''' def age(self): - """ + ''' Get the customer's age - """ + ''' compute_age(self.id); } "#, @@ -53,6 +53,6 @@ fn test_tags_javascript() { ] ); - assert_eq!(tags[0].docs, Some("Data about a customer")); - assert_eq!(tags[1].docs, Some("Get the customer's age")); + assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer"); + assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); } diff --git a/tags/src/lib.rs b/tags/src/lib.rs index b79a22df..577c4608 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,7 +1,8 @@ +use regex::Regex; use serde::{Serialize, Serializer}; use std::collections::{hash_map, HashMap}; use std::{ops, str}; -use tree_sitter::{Language, Node, Parser, Point, Query, QueryCursor, QueryError}; +use tree_sitter::{Language, Node, Parser, Query, QueryCursor, QueryError}; /// Contains the data neeeded to compute tags for code written in a /// particular language. 
@@ -15,6 +16,7 @@ pub struct TagsConfiguration { locals_pattern_index: usize, module_capture_index: Option, name_capture_index: Option, + doc_strip_regexes: Vec>, } pub struct TagsContext { @@ -49,15 +51,17 @@ pub struct Tag<'a> { pub loc: Loc, pub name: &'a str, pub line: &'a str, - pub docs: Option<&'a str>, + pub docs: Option, +} + +#[derive(Debug)] +pub enum Error { + Query(QueryError), + Regex(regex::Error), } impl TagsConfiguration { - pub fn new( - language: Language, - tags_query: &str, - locals_query: &str, - ) -> Result { + pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result { let query = Query::new(language, &format!("{}{}", tags_query, locals_query))?; let locals_query_offset = tags_query.len(); @@ -88,6 +92,23 @@ impl TagsConfiguration { *index = Some(i as u32); } + let doc_strip_regexes = (0..query.pattern_count()) + .map(|pattern_index| { + let properties = query.property_settings(pattern_index); + for property in properties { + if property.key.as_ref() == "strip" + && property.capture_id.map(|id| id as u32) == doc_capture_index + { + if let Some(value) = &property.value { + let regex = Regex::new(value.as_ref())?; + return Ok(Some(regex)); + } + } + } + return Ok(None); + }) + .collect::, Error>>()?; + Ok(TagsConfiguration { language, query, @@ -98,6 +119,7 @@ impl TagsConfiguration { doc_capture_index, call_capture_index, name_capture_index, + doc_strip_regexes, }) } } @@ -158,7 +180,15 @@ impl TagsContext { let tag_from_node = |node: Node, kind: TagKind| -> Option { let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; - let docs = doc_node.and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()); + let docs = doc_node + .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) + .map(|s| { + if let Some(regex) = &config.doc_strip_regexes[mat.pattern_index] { + regex.replace_all(s, "").to_string() + } else { + s.to_string() + } + }); Some(Tag { name, line: "TODO", @@ -233,3 +263,15 @@ impl From for Pos { 
}; } } + +impl From for Error { + fn from(error: regex::Error) -> Self { + Error::Regex(error) + } +} + +impl From for Error { + fn from(error: QueryError) -> Self { + Error::Query(error) + } +} From 0eb162c6854032c560911cea01d59e803f5b9f5e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 11:45:31 -0700 Subject: [PATCH 12/42] wip: converting generate_tags to return an iterator --- cli/src/tests/tags_test.rs | 10 +++-- tags/src/lib.rs | 75 +++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 2cc79749..38d3cc83 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -27,9 +27,10 @@ fn test_tags_javascript() { .unwrap(); let mut tag_context = TagsContext::new(); - let tags = tag_context.generate_tags( - &tags_config, - br#" + let tags = tag_context + .generate_tags( + &tags_config, + br#" class Customer: ''' Data about a customer @@ -42,7 +43,8 @@ fn test_tags_javascript() { compute_age(self.id); } "#, - ); + ) + .collect::>(); assert_eq!( tags.iter().map(|t| (t.name, t.kind)).collect::>(), diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 577c4608..1c8ed5c5 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,8 +1,8 @@ use regex::Regex; use serde::{Serialize, Serializer}; use std::collections::{hash_map, HashMap}; -use std::{ops, str}; -use tree_sitter::{Language, Node, Parser, Query, QueryCursor, QueryError}; +use std::{mem, ops, str}; +use tree_sitter::{Language, Node, Parser, Query, QueryCursor, QueryError, Tree}; /// Contains the data neeeded to compute tags for code written in a /// particular language. 
@@ -24,6 +24,17 @@ pub struct TagsContext { cursor: QueryCursor, } +struct TagsIter<'a, I> +where + I: Iterator>, +{ + matches: I, + tree: Tree, + source: &'a [u8], + config: &'a TagsConfiguration, + tags: Vec<(Node<'a>, usize, Tag<'a>)>, +} + #[derive(Debug, Serialize, Clone)] pub struct Loc { pub byte_range: ops::Range, @@ -134,10 +145,10 @@ impl TagsContext { // TODO: This should return an iterator rather than build up a vector pub fn generate_tags<'a>( - &mut self, - config: &TagsConfiguration, + &'a mut self, + config: &'a TagsConfiguration, source: &'a [u8], - ) -> Vec> { + ) -> impl Iterator> + 'a { self.parser .set_language(config.language) .expect("Incompatible language"); @@ -145,14 +156,30 @@ impl TagsContext { .parser .parse(source, None) .expect("Parsing failed unexpectedly"); + let tree_ref = unsafe { mem::transmute::<_, &'static Tree>(&tree) }; let matches = self .cursor - .matches(&config.query, tree.root_node(), |node| { + .matches(&config.query, tree_ref.root_node(), move |node| { &source[node.byte_range()] }); - let mut neighbor_map: HashMap, usize)> = HashMap::new(); + TagsIter { + matches, + tree, + source, + config, + tags: Vec::new(), + } + } +} - for mat in matches { +impl<'a, I> Iterator for TagsIter<'a, I> +where + I: Iterator>, +{ + type Item = Tag<'a>; + + fn next(&mut self) -> Option> { + if let Some(mat) = self.matches.next() { let mut call_node = None; let mut doc_node = None; let mut class_node = None; @@ -163,21 +190,23 @@ impl TagsContext { for capture in mat.captures { let index = Some(capture.index); let node = Some(capture.node); - if index == config.call_capture_index { + if index == self.config.call_capture_index { call_node = node; - } else if index == config.class_capture_index { + } else if index == self.config.class_capture_index { class_node = node; - } else if index == config.doc_capture_index { + } else if index == self.config.doc_capture_index { doc_node = node; - } else if index == config.function_capture_index { + } 
else if index == self.config.function_capture_index { function_node = node; - } else if index == config.module_capture_index { + } else if index == self.config.module_capture_index { module_node = node; - } else if index == config.name_capture_index { + } else if index == self.config.name_capture_index { name_node = node; } } + let source = &self.source; + let config = &self.config; let tag_from_node = |node: Node, kind: TagKind| -> Option { let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; let docs = doc_node @@ -208,9 +237,13 @@ impl TagsContext { .cloned() { if let Some(found) = tag_node { - match neighbor_map.entry(found) { - hash_map::Entry::Occupied(mut entry) => { - let (tag, old_idx) = entry.get_mut(); + match self + .tags + .binary_search_by_key(&(found.end_byte(), found.id()), |(node, _, _)| { + (node.end_byte(), node.id()) + }) { + Ok(i) => { + let (_, old_idx, tag) = &mut self.tags[i]; if *old_idx > mat.pattern_index { if let Some(new_tag) = tag_from_node(found, tag_kind) { *tag = new_tag; @@ -218,17 +251,17 @@ impl TagsContext { } } } - hash_map::Entry::Vacant(entry) => { + Err(i) => { if let Some(tag) = tag_from_node(found, tag_kind) { - entry.insert((tag, mat.pattern_index)); + self.tags.insert(i, (found, mat.pattern_index, tag)) } } } } } + } else { } - - return neighbor_map.into_iter().map(|t| (t.1).0).collect(); + None } } From 17cc38678c00878ae34e6715b053db8a72906d32 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 12:05:09 -0700 Subject: [PATCH 13/42] Get generate_tags with the new iterator API --- cli/src/tests/tags_test.rs | 4 +- tags/src/lib.rs | 162 ++++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 77 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 38d3cc83..fe030574 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -2,7 +2,7 @@ use super::helpers::fixtures::get_language; use tree_sitter_tags::{TagKind, 
TagsConfiguration, TagsContext}; #[test] -fn test_tags_javascript() { +fn test_tags_python() { let language = get_language("python"); let tags_config = TagsConfiguration::new( language, @@ -40,7 +40,7 @@ fn test_tags_javascript() { ''' Get the customer's age ''' - compute_age(self.id); + compute_age(self.id) } "#, ) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 1c8ed5c5..3bd1c217 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,6 +1,5 @@ use regex::Regex; use serde::{Serialize, Serializer}; -use std::collections::{hash_map, HashMap}; use std::{mem, ops, str}; use tree_sitter::{Language, Node, Parser, Query, QueryCursor, QueryError, Tree}; @@ -29,7 +28,7 @@ where I: Iterator>, { matches: I, - tree: Tree, + _tree: Tree, source: &'a [u8], config: &'a TagsConfiguration, tags: Vec<(Node<'a>, usize, Tag<'a>)>, @@ -156,6 +155,9 @@ impl TagsContext { .parser .parse(source, None) .expect("Parsing failed unexpectedly"); + + // The `matches` iterator borrows the `Tree`, which prevents it from being moved. + // But the tree is really just a pointer, so it's actually ok to move it. 
let tree_ref = unsafe { mem::transmute::<_, &'static Tree>(&tree) }; let matches = self .cursor @@ -164,10 +166,10 @@ impl TagsContext { }); TagsIter { matches, - tree, source, config, tags: Vec::new(), + _tree: tree, } } } @@ -179,89 +181,99 @@ where type Item = Tag<'a>; fn next(&mut self) -> Option> { - if let Some(mat) = self.matches.next() { - let mut call_node = None; - let mut doc_node = None; - let mut class_node = None; - let mut function_node = None; - let mut module_node = None; - let mut name_node = None; - - for capture in mat.captures { - let index = Some(capture.index); - let node = Some(capture.node); - if index == self.config.call_capture_index { - call_node = node; - } else if index == self.config.class_capture_index { - class_node = node; - } else if index == self.config.doc_capture_index { - doc_node = node; - } else if index == self.config.function_capture_index { - function_node = node; - } else if index == self.config.module_capture_index { - module_node = node; - } else if index == self.config.name_capture_index { - name_node = node; + loop { + if let Some(last_entry) = self.tags.last() { + if self.tags.len() > 1 && self.tags[0].0.end_byte() < last_entry.0.start_byte() { + return Some(self.tags.remove(0).2); } } - let source = &self.source; - let config = &self.config; - let tag_from_node = |node: Node, kind: TagKind| -> Option { - let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; - let docs = doc_node - .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) - .map(|s| { - if let Some(regex) = &config.doc_strip_regexes[mat.pattern_index] { - regex.replace_all(s, "").to_string() - } else { - s.to_string() - } - }); - Some(Tag { - name, - line: "TODO", - loc: loc_for_node(node), - kind: kind, - docs, - }) - }; + if let Some(mat) = self.matches.next() { + let mut call_node = None; + let mut doc_node = None; + let mut class_node = None; + let mut function_node = None; + let mut module_node = None; + let mut name_node = None; 
- for (tag_node, tag_kind) in [ - (call_node, TagKind::Call), - (class_node, TagKind::Class), - (function_node, TagKind::Function), - (module_node, TagKind::Module), - ] - .iter() - .cloned() - { - if let Some(found) = tag_node { - match self - .tags - .binary_search_by_key(&(found.end_byte(), found.id()), |(node, _, _)| { - (node.end_byte(), node.id()) - }) { - Ok(i) => { - let (_, old_idx, tag) = &mut self.tags[i]; - if *old_idx > mat.pattern_index { - if let Some(new_tag) = tag_from_node(found, tag_kind) { - *tag = new_tag; - *old_idx = mat.pattern_index; + for capture in mat.captures { + let index = Some(capture.index); + let node = Some(capture.node); + if index == self.config.call_capture_index { + call_node = node; + } else if index == self.config.class_capture_index { + class_node = node; + } else if index == self.config.doc_capture_index { + doc_node = node; + } else if index == self.config.function_capture_index { + function_node = node; + } else if index == self.config.module_capture_index { + module_node = node; + } else if index == self.config.name_capture_index { + name_node = node; + } + } + + let source = &self.source; + let config = &self.config; + let tag_from_node = |node: Node, kind: TagKind| -> Option { + let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; + let docs = doc_node + .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) + .map(|s| { + if let Some(regex) = &config.doc_strip_regexes[mat.pattern_index] { + regex.replace_all(s, "").to_string() + } else { + s.to_string() + } + }); + Some(Tag { + name, + line: "TODO", + loc: loc_for_node(node), + kind: kind, + docs, + }) + }; + + for (tag_node, tag_kind) in [ + (call_node, TagKind::Call), + (class_node, TagKind::Class), + (function_node, TagKind::Function), + (module_node, TagKind::Module), + ] + .iter() + .cloned() + { + if let Some(found) = tag_node { + match self.tags.binary_search_by_key( + &(found.end_byte(), found.start_byte(), found.id()), + |(node, _, _)| 
(node.end_byte(), node.start_byte(), node.id()), + ) { + Ok(i) => { + let (_, old_idx, tag) = &mut self.tags[i]; + if *old_idx > mat.pattern_index { + if let Some(new_tag) = tag_from_node(found, tag_kind) { + *tag = new_tag; + *old_idx = mat.pattern_index; + } + } + } + Err(i) => { + if let Some(tag) = tag_from_node(found, tag_kind) { + self.tags.insert(i, (found, mat.pattern_index, tag)) } } } - Err(i) => { - if let Some(tag) = tag_from_node(found, tag_kind) { - self.tags.insert(i, (found, mat.pattern_index, tag)) - } - } + break; } } + } else if !self.tags.is_empty() { + return Some(self.tags.remove(0).2); + } else { + return None; } - } else { } - None } } From d3ab651bd5b67d7d4304174728bfb9b3d2e7bb0c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 14:34:52 -0700 Subject: [PATCH 14/42] tags: Add a few comments --- tags/src/lib.rs | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 3bd1c217..0fb2d174 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -31,7 +31,7 @@ where _tree: Tree, source: &'a [u8], config: &'a TagsConfiguration, - tags: Vec<(Node<'a>, usize, Tag<'a>)>, + tag_queue: Vec<(Node<'a>, usize, Tag<'a>)>, } #[derive(Debug, Serialize, Clone)] @@ -168,7 +168,7 @@ impl TagsContext { matches, source, config, - tags: Vec::new(), + tag_queue: Vec::new(), _tree: tree, } } @@ -182,12 +182,18 @@ where fn next(&mut self) -> Option> { loop { - if let Some(last_entry) = self.tags.last() { - if self.tags.len() > 1 && self.tags[0].0.end_byte() < last_entry.0.start_byte() { - return Some(self.tags.remove(0).2); + // If there is a queued tag for an earlier node in the syntax tree, then pop + // it off of the queue and return it. 
+ if let Some(last_entry) = self.tag_queue.last() { + if self.tag_queue.len() > 1 + && self.tag_queue[0].0.end_byte() < last_entry.0.start_byte() + { + return Some(self.tag_queue.remove(0).2); } } + // If there is another match, then compute its tag and add it to the + // tag queue. if let Some(mat) = self.matches.next() { let mut call_node = None; let mut doc_node = None; @@ -246,12 +252,14 @@ where .cloned() { if let Some(found) = tag_node { - match self.tags.binary_search_by_key( + // Only create one tag per node. The tag queue is sorted by node position + // to allow for fast lookup. + match self.tag_queue.binary_search_by_key( &(found.end_byte(), found.start_byte(), found.id()), |(node, _, _)| (node.end_byte(), node.start_byte(), node.id()), ) { Ok(i) => { - let (_, old_idx, tag) = &mut self.tags[i]; + let (_, old_idx, tag) = &mut self.tag_queue[i]; if *old_idx > mat.pattern_index { if let Some(new_tag) = tag_from_node(found, tag_kind) { *tag = new_tag; @@ -261,15 +269,17 @@ where } Err(i) => { if let Some(tag) = tag_from_node(found, tag_kind) { - self.tags.insert(i, (found, mat.pattern_index, tag)) + self.tag_queue.insert(i, (found, mat.pattern_index, tag)) } } } break; } } - } else if !self.tags.is_empty() { - return Some(self.tags.remove(0).2); + } + // If there are no more matches, then drain the queue. 
+ else if !self.tag_queue.is_empty() { + return Some(self.tag_queue.remove(0).2); } else { return None; } From 0e02ead0de61876f14bf950c33585a73443295a5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 15:53:42 -0700 Subject: [PATCH 15/42] Update tags test to reflect new handling of escapes in queries --- cli/src/tests/tags_test.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index fe030574..dac0a90a 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -10,13 +10,13 @@ fn test_tags_python() { ((function_definition name: (identifier) @name body: (block . (expression_statement (string) @doc))) @function - (set! strip @doc "(^['\s]*)|(['\s]*$)")) + (set! strip @doc "(^['\"\\s]*)|(['\"\\s]*$)")) (function_definition name: (identifier) @name) @function ((class_definition name: (identifier) @name body: (block . (expression_statement (string) @doc))) @class - (set! strip @doc "(^['\s]*)|(['\s]*$)")) + (set! 
strip @doc "(^['\"\\s]*)|(['\"\\s]*$)")) (class_definition name: (identifier) @name) @class (call @@ -32,9 +32,9 @@ fn test_tags_python() { &tags_config, br#" class Customer: - ''' + """ Data about a customer - ''' + """ def age(self): ''' From 4996cbe830b2289f42eac141cdd41779da824a25 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 10 Mar 2020 16:52:10 -0700 Subject: [PATCH 16/42] cli: Move more of the tags code from main into the tags module --- cli/src/main.rs | 30 +----------------------------- cli/src/tags.rs | 44 +++++++++++++++++++++++++++++++++++++------- tags/src/lib.rs | 4 ++-- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 0bbf6b25..c5c0e0e0 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -267,35 +267,7 @@ fn run() -> error::Result<()> { } else if let Some(matches) = matches.subcommand_matches("tags") { loader.find_all_languages(&config.parser_directories)?; let paths = collect_paths(matches.values_of("inputs").unwrap())?; - - let mut lang = None; - if let Some(scope) = matches.value_of("scope") { - lang = loader.language_configuration_for_scope(scope)?; - if lang.is_none() { - return Error::err(format!("Unknown scope '{}'", scope)); - } - } - - for path in paths { - let path = Path::new(&path); - let (language, language_config) = match lang { - Some(v) => v, - None => match loader.language_configuration_for_file_name(path)? { - Some(v) => v, - None => { - eprintln!("No language found for path {:?}", path); - continue; - } - }, - }; - - if let Some(tags_config) = language_config.tags_config(language)? 
{ - let source = fs::read(path)?; - tags::generate_tags(tags_config, &source)?; - } else { - eprintln!("No tags config found for path {:?}", path); - } - } + tags::generate_tags(&loader, matches.value_of("scope"), &paths)?; } else if let Some(matches) = matches.subcommand_matches("highlight") { loader.configure_highlights(&config.theme.highlight_names); loader.find_all_languages(&config.parser_directories)?; diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 23d448fc..8cf0d611 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -1,15 +1,45 @@ -use crate::error::Result; -use std::io; -use tree_sitter_tags::{TagsConfiguration, TagsContext}; +use super::loader::Loader; +use crate::error::{Error, Result}; +use std::fs; +use std::io::{self, Write}; +use std::path::Path; +use tree_sitter_tags::TagsContext; + +pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> Result<()> { + let mut lang = None; + if let Some(scope) = scope { + lang = loader.language_configuration_for_scope(scope)?; + if lang.is_none() { + return Error::err(format!("Unknown scope '{}'", scope)); + } + } -pub fn generate_tags(config: &TagsConfiguration, source: &[u8]) -> Result<()> { let mut context = TagsContext::new(); - let stdout = io::stdout(); let mut stdout = stdout.lock(); - for tag in context.generate_tags(config, source) { - serde_json::to_writer(&mut stdout, &tag)?; + for path in paths { + let path = Path::new(&path); + let (language, language_config) = match lang { + Some(v) => v, + None => match loader.language_configuration_for_file_name(path)? { + Some(v) => v, + None => { + eprintln!("No language found for path {:?}", path); + continue; + } + }, + }; + + if let Some(tags_config) = language_config.tags_config(language)? 
{ + let source = fs::read(path)?; + for tag in context.generate_tags(tags_config, &source) { + serde_json::to_writer(&mut stdout, &tag)?; + stdout.write(b"\n")?; + } + } else { + eprintln!("No tags config found for path {:?}", path); + } } Ok(()) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 0fb2d174..e90352a0 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -58,10 +58,10 @@ pub enum TagKind { #[derive(Debug, Serialize, Clone)] pub struct Tag<'a> { pub kind: TagKind, - pub loc: Loc, pub name: &'a str, - pub line: &'a str, pub docs: Option, + pub loc: Loc, + pub line: &'a str, } #[derive(Debug)] From d798bd6bd9c1f4375ed01a886e4375ee02fbe67c Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Tue, 10 Mar 2020 20:39:04 -0400 Subject: [PATCH 17/42] Slice out the line associated with a tag. --- cli/src/tests/tags_test.rs | 1 + tags/src/lib.rs | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index dac0a90a..9b4042d3 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -56,5 +56,6 @@ fn test_tags_python() { ); assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer"); + assert_eq!(tags[0].line, "class Customer:"); assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); } diff --git a/tags/src/lib.rs b/tags/src/lib.rs index e90352a0..afb7e376 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -224,6 +224,11 @@ where let config = &self.config; let tag_from_node = |node: Node, kind: TagKind| -> Option { let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; + let mut line_range = node.byte_range(); + if line_range.len() > 180 { + line_range.end = line_range.start + 180; + } + let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; let docs = doc_node .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) .map(|s| { @@ -235,7 +240,7 @@ where }); Some(Tag { name, - line: "TODO", + line, loc: 
loc_for_node(node), kind: kind, docs, From 4dfebbe52d1789ed0a93786e3a6f01f79bb0386b Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Tue, 10 Mar 2020 20:46:55 -0400 Subject: [PATCH 18/42] some whitespace and comments --- tags/src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index afb7e376..fa508cb1 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -224,11 +224,12 @@ where let config = &self.config; let tag_from_node = |node: Node, kind: TagKind| -> Option { let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; + + // Slice out the first line of the text corresponding to the node in question. let mut line_range = node.byte_range(); - if line_range.len() > 180 { - line_range.end = line_range.start + 180; - } + line_range.end = line_range.end.min(line_range.start + 180); let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; + let docs = doc_node .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) .map(|s| { From 6f636a0357b8285909cd499d1ff345a421e5b737 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 12 Mar 2020 15:10:58 -0700 Subject: [PATCH 19/42] query: Add postfix '+' operator for token repetition Co-Authored-By: Patrick Thomson --- cli/src/tests/query_test.rs | 68 ++++++++++++ lib/src/query.c | 204 +++++++++++++++++++++++------------- 2 files changed, 201 insertions(+), 71 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 87420501..c6980d45 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -464,6 +464,7 @@ fn test_query_matches_with_wildcard_at_the_root() { ); }); } + #[test] fn test_query_with_immediate_siblings() { allocations::record(|| { @@ -515,6 +516,73 @@ fn test_query_with_immediate_siblings() { }); } +#[test] +fn test_query_matches_with_repeated_nodes() { + allocations::record(|| { + let language = get_language("javascript"); + + let query = Query::new( + language, + " 
+ (* + (comment)+ @doc + . + (class_declaration + name: (identifier) @name)) + + (* + (comment)+ @doc + . + (function_declaration + name: (identifier) @name)) + ", + ) + .unwrap(); + + let source = " + // one + // two + a(); + + // three + { + // four + // five + // six + class B {} + + // seven + c(); + + // eight + function d() {} + } + "; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(source, None).unwrap(); + let mut cursor = QueryCursor::new(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); + + assert_eq!( + collect_matches(matches, &query, source), + &[ + ( + 0, + vec![ + ("doc", "// four"), + ("doc", "// five"), + ("doc", "// six"), + ("name", "B") + ] + ), + (1, vec![("doc", "// eight"), ("name", "d")]), + ] + ); + }); +} + #[test] fn test_query_matches_in_language_with_simple_aliases() { allocations::record(|| { diff --git a/lib/src/query.c b/lib/src/query.c index 65144395..b93c2ea4 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -40,10 +40,11 @@ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t depth: 13; + uint16_t depth: 12; bool contains_captures: 1; bool is_immediate: 1; bool is_last: 1; + bool is_repeated: 1; } QueryStep; /* @@ -88,12 +89,15 @@ typedef struct { uint16_t start_depth; uint16_t pattern_index; uint16_t step_index; - uint16_t capture_count; uint16_t capture_list_id; uint16_t consumed_capture_count; uint32_t id; + uint16_t current_step_match_count; + bool seeking_non_match; } QueryState; +typedef Array(TSQueryCapture) CaptureList; + /* * CaptureListPool - A collection of *lists* of captures. Each QueryState * needs to maintain its own list of captures. They are all represented as @@ -101,7 +105,7 @@ typedef struct { * parts of the shared array are currently in use by a QueryState. 
*/ typedef struct { - Array(TSQueryCapture) list; + CaptureList list[32]; uint32_t usage_map; } CaptureListPool; @@ -233,24 +237,22 @@ static void stream_scan_identifier(Stream *stream) { static CaptureListPool capture_list_pool_new() { return (CaptureListPool) { - .list = array_new(), .usage_map = UINT32_MAX, }; } -static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { +static void capture_list_pool_reset(CaptureListPool *self) { self->usage_map = UINT32_MAX; - uint32_t total_size = MAX_STATE_COUNT * list_size; - array_reserve(&self->list, total_size); - self->list.size = total_size; } static void capture_list_pool_delete(CaptureListPool *self) { - array_delete(&self->list); + for (unsigned i = 0; i < 32; i++) { + array_delete(&self->list[i]); + } } -static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { - return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; +static CaptureList *capture_list_pool_get(CaptureListPool *self, uint16_t id) { + return &self->list[id]; } static bool capture_list_pool_is_empty(const CaptureListPool *self) { @@ -269,6 +271,7 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + array_clear(&self->list[id]); self->usage_map |= bitmask_for_index(id); } @@ -408,6 +411,7 @@ static QueryStep query_step__new( .capture_ids = {NONE, NONE, NONE, NONE}, .contains_captures = false, .is_immediate = is_immediate, + .is_repeated = false, }; } @@ -842,27 +846,42 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); - // Parse an '@'-prefixed capture pattern - while (stream->next == '@') { - stream_advance(stream); - - // Parse the capture name - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; - - // Add the capture id to 
the first step of the pattern - uint16_t capture_id = symbol_table_insert_name( - &self->captures, - capture_name, - length - ); + // Parse suffix modifiers for this pattern + for (;;) { QueryStep *step = &self->steps.contents[starting_step_index]; - query_step__add_capture(step, capture_id); - (*capture_count)++; - stream_skip_whitespace(stream); + if (stream->next == '+') { + stream_advance(stream); + step->is_repeated = true; + stream_skip_whitespace(stream); + } + + // Parse an '@'-prefixed capture pattern + else if (stream->next == '@') { + stream_advance(stream); + + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + uint16_t capture_id = symbol_table_insert_name( + &self->captures, + capture_name, + length + ); + query_step__add_capture(step, capture_id); + (*capture_count)++; + + stream_skip_whitespace(stream); + } + + // No more suffix modifiers + else { + break; + } } return 0; @@ -1089,7 +1108,7 @@ void ts_query_cursor_exec( array_clear(&self->states); array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); - capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); + capture_list_pool_reset(&self->capture_list_pool); self->next_state_id = 0; self->depth = 0; self->ascending = false; @@ -1133,12 +1152,12 @@ static bool ts_query_cursor__first_in_progress_capture( bool result = false; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; - if (state->capture_count > 0) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - uint32_t capture_byte = ts_node_start_byte(captures[0].node); + const CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + 
state->capture_list_id + ); + if (captures->size > 0) { + uint32_t capture_byte = ts_node_start_byte(captures->contents[0].node); if ( !result || capture_byte < *byte_offset || @@ -1192,8 +1211,9 @@ static bool ts_query__cursor_add_state( .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, .start_depth = self->depth, - .capture_count = 0, .consumed_capture_count = 0, + .current_step_match_count = 0, + .seeking_non_match = false, })); return true; } @@ -1207,15 +1227,15 @@ static QueryState *ts_query__cursor_copy_state( array_push(&self->states, *state); QueryState *new_state = array_back(&self->states); new_state->capture_list_id = new_list_id; - TSQueryCapture *old_captures = capture_list_pool_get( + CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); - TSQueryCapture *new_captures = capture_list_pool_get( + CaptureList *new_captures = capture_list_pool_get( &self->capture_list_pool, new_list_id ); - memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); + array_push_all(new_captures, old_captures); return new_state; } @@ -1371,7 +1391,27 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } - if (!node_does_match) { + if (node_does_match) { + // The `seeking_non_match` flag indicates that a previous QueryState + // has already begun processing this repeating sequence, so that *this* + // QueryState should not begin matching until a separate repeating sequence + // is found. + if (state->seeking_non_match) continue; + } else { + // If this QueryState has processed a repeating sequence, and that repeating + // sequence has ended, move on to the *next* step of this state's pattern. + if (state->current_step_match_count > 0) { + LOG( + " finish repetition state. 
pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + state->step_index++; + state->current_step_match_count = 0; + i--; + continue; + } + if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", @@ -1386,6 +1426,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { i--; n--; } + + state->seeking_non_match = false; continue; } @@ -1400,9 +1442,18 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if ( step->depth > 0 && step->contains_captures && - later_sibling_can_match + later_sibling_can_match && + state->current_step_match_count == 0 ) { QueryState *copy = ts_query__cursor_copy_state(self, state); + + // The QueryState that matched this node has begun matching a repeating + // sequence. The QueryState that *skipped* this node should not start + // matching later elements of the same repeating sequence. + if (step->is_repeated) { + state->seeking_non_match = true; + } + if (copy) { LOG( " split state. pattern:%u, step:%u\n", @@ -1411,7 +1462,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); next_state = copy; } else { - LOG(" canot split state.\n"); + LOG(" cannot split state.\n"); } } @@ -1431,35 +1482,44 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { next_state->pattern_index, capture_id ); - TSQueryCapture *capture_list = capture_list_pool_get( + CaptureList *capture_list = capture_list_pool_get( &self->capture_list_pool, next_state->capture_list_id ); - capture_list[next_state->capture_count++] = (TSQueryCapture) { + array_push(capture_list, ((TSQueryCapture) { node, capture_id - }; + })); } - // If the pattern is now done, then remove it from the list of - // in-progress states, and add it to the list of finished states. 
- next_state->step_index++; - QueryStep *next_step = step + 1; - if (next_step->depth == PATTERN_DONE_MARKER) { - LOG(" finish pattern %u\n", next_state->pattern_index); + // If this step repeats, then don't move to the next step until + // this step no longer matches. + if (step->is_repeated) { + next_state->current_step_match_count++; + } else { + next_state->step_index++; + next_state->current_step_match_count = 0; + QueryStep *next_step = step + 1; - next_state->id = self->next_state_id++; - array_push(&self->finished_states, *next_state); - if (next_state == state) { - array_erase(&self->states, i); - i--; - n--; - } else { - self->states.size--; + // If the pattern is now done, then remove it from the list of + // in-progress states, and add it to the list of finished states. + if (next_step->depth == PATTERN_DONE_MARKER) { + LOG(" finish pattern %u\n", next_state->pattern_index); + + next_state->id = self->next_state_id++; + array_push(&self->finished_states, *next_state); + if (next_state == state) { + array_erase(&self->states, i); + i--; + n--; + } else { + self->states.size--; + } } } } + // Continue descending if possible. 
if (ts_tree_cursor_goto_first_child(&self->cursor)) { self->depth++; @@ -1485,11 +1545,12 @@ bool ts_query_cursor_next_match( QueryState *state = &self->finished_states.contents[0]; match->id = state->id; match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); + match->captures = captures->contents; + match->capture_count = captures->size; capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->finished_states, 0); return true; @@ -1542,13 +1603,13 @@ bool ts_query_cursor_next_capture( uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; - if (state->capture_count > state->consumed_capture_count) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); + CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + if (captures->size > state->consumed_capture_count) { uint32_t capture_byte = ts_node_start_byte( - captures[state->consumed_capture_count].node + captures->contents[state->consumed_capture_count].node ); if ( capture_byte < first_finished_capture_byte || @@ -1580,11 +1641,12 @@ bool ts_query_cursor_next_capture( ]; match->id = state->id; match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); + match->captures = captures->contents; + match->capture_count = captures->size; *capture_index = state->consumed_capture_count; state->consumed_capture_count++; return true; From 6e2df06dc2fea5e4f19a3b82194647e56905285b Mon Sep 
17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 12 Mar 2020 16:33:52 -0700 Subject: [PATCH 20/42] Start proving out tags support for JavaScript --- cli/src/tests/tags_test.rs | 53 ++++++++++++++++++++++++++++++++++++++ tags/src/lib.rs | 38 ++++++++++++++++++--------- 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 9b4042d3..f7b5f6ee 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -59,3 +59,56 @@ fn test_tags_python() { assert_eq!(tags[0].line, "class Customer:"); assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); } + +#[test] +fn test_tags_javascript() { + let language = get_language("javascript"); + let tags_config = TagsConfiguration::new( + language, + r#" + ((* + (comment)+ @doc + . + (class_declaration + name: (identifier) @name) @class) + (set! strip @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + ((* + (comment)+ @doc + . + (method_definition + name: (property_identifier) @name) @method) + (set! strip @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + "#, + "", + ) + .unwrap(); + + let mut tag_context = TagsContext::new(); + let tags = tag_context + .generate_tags( + &tags_config, + br#" + // Data about a customer. 
+ // bla bla bla + class Customer { + /* + * Get the customer's age + */ + getAge() { + + } + } + "#, + ) + .collect::>(); + + assert_eq!( + tags.iter().map(|t| (t.name, t.kind)).collect::>(), + &[("getAge", TagKind::Method), ("Customer", TagKind::Class)] + ); + assert_eq!(tags[0].docs.as_ref().unwrap(), "Get the customer's age"); + assert_eq!( + tags[1].docs.as_ref().unwrap(), + "Data about a customer.\nbla bla bla" + ); +} diff --git a/tags/src/lib.rs b/tags/src/lib.rs index fa508cb1..4b5d2a7e 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -13,6 +13,7 @@ pub struct TagsConfiguration { doc_capture_index: Option, function_capture_index: Option, locals_pattern_index: usize, + method_capture_index: Option, module_capture_index: Option, name_capture_index: Option, doc_strip_regexes: Vec>, @@ -87,6 +88,7 @@ impl TagsConfiguration { let mut class_capture_index = None; let mut doc_capture_index = None; let mut function_capture_index = None; + let mut method_capture_index = None; let mut module_capture_index = None; let mut name_capture_index = None; for (i, name) in query.capture_names().iter().enumerate() { @@ -95,6 +97,7 @@ impl TagsConfiguration { "class" => &mut class_capture_index, "doc" => &mut doc_capture_index, "function" => &mut function_capture_index, + "method" => &mut method_capture_index, "module" => &mut module_capture_index, "name" => &mut name_capture_index, _ => continue, @@ -125,6 +128,7 @@ impl TagsConfiguration { locals_pattern_index, function_capture_index, class_capture_index, + method_capture_index, module_capture_index, doc_capture_index, call_capture_index, @@ -195,10 +199,11 @@ where // If there is another match, then compute its tag and add it to the // tag queue. 
if let Some(mat) = self.matches.next() { + let mut docs = None; let mut call_node = None; - let mut doc_node = None; let mut class_node = None; let mut function_node = None; + let mut method_node = None; let mut module_node = None; let mut name_node = None; @@ -210,9 +215,27 @@ where } else if index == self.config.class_capture_index { class_node = node; } else if index == self.config.doc_capture_index { - doc_node = node; + if let Ok(content) = str::from_utf8(&self.source[capture.node.byte_range()]) + { + let content = if let Some(regex) = + &self.config.doc_strip_regexes[mat.pattern_index] + { + regex.replace_all(content, "").to_string() + } else { + content.to_string() + }; + match &mut docs { + None => docs = Some(content), + Some(d) => { + d.push('\n'); + d.push_str(&content); + } + } + } } else if index == self.config.function_capture_index { function_node = node; + } else if index == self.config.method_capture_index { + method_node = node; } else if index == self.config.module_capture_index { module_node = node; } else if index == self.config.name_capture_index { @@ -221,7 +244,6 @@ where } let source = &self.source; - let config = &self.config; let tag_from_node = |node: Node, kind: TagKind| -> Option { let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; @@ -230,15 +252,6 @@ where line_range.end = line_range.end.min(line_range.start + 180); let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; - let docs = doc_node - .and_then(|n| str::from_utf8(&source[n.byte_range()]).ok()) - .map(|s| { - if let Some(regex) = &config.doc_strip_regexes[mat.pattern_index] { - regex.replace_all(s, "").to_string() - } else { - s.to_string() - } - }); Some(Tag { name, line, @@ -252,6 +265,7 @@ where (call_node, TagKind::Call), (class_node, TagKind::Class), (function_node, TagKind::Function), + (method_node, TagKind::Method), (module_node, TagKind::Module), ] .iter() From 04577367667cf6387c4c69b8673c2aead7a89580 Mon Sep 17 00:00:00 2001 From: Max 
Brunsfeld Date: Fri, 13 Mar 2020 13:02:34 -0700 Subject: [PATCH 21/42] rust: add handling of arbitrary predicate operators --- cli/src/tests/query_test.rs | 20 +++++++++++--- lib/binding_rust/lib.rs | 55 +++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index c6980d45..355be9b5 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -2,7 +2,8 @@ use super::helpers::allocations; use super::helpers::fixtures::get_language; use std::fmt::Write; use tree_sitter::{ - Node, Parser, Query, QueryCapture, QueryCursor, QueryError, QueryMatch, QueryProperty, + Node, Parser, Query, QueryCapture, QueryCursor, QueryError, QueryMatch, QueryPredicate, + QueryPredicateArg, QueryProperty, }; #[test] @@ -948,7 +949,7 @@ fn test_query_captures_with_text_conditions() { } #[test] -fn test_query_captures_with_set_properties() { +fn test_query_captures_with_predicates() { allocations::record(|| { let language = get_language("javascript"); @@ -957,7 +958,8 @@ fn test_query_captures_with_set_properties() { r#" ((call_expression (identifier) @foo) (set! name something) - (set! cool)) + (set! cool) + (something! @foo omg)) ((property_identifier) @bar (is? 
cool) @@ -972,6 +974,16 @@ fn test_query_captures_with_set_properties() { QueryProperty::new("cool", None, None), ] ); + assert_eq!( + query.general_predicates(0), + &[QueryPredicate { + operator: "something!".to_string().into_boxed_str(), + args: vec![ + QueryPredicateArg::Capture(0), + QueryPredicateArg::String("omg".to_string().into_boxed_str()), + ], + },] + ); assert_eq!(query.property_settings(1), &[]); assert_eq!(query.property_predicates(0), &[]); assert_eq!( @@ -985,7 +997,7 @@ fn test_query_captures_with_set_properties() { } #[test] -fn test_query_captures_with_set_quoted_properties() { +fn test_query_captures_with_quoted_predicate_args() { allocations::record(|| { let language = get_language("javascript"); diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 4a04ff13..1c9421f5 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -100,6 +100,7 @@ pub struct Query { text_predicates: Vec>, property_settings: Vec>, property_predicates: Vec>, + general_predicates: Vec>, } /// A stateful object for executing a `Query` on a syntax `Tree`. @@ -113,6 +114,19 @@ pub struct QueryProperty { pub capture_id: Option, } +#[derive(Debug, PartialEq, Eq)] +pub enum QueryPredicateArg { + Capture(u32), + String(Box), +} + +/// A general predicate (an operator and its arguments) associated with a particular pattern in a `Query`. +#[derive(Debug, PartialEq, Eq)] +pub struct QueryPredicate { + pub operator: Box, + pub args: Vec, +} + /// A match of a `Query` to a particular set of `Node`s. pub struct QueryMatch<'a> { pub pattern_index: usize, @@ -1199,6 +1213,7 @@ impl Query { text_predicates: Vec::with_capacity(pattern_count), property_predicates: Vec::with_capacity(pattern_count), property_settings: Vec::with_capacity(pattern_count), + general_predicates: Vec::with_capacity(pattern_count), }; // Build a vector of strings to store the capture names. 
@@ -1242,6 +1257,7 @@ impl Query { let mut text_predicates = Vec::new(); let mut property_predicates = Vec::new(); let mut property_settings = Vec::new(); + let mut general_predicates = Vec::new(); for p in predicate_steps.split(|s| s.type_ == type_done) { if p.is_empty() { continue; @@ -1333,12 +1349,21 @@ impl Query { operator_name == "is?", )), - _ => { - return Err(QueryError::Predicate(format!( - "Unknown query predicate function {}", - operator_name, - ))) - } + _ => general_predicates.push(QueryPredicate { + operator: operator_name.clone().into_boxed_str(), + args: p[1..] + .iter() + .map(|a| { + if a.type_ == type_capture { + QueryPredicateArg::Capture(a.value_id) + } else { + QueryPredicateArg::String( + string_values[a.value_id as usize].clone().into_boxed_str(), + ) + } + }) + .collect(), + }), } } @@ -1351,6 +1376,9 @@ impl Query { result .property_settings .push(property_settings.into_boxed_slice()); + result + .general_predicates + .push(general_predicates.into_boxed_slice()); } Ok(result) } @@ -1380,15 +1408,30 @@ impl Query { } /// Get the properties that are checked for the given pattern index. + /// + /// This includes predicates with the operators `is?` and `is-not?`. pub fn property_predicates(&self, index: usize) -> &[(QueryProperty, bool)] { &self.property_predicates[index] } /// Get the properties that are set for the given pattern index. + /// + /// This includes predicates with the operator `set!`. pub fn property_settings(&self, index: usize) -> &[QueryProperty] { &self.property_settings[index] } + /// Get the other user-defined predicates associated with the given index. + /// + /// This includes predicates with operators other than: + /// * `match?` + /// * `eq?` and `not-eq?` + /// * `is?` and `is-not?` + /// * `set!` + pub fn general_predicates(&self, index: usize) -> &[QueryPredicate] { + &self.general_predicates[index] + } + /// Disable a certain capture within a query. 
/// /// This prevents the capture from being returned in matches, and also avoids any From b5f2ed83fe60ef2d704ae59bbc2c3ba4e89757c7 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 13 Mar 2020 13:02:56 -0700 Subject: [PATCH 22/42] tags: Implement select-adjacent! predicate --- cli/src/tests/tags_test.rs | 43 ++++---- tags/src/lib.rs | 201 +++++++++++++++++++++---------------- 2 files changed, 140 insertions(+), 104 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index f7b5f6ee..65d2096f 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -8,19 +8,19 @@ fn test_tags_python() { language, r#" ((function_definition - name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @function - (set! strip @doc "(^['\"\\s]*)|(['\"\\s]*$)")) + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @function + (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) (function_definition - name: (identifier) @name) @function + name: (identifier) @name) @function ((class_definition - name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @class - (set! strip @doc "(^['\"\\s]*)|(['\"\\s]*$)")) + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @class + (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) (class_definition - name: (identifier) @name) @class + name: (identifier) @name) @class (call - function: (identifier) @name) @call + function: (identifier) @name) @call "#, "", ) @@ -67,17 +67,19 @@ fn test_tags_javascript() { language, r#" ((* - (comment)+ @doc - . - (class_declaration - name: (identifier) @name) @class) - (set! strip @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + (comment)+ @doc + . + (class_declaration + name: (identifier) @name) @class) + (select-adjacent! @doc @class) + (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) ((* - (comment)+ @doc - . - (method_definition - name: (property_identifier) @name) @method) - (set! 
strip @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + (comment)+ @doc + . + (method_definition + name: (property_identifier) @name) @method) + (select-adjacent! @doc @method) + (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) "#, "", ) @@ -88,6 +90,8 @@ fn test_tags_javascript() { .generate_tags( &tags_config, br#" + // hi + // Data about a customer. // bla bla bla class Customer { @@ -95,7 +99,6 @@ fn test_tags_javascript() { * Get the customer's age */ getAge() { - } } "#, diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 4b5d2a7e..19b7be84 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,7 +1,9 @@ use regex::Regex; use serde::{Serialize, Serializer}; use std::{mem, ops, str}; -use tree_sitter::{Language, Node, Parser, Query, QueryCursor, QueryError, Tree}; +use tree_sitter::{ + Language, Node, Parser, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, +}; /// Contains the data neeeded to compute tags for code written in a /// particular language. @@ -16,7 +18,7 @@ pub struct TagsConfiguration { method_capture_index: Option, module_capture_index: Option, name_capture_index: Option, - doc_strip_regexes: Vec>, + pattern_info: Vec, } pub struct TagsContext { @@ -24,6 +26,12 @@ pub struct TagsContext { cursor: QueryCursor, } +#[derive(Default)] +struct PatternInfo { + docs_adjacent_capture: Option, + doc_strip_regex: Option, +} + struct TagsIter<'a, I> where I: Iterator>, @@ -105,20 +113,28 @@ impl TagsConfiguration { *index = Some(i as u32); } - let doc_strip_regexes = (0..query.pattern_count()) + let pattern_info = (0..query.pattern_count()) .map(|pattern_index| { - let properties = query.property_settings(pattern_index); - for property in properties { - if property.key.as_ref() == "strip" - && property.capture_id.map(|id| id as u32) == doc_capture_index - { - if let Some(value) = &property.value { - let regex = Regex::new(value.as_ref())?; - return Ok(Some(regex)); + let mut info = PatternInfo::default(); + if let Some(doc_capture_index) = doc_capture_index { 
+ for predicate in query.general_predicates(pattern_index) { + if predicate.args.get(0) + == Some(&QueryPredicateArg::Capture(doc_capture_index)) + { + match (predicate.operator.as_ref(), predicate.args.get(1)) { + ("select-adjacent!", Some(QueryPredicateArg::Capture(index))) => { + info.docs_adjacent_capture = Some(*index); + } + ("strip!", Some(QueryPredicateArg::String(pattern))) => { + let regex = Regex::new(pattern.as_ref())?; + info.doc_strip_regex = Some(regex); + } + _ => {} + } } } } - return Ok(None); + return Ok(info); }) .collect::, Error>>()?; @@ -133,7 +149,7 @@ impl TagsConfiguration { doc_capture_index, call_capture_index, name_capture_index, - doc_strip_regexes, + pattern_info, }) } } @@ -199,26 +215,68 @@ where // If there is another match, then compute its tag and add it to the // tag queue. if let Some(mat) = self.matches.next() { - let mut docs = None; - let mut call_node = None; - let mut class_node = None; - let mut function_node = None; - let mut method_node = None; - let mut module_node = None; - let mut name_node = None; + let mut name = None; + let mut doc_nodes = Vec::new(); + let mut tag_node = None; + let mut tag_kind = TagKind::Call; + let mut docs_adjacent_node = None; for capture in mat.captures { let index = Some(capture.index); - let node = Some(capture.node); - if index == self.config.call_capture_index { - call_node = node; - } else if index == self.config.class_capture_index { - class_node = node; + + if index == self.config.pattern_info[mat.pattern_index].docs_adjacent_capture { + docs_adjacent_node = Some(capture.node); + } + + if index == self.config.name_capture_index { + name = str::from_utf8(&self.source[Some(capture.node)?.byte_range()]).ok(); } else if index == self.config.doc_capture_index { - if let Ok(content) = str::from_utf8(&self.source[capture.node.byte_range()]) - { + doc_nodes.push(capture.node); + } else if index == self.config.call_capture_index { + tag_node = Some(capture.node); + tag_kind = TagKind::Call; + 
} else if index == self.config.class_capture_index { + tag_node = Some(capture.node); + tag_kind = TagKind::Class; + } else if index == self.config.function_capture_index { + tag_node = Some(capture.node); + tag_kind = TagKind::Function; + } else if index == self.config.method_capture_index { + tag_node = Some(capture.node); + tag_kind = TagKind::Method; + } else if index == self.config.module_capture_index { + tag_node = Some(capture.node); + tag_kind = TagKind::Module; + } + } + + if let (Some(tag_node), Some(name)) = (tag_node, name) { + // If needed, filter the doc nodes based on their ranges, selecting + // only the slice that are adjacent to some specified node. + let mut docs_start_index = 0; + if let (Some(docs_adjacent_node), false) = + (docs_adjacent_node, doc_nodes.is_empty()) + { + docs_start_index = doc_nodes.len(); + let mut start_row = docs_adjacent_node.start_position().row; + while docs_start_index > 0 { + let doc_node = &doc_nodes[docs_start_index - 1]; + let prev_doc_end_row = doc_node.end_position().row; + if prev_doc_end_row + 1 >= start_row { + docs_start_index -= 1; + start_row = doc_node.start_position().row; + } else { + break; + } + } + } + + // Generate a doc string from all of the doc nodes, applying any strip regexes. + let mut docs = None; + for doc_node in &doc_nodes[docs_start_index..] 
{ + if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) { let content = if let Some(regex) = - &self.config.doc_strip_regexes[mat.pattern_index] + &self.config.pattern_info[mat.pattern_index].doc_strip_regex { regex.replace_all(content, "").to_string() } else { @@ -232,68 +290,43 @@ where } } } - } else if index == self.config.function_capture_index { - function_node = node; - } else if index == self.config.method_capture_index { - method_node = node; - } else if index == self.config.module_capture_index { - module_node = node; - } else if index == self.config.name_capture_index { - name_node = node; } - } - let source = &self.source; - let tag_from_node = |node: Node, kind: TagKind| -> Option { - let name = str::from_utf8(&source[name_node?.byte_range()]).ok()?; + let source = &self.source; + let tag_from_node = |node: Node, kind: TagKind| -> Option { + // Slice out the first line of the text corresponding to the node in question. + let mut line_range = node.byte_range(); + line_range.end = line_range.end.min(line_range.start + 180); + let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; + Some(Tag { + name, + line, + kind, + docs, + loc: loc_for_node(node), + }) + }; - // Slice out the first line of the text corresponding to the node in question. - let mut line_range = node.byte_range(); - line_range.end = line_range.end.min(line_range.start + 180); - let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; - - Some(Tag { - name, - line, - loc: loc_for_node(node), - kind: kind, - docs, - }) - }; - - for (tag_node, tag_kind) in [ - (call_node, TagKind::Call), - (class_node, TagKind::Class), - (function_node, TagKind::Function), - (method_node, TagKind::Method), - (module_node, TagKind::Module), - ] - .iter() - .cloned() - { - if let Some(found) = tag_node { - // Only create one tag per node. The tag queue is sorted by node position - // to allow for fast lookup. 
- match self.tag_queue.binary_search_by_key( - &(found.end_byte(), found.start_byte(), found.id()), - |(node, _, _)| (node.end_byte(), node.start_byte(), node.id()), - ) { - Ok(i) => { - let (_, old_idx, tag) = &mut self.tag_queue[i]; - if *old_idx > mat.pattern_index { - if let Some(new_tag) = tag_from_node(found, tag_kind) { - *tag = new_tag; - *old_idx = mat.pattern_index; - } - } - } - Err(i) => { - if let Some(tag) = tag_from_node(found, tag_kind) { - self.tag_queue.insert(i, (found, mat.pattern_index, tag)) + // Only create one tag per node. The tag queue is sorted by node position + // to allow for fast lookup. + match self.tag_queue.binary_search_by_key( + &(tag_node.end_byte(), tag_node.start_byte(), tag_node.id()), + |(node, _, _)| (node.end_byte(), node.start_byte(), node.id()), + ) { + Ok(i) => { + let (_, old_idx, tag) = &mut self.tag_queue[i]; + if *old_idx > mat.pattern_index { + if let Some(new_tag) = tag_from_node(tag_node, tag_kind) { + *tag = new_tag; + *old_idx = mat.pattern_index; } } } - break; + Err(i) => { + if let Some(tag) = tag_from_node(tag_node, tag_kind) { + self.tag_queue.insert(i, (tag_node, mat.pattern_index, tag)) + } + } } } } From f170d292e0339255a96920e50688e4d6e69a5fd6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 13 Mar 2020 13:04:36 -0700 Subject: [PATCH 23/42] Suppress unused field warning --- tags/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 19b7be84..0e768b71 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -14,11 +14,11 @@ pub struct TagsConfiguration { class_capture_index: Option, doc_capture_index: Option, function_capture_index: Option, - locals_pattern_index: usize, method_capture_index: Option, module_capture_index: Option, name_capture_index: Option, pattern_info: Vec, + _locals_pattern_index: usize, } pub struct TagsContext { @@ -141,7 +141,6 @@ impl TagsConfiguration { Ok(TagsConfiguration { language, query, - 
locals_pattern_index, function_capture_index, class_capture_index, method_capture_index, @@ -150,6 +149,7 @@ impl TagsConfiguration { call_capture_index, name_capture_index, pattern_info, + _locals_pattern_index: locals_pattern_index, }) } } From b5483c67ab174189f9073fcdc709ab96fae22d22 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 13 Mar 2020 16:12:39 -0700 Subject: [PATCH 24/42] query: allow repetition operator to be used on non-terminal nodes --- cli/src/tests/query_test.rs | 36 +++++++++++++++- lib/src/query.c | 84 +++++++++++++++++++++++-------------- 2 files changed, 87 insertions(+), 33 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 355be9b5..7539bec2 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -518,7 +518,7 @@ fn test_query_with_immediate_siblings() { } #[test] -fn test_query_matches_with_repeated_nodes() { +fn test_query_matches_with_repeated_leaf_nodes() { allocations::record(|| { let language = get_language("javascript"); @@ -584,6 +584,40 @@ fn test_query_matches_with_repeated_nodes() { }); } +#[test] +fn test_query_matches_with_repeated_internal_nodes() { + allocations::record(|| { + let language = get_language("javascript"); + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let mut cursor = QueryCursor::new(); + + let query = Query::new( + language, + " + (* + (method_definition + (decorator (identifier) @deco)+ + name: (property_identifier) @name)) + ", + ) + .unwrap(); + let source = " + class A { + @c + @d + e() {} + } + "; + let tree = parser.parse(source, None).unwrap(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); + assert_eq!( + collect_matches(matches, &query, source), + &[(0, vec![("deco", "c"), ("deco", "d"), ("name", "e")]),] + ); + }) +} + #[test] fn test_query_matches_in_language_with_simple_aliases() { allocations::record(|| { diff --git a/lib/src/query.c b/lib/src/query.c index 
b93c2ea4..aea2415e 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -35,11 +35,15 @@ typedef struct { * captured in this pattern. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. + * - `repeat_step_index` - If this step is part of a repetition, the index of + * the beginning of the repetition. A `NONE` value means this step is not + * part of a repetition. */ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; + uint16_t repeat_step_index; uint16_t depth: 12; bool contains_captures: 1; bool is_immediate: 1; @@ -86,13 +90,14 @@ typedef struct { * represented as one of these states. */ typedef struct { + uint32_t id; uint16_t start_depth; uint16_t pattern_index; uint16_t step_index; - uint16_t capture_list_id; uint16_t consumed_capture_count; - uint32_t id; - uint16_t current_step_match_count; + uint16_t repeat_match_count; + uint16_t step_index_on_failure; + uint8_t capture_list_id; bool seeking_non_match; } QueryState; @@ -410,8 +415,10 @@ static QueryStep query_step__new( .field = 0, .capture_ids = {NONE, NONE, NONE, NONE}, .contains_captures = false, - .is_immediate = is_immediate, .is_repeated = false, + .is_last = false, + .is_immediate = is_immediate, + .repeat_step_index = NONE, }; } @@ -853,6 +860,7 @@ static TSQueryError ts_query__parse_pattern( if (stream->next == '+') { stream_advance(stream); step->is_repeated = true; + array_back(&self->steps)->repeat_step_index = starting_step_index; stream_skip_whitespace(stream); } @@ -1212,7 +1220,8 @@ static bool ts_query__cursor_add_state( .pattern_index = pattern->pattern_index, .start_depth = self->depth, .consumed_capture_count = 0, - .current_step_match_count = 0, + .repeat_match_count = 0, + .step_index_on_failure = NONE, .seeking_non_match = false, })); return true; @@ -1391,23 +1400,21 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } - if (node_does_match) { - // The 
`seeking_non_match` flag indicates that a previous QueryState - // has already begun processing this repeating sequence, so that *this* - // QueryState should not begin matching until a separate repeating sequence - // is found. - if (state->seeking_non_match) continue; - } else { + if (!node_does_match) { // If this QueryState has processed a repeating sequence, and that repeating // sequence has ended, move on to the *next* step of this state's pattern. - if (state->current_step_match_count > 0) { + if ( + state->step_index_on_failure != NONE && + (!later_sibling_can_match || step->is_repeated) + ) { LOG( " finish repetition state. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); - state->step_index++; - state->current_step_match_count = 0; + state->step_index = state->step_index_on_failure; + state->step_index_on_failure = NONE; + state->repeat_match_count = 0; i--; continue; } @@ -1431,6 +1438,12 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { continue; } + // The `seeking_non_match` flag indicates that a previous QueryState + // has already begun processing this repeating sequence, so that *this* + // QueryState should not begin matching until a separate repeating sequence + // is found. + if (state->seeking_non_match) continue; + // Some patterns can match their root node in multiple ways, // capturing different children. If this pattern step could match // later children within the same parent, then this query state @@ -1443,7 +1456,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { step->depth > 0 && step->contains_captures && later_sibling_can_match && - state->current_step_match_count == 0 + state->repeat_match_count == 0 ) { QueryState *copy = ts_query__cursor_copy_state(self, state); @@ -1466,22 +1479,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } - LOG( - " advance state. 
pattern:%u, step:%u\n", - next_state->pattern_index, - next_state->step_index - ); - // If the current node is captured in this pattern, add it to the // capture list. for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; - LOG( - " capture node. pattern:%u, capture_id:%u\n", - next_state->pattern_index, - capture_id - ); CaptureList *capture_list = capture_list_pool_get( &self->capture_list_pool, next_state->capture_list_id @@ -1490,15 +1492,33 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { node, capture_id })); + LOG( + " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", + next_state->pattern_index, + capture_id, + capture_list->size + ); } - // If this step repeats, then don't move to the next step until - // this step no longer matches. - if (step->is_repeated) { - next_state->current_step_match_count++; + // If this is the end of a repetition, then jump back to the beginning + // of that repetition. + if (step->repeat_step_index != NONE) { + next_state->step_index_on_failure = next_state->step_index + 1; + next_state->step_index = step->repeat_step_index; + next_state->repeat_match_count++; + LOG( + " continue repeat. pattern:%u, match_count:%u\n", + next_state->pattern_index, + next_state->repeat_match_count + ); } else { next_state->step_index++; - next_state->current_step_match_count = 0; + LOG( + " advance state. 
pattern:%u, step:%u\n", + next_state->pattern_index, + next_state->step_index + ); + QueryStep *next_step = step + 1; // If the pattern is now done, then remove it from the list of From 1b3a67834b881f600794b23ae7d5598adb950f8c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 13 Mar 2020 16:13:19 -0700 Subject: [PATCH 25/42] cli: Fix loading of tags query --- cli/src/loader.rs | 7 ++++++- tags/src/lib.rs | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index b761c137..cf2eb143 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -32,6 +32,7 @@ pub struct LanguageConfiguration<'a> { pub highlights_filenames: Option>, pub injections_filenames: Option>, pub locals_filenames: Option>, + pub tags_filenames: Option>, language_id: usize, highlight_config: OnceCell>, tags_config: OnceCell>, @@ -434,6 +435,8 @@ impl Loader { injections: PathsJSON, #[serde(default)] locals: PathsJSON, + #[serde(default)] + tags: PathsJSON, } #[derive(Deserialize)] @@ -481,6 +484,7 @@ impl Loader { injection_regex: Self::regex(config_json.injection_regex), injections_filenames: config_json.injections.into_vec(), locals_filenames: config_json.locals.into_vec(), + tags_filenames: config_json.tags.into_vec(), highlights_filenames: config_json.highlights.into_vec(), highlight_config: OnceCell::new(), tags_config: OnceCell::new(), @@ -515,6 +519,7 @@ impl Loader { injections_filenames: None, locals_filenames: None, highlights_filenames: None, + tags_filenames: None, highlight_config: OnceCell::new(), tags_config: OnceCell::new(), highlight_names: &*self.highlight_names, @@ -574,7 +579,7 @@ impl<'a> LanguageConfiguration<'a> { pub fn tags_config(&self, language: Language) -> Result> { self.tags_config .get_or_try_init(|| { - let tags_query = self.read_queries(&self.highlights_filenames, "tags.scm")?; + let tags_query = self.read_queries(&self.tags_filenames, "tags.scm")?; let locals_query = 
self.read_queries(&self.locals_filenames, "locals.scm")?; if tags_query.is_empty() { Ok(None) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 0e768b71..b46f3a07 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -7,6 +7,7 @@ use tree_sitter::{ /// Contains the data neeeded to compute tags for code written in a /// particular language. +#[derive(Debug)] pub struct TagsConfiguration { pub language: Language, pub query: Query, @@ -26,7 +27,7 @@ pub struct TagsContext { cursor: QueryCursor, } -#[derive(Default)] +#[derive(Debug, Default)] struct PatternInfo { docs_adjacent_capture: Option, doc_strip_regex: Option, @@ -229,7 +230,7 @@ where } if index == self.config.name_capture_index { - name = str::from_utf8(&self.source[Some(capture.node)?.byte_range()]).ok(); + name = str::from_utf8(&self.source[capture.node.byte_range()]).ok(); } else if index == self.config.doc_capture_index { doc_nodes.push(capture.node); } else if index == self.config.call_capture_index { From 65f2874b9ebadd8fbfe6a7c84fe4cc2bd349d332 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 16 Mar 2020 14:02:31 -0700 Subject: [PATCH 26/42] query: Optimize handling of patterns with a wildcard at the root Avoid adding and removing states for these patterns on every node in the tree by just skipping the wildcard step of the matching process --- cli/src/tests/query_test.rs | 32 ++++++++++++++++++--- cli/src/tests/tags_test.rs | 2 +- lib/src/query.c | 57 +++++++++++++++++++++++++------------ 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 7539bec2..0daa4d5a 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -439,6 +439,10 @@ fn test_query_matches_with_named_wildcard() { fn test_query_matches_with_wildcard_at_the_root() { allocations::record(|| { let language = get_language("javascript"); + let mut cursor = QueryCursor::new(); + let mut parser = Parser::new(); + 
parser.set_language(language).unwrap(); + let query = Query::new( language, " @@ -453,16 +457,36 @@ fn test_query_matches_with_wildcard_at_the_root() { let source = "/* one */ var x; /* two */ function y() {} /* three */ class Z {}"; - let mut parser = Parser::new(); - parser.set_language(language).unwrap(); let tree = parser.parse(source, None).unwrap(); - let mut cursor = QueryCursor::new(); let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); - assert_eq!( collect_matches(matches, &query, source), &[(0, vec![("doc", "/* two */"), ("name", "y")]),] ); + + let query = Query::new( + language, + " + (* (string) @a) + (* (number) @b) + (* (true) @c) + (* (false) @d) + ", + ) + .unwrap(); + + let source = "['hi', x(true), {y: false}]"; + + let tree = parser.parse(source, None).unwrap(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); + assert_eq!( + collect_matches(matches, &query, source), + &[ + (0, vec![("a", "'hi'")]), + (2, vec![("c", "true")]), + (3, vec![("d", "false")]), + ] + ); }); } diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 65d2096f..c8c3a969 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -78,7 +78,7 @@ fn test_tags_javascript() { . (method_definition name: (property_identifier) @name) @method) - (select-adjacent! @doc @method) +; (select-adjacent! @doc @method) (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) "#, "", diff --git a/lib/src/query.c b/lib/src/query.c index aea2415e..20c44fbb 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -44,8 +44,9 @@ typedef struct { TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; uint16_t repeat_step_index; - uint16_t depth: 12; + uint16_t depth: 11; bool contains_captures: 1; + bool is_pattern_start: 1; bool is_immediate: 1; bool is_last: 1; bool is_repeated: 1; @@ -105,9 +106,9 @@ typedef Array(TSQueryCapture) CaptureList; /* * CaptureListPool - A collection of *lists* of captures. 
Each QueryState - * needs to maintain its own list of captures. They are all represented as - * slices of one shared array. The CaptureListPool keeps track of which - * parts of the shared array are currently in use by a QueryState. + * needs to maintain its own list of captures. To avoid repeated allocations, + * the reuses a fixed set of capture lists, and keeps track of which ones + * are currently in use. */ typedef struct { CaptureList list[32]; @@ -128,7 +129,6 @@ struct TSQuery { Array(Slice) predicates_by_pattern; Array(uint32_t) start_bytes_by_pattern; const TSLanguage *language; - uint16_t max_capture_count; uint16_t wildcard_root_pattern_count; TSSymbol *symbol_map; }; @@ -417,6 +417,7 @@ static QueryStep query_step__new( .contains_captures = false, .is_repeated = false, .is_last = false, + .is_pattern_start = false, .is_immediate = is_immediate, .repeat_step_index = NONE, }; @@ -939,16 +940,14 @@ TSQuery *ts_query_new( .predicates_by_pattern = array_new(), .symbol_map = symbol_map, .wildcard_root_pattern_count = 0, - .max_capture_count = 0, .language = language, }; // Parse all of the S-expressions in the given string. Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); - uint32_t start_step_index; while (stream.input < stream.end) { - start_step_index = self->steps.size; + uint32_t start_step_index = self->steps.size; uint32_t capture_count = 0; array_push(&self->start_bytes_by_pattern, stream.input - source); array_push(&self->predicates_by_pattern, ((Slice) { @@ -966,7 +965,19 @@ TSQuery *ts_query_new( return NULL; } + // If a pattern has a wildcard at its root, optimize the matching process + // by skipping matching the wildcard. 
+ if ( + self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL + ) { + QueryStep *second_step = &self->steps.contents[start_step_index + 1]; + if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth != PATTERN_DONE_MARKER) { + start_step_index += 1; + } + } + // Maintain a map that can look up patterns for a given root symbol. + self->steps.contents[start_step_index].is_pattern_start = true; ts_query__pattern_map_insert( self, self->steps.contents[start_step_index].symbol, @@ -975,13 +986,6 @@ TSQuery *ts_query_new( if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; } - - // Keep track of the maximum number of captures in pattern, because - // that numer determines how much space is needed to store each capture - // list. - if (capture_count > self->max_capture_count) { - self->max_capture_count = capture_count; - } } ts_query__finalize_steps(self); @@ -1188,6 +1192,19 @@ static bool ts_query__cursor_add_state( TSQueryCursor *self, const PatternEntry *pattern ) { + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + + // If this pattern begins with a repetition, then avoid creating + // new states after already matching the repetition one or more times. + // The query should only one match for the repetition - the one that + // started the earliest. + if (step->is_repeated) { + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + if (state->step_index == pattern->step_index) return true; + } + } + uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); // If there are no capture lists left in the pool, then terminate whichever @@ -1213,12 +1230,16 @@ static bool ts_query__cursor_add_state( } } - LOG(" start state. pattern:%u\n", pattern->pattern_index); + LOG( + " start state. 
pattern:%u, step:%u\n", + pattern->pattern_index, + pattern->step_index + ); array_push(&self->states, ((QueryState) { .capture_list_id = list_id, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, - .start_depth = self->depth, + .start_depth = self->depth - step->depth, .consumed_capture_count = 0, .repeat_match_count = 0, .step_index_on_failure = NONE, @@ -1453,7 +1474,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // siblings. QueryState *next_state = state; if ( - step->depth > 0 && + !step->is_pattern_start && step->contains_captures && later_sibling_can_match && state->repeat_match_count == 0 From 94bbf14d0e3ed230e079d0cf55e4c1733dc75385 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 16 Mar 2020 14:28:58 -0700 Subject: [PATCH 27/42] tags: Add test where no comments are adjacent to definition --- cli/src/tests/tags_test.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index c8c3a969..0a37b618 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -101,17 +101,28 @@ fn test_tags_javascript() { getAge() { } } + + // ok + + class Agent { + + } "#, ) .collect::>(); assert_eq!( tags.iter().map(|t| (t.name, t.kind)).collect::>(), - &[("getAge", TagKind::Method), ("Customer", TagKind::Class)] + &[ + ("getAge", TagKind::Method), + ("Customer", TagKind::Class), + ("Agent", TagKind::Class) + ] ); assert_eq!(tags[0].docs.as_ref().unwrap(), "Get the customer's age"); assert_eq!( tags[1].docs.as_ref().unwrap(), "Data about a customer.\nbla bla bla" ); + assert_eq!(tags[2].docs, None); } From 94a60b8e139b29ff9926117aa39ce4995c16bc1a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 17 Mar 2020 11:19:30 -0700 Subject: [PATCH 28/42] tags: Start adapting Tag struct for use in C API --- Cargo.lock | 20 +--- cli/src/tags.rs | 13 ++- cli/src/tests/tags_test.rs | 91 ++++++++------- tags/Cargo.toml | 1 + 
tags/src/lib.rs | 219 +++++++++++++++++++------------------ 5 files changed, 176 insertions(+), 168 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76a9973a..3c8da16a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,7 +5,7 @@ name = "aho-corasick" version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -303,13 +303,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "memchr" -version = "2.1.1" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.61 (registry+https://github.com/rust-lang/crates.io-index)", - "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", -] [[package]] name = "nodrop" @@ -536,7 +531,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -794,6 +789,7 @@ dependencies = [ name = "tree-sitter-tags" version = "0.1.6" dependencies = [ + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", @@ -860,11 
+856,6 @@ name = "vec_map" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "version_check" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "void" version = "1.0.2" @@ -944,7 +935,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" "checksum matches 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" -"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" +"checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" "checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" @@ -1005,7 +996,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum url 1.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dd4e7c0d531266369519a4aa4f399d748bd37043b00bde1e4ff1f60a120b355a" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum vec_map 0.8.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" -"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum webbrowser 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c01efd7cb6939b7f34983f1edff0550e5b21b49e2db4495656295922df8939ac" "checksum widestring 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "effc0e4ff8085673ea7b9b2e3c73f6bd4d118810c9009ed8f1e16bd96c331db6" diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 8cf0d611..de9eac1c 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -1,8 +1,8 @@ use super::loader::Loader; use crate::error::{Error, Result}; -use std::fs; use std::io::{self, Write}; use std::path::Path; +use std::{fs, str}; use tree_sitter_tags::TagsContext; pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> Result<()> { @@ -34,8 +34,15 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> if let Some(tags_config) = language_config.tags_config(language)? 
{ let source = fs::read(path)?; for tag in context.generate_tags(tags_config, &source) { - serde_json::to_writer(&mut stdout, &tag)?; - stdout.write(b"\n")?; + writeln!( + &mut stdout, + "{}\t{}\t{} - {}\tdocs:{}", + tag.kind, + str::from_utf8(&source[tag.name_range]).unwrap_or(""), + tag.span.start, + tag.span.end, + tag.docs.unwrap_or(String::new()), + )?; } } else { eprintln!("No tags config found for path {:?}", path); diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 0a37b618..bca35f71 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -26,28 +26,29 @@ fn test_tags_python() { ) .unwrap(); + let source = br#" + class Customer: + """ + Data about a customer + """ + + def age(self): + ''' + Get the customer's age + ''' + compute_age(self.id) + } + "#; + let mut tag_context = TagsContext::new(); let tags = tag_context - .generate_tags( - &tags_config, - br#" - class Customer: - """ - Data about a customer - """ - - def age(self): - ''' - Get the customer's age - ''' - compute_age(self.id) - } - "#, - ) + .generate_tags(&tags_config, source) .collect::>(); assert_eq!( - tags.iter().map(|t| (t.name, t.kind)).collect::>(), + tags.iter() + .map(|t| (substr(source, &t.name_range), t.kind)) + .collect::>(), &[ ("Customer", TagKind::Class), ("age", TagKind::Function), @@ -55,8 +56,12 @@ fn test_tags_python() { ] ); + assert_eq!(substr(source, &tags[0].line_range), " class Customer:"); + assert_eq!( + substr(source, &tags[1].line_range), + " def age(self):" + ); assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer"); - assert_eq!(tags[0].line, "class Customer:"); assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); } @@ -86,33 +91,33 @@ fn test_tags_javascript() { .unwrap(); let mut tag_context = TagsContext::new(); + let source = br#" + // hi + + // Data about a customer. 
+ // bla bla bla + class Customer { + /* + * Get the customer's age + */ + getAge() { + } + } + + // ok + + class Agent { + + } + "#; let tags = tag_context - .generate_tags( - &tags_config, - br#" - // hi - - // Data about a customer. - // bla bla bla - class Customer { - /* - * Get the customer's age - */ - getAge() { - } - } - - // ok - - class Agent { - - } - "#, - ) + .generate_tags(&tags_config, source) .collect::>(); assert_eq!( - tags.iter().map(|t| (t.name, t.kind)).collect::>(), + tags.iter() + .map(|t| (substr(source, &t.name_range), t.kind)) + .collect::>(), &[ ("getAge", TagKind::Method), ("Customer", TagKind::Class), @@ -126,3 +131,7 @@ fn test_tags_javascript() { ); assert_eq!(tags[2].docs, None); } + +fn substr<'a>(source: &'a [u8], range: &std::ops::Range) -> &'a str { + std::str::from_utf8(&source[range.clone()]).unwrap() +} diff --git a/tags/Cargo.toml b/tags/Cargo.toml index 1d47c951..e6fc2425 100644 --- a/tags/Cargo.toml +++ b/tags/Cargo.toml @@ -20,6 +20,7 @@ crate-type = ["lib", "staticlib"] regex = "1" serde_json = "1.0" serde_derive = "1.0" +memchr = "2.3" [dependencies.serde] version = "1.0" diff --git a/tags/src/lib.rs b/tags/src/lib.rs index b46f3a07..89eafa14 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,10 +1,13 @@ +use memchr::{memchr, memrchr}; use regex::Regex; -use serde::{Serialize, Serializer}; -use std::{mem, ops, str}; +use std::ops::Range; +use std::{fmt, mem, str}; use tree_sitter::{ - Language, Node, Parser, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, + Language, Node, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; +const MAX_LINE_LEN: usize = 180; + /// Contains the data neeeded to compute tags for code written in a /// particular language. 
#[derive(Debug)] @@ -27,6 +30,31 @@ pub struct TagsContext { cursor: QueryCursor, } +#[derive(Debug, Clone)] +pub struct Tag { + pub kind: TagKind, + pub range: Range, + pub name_range: Range, + pub line_range: Range, + pub span: Range, + pub docs: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum TagKind { + Function, + Method, + Class, + Module, + Call, +} + +#[derive(Debug)] +pub enum Error { + Query(QueryError), + Regex(regex::Error), +} + #[derive(Debug, Default)] struct PatternInfo { docs_adjacent_capture: Option, @@ -41,43 +69,7 @@ where _tree: Tree, source: &'a [u8], config: &'a TagsConfiguration, - tag_queue: Vec<(Node<'a>, usize, Tag<'a>)>, -} - -#[derive(Debug, Serialize, Clone)] -pub struct Loc { - pub byte_range: ops::Range, - pub span: ops::Range, -} - -#[derive(Debug, Serialize, Clone)] -pub struct Pos { - pub line: i64, - pub column: i64, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum TagKind { - Function, - Method, - Class, - Module, - Call, -} - -#[derive(Debug, Serialize, Clone)] -pub struct Tag<'a> { - pub kind: TagKind, - pub name: &'a str, - pub docs: Option, - pub loc: Loc, - pub line: &'a str, -} - -#[derive(Debug)] -pub enum Error { - Query(QueryError), - Regex(regex::Error), + tag_queue: Vec<(Node<'a>, usize, Tag)>, } impl TagsConfiguration { @@ -163,12 +155,11 @@ impl TagsContext { } } - // TODO: This should return an iterator rather than build up a vector pub fn generate_tags<'a>( &'a mut self, config: &'a TagsConfiguration, source: &'a [u8], - ) -> impl Iterator> + 'a { + ) -> impl Iterator + 'a { self.parser .set_language(config.language) .expect("Incompatible language"); @@ -199,9 +190,9 @@ impl<'a, I> Iterator for TagsIter<'a, I> where I: Iterator>, { - type Item = Tag<'a>; + type Item = Tag; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option { loop { // If there is a queued tag for an earlier node in the syntax tree, then pop // it off of the queue and return it. 
@@ -216,10 +207,10 @@ where // If there is another match, then compute its tag and add it to the // tag queue. if let Some(mat) = self.matches.next() { - let mut name = None; + let mut name_range = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; - let mut tag_kind = TagKind::Call; + let mut kind = TagKind::Call; let mut docs_adjacent_node = None; for capture in mat.captures { @@ -230,28 +221,28 @@ where } if index == self.config.name_capture_index { - name = str::from_utf8(&self.source[capture.node.byte_range()]).ok(); + name_range = Some(capture.node.byte_range()); } else if index == self.config.doc_capture_index { doc_nodes.push(capture.node); } else if index == self.config.call_capture_index { tag_node = Some(capture.node); - tag_kind = TagKind::Call; + kind = TagKind::Call; } else if index == self.config.class_capture_index { tag_node = Some(capture.node); - tag_kind = TagKind::Class; + kind = TagKind::Class; } else if index == self.config.function_capture_index { tag_node = Some(capture.node); - tag_kind = TagKind::Function; + kind = TagKind::Function; } else if index == self.config.method_capture_index { tag_node = Some(capture.node); - tag_kind = TagKind::Method; + kind = TagKind::Method; } else if index == self.config.module_capture_index { tag_node = Some(capture.node); - tag_kind = TagKind::Module; + kind = TagKind::Module; } } - if let (Some(tag_node), Some(name)) = (tag_node, name) { + if let (Some(tag_node), Some(name_range)) = (tag_node, name_range) { // If needed, filter the doc nodes based on their ranges, selecting // only the slice that are adjacent to some specified node. let mut docs_start_index = 0; @@ -293,41 +284,42 @@ where } } - let source = &self.source; - let tag_from_node = |node: Node, kind: TagKind| -> Option { - // Slice out the first line of the text corresponding to the node in question. 
- let mut line_range = node.byte_range(); - line_range.end = line_range.end.min(line_range.start + 180); - let line = str::from_utf8(&source[line_range]).ok()?.lines().next()?; - Some(Tag { - name, - line, - kind, - docs, - loc: loc_for_node(node), - }) - }; - // Only create one tag per node. The tag queue is sorted by node position // to allow for fast lookup. + let range = tag_node.byte_range(); match self.tag_queue.binary_search_by_key( - &(tag_node.end_byte(), tag_node.start_byte(), tag_node.id()), + &(range.end, range.start, tag_node.id()), |(node, _, _)| (node.end_byte(), node.start_byte(), node.id()), ) { Ok(i) => { - let (_, old_idx, tag) = &mut self.tag_queue[i]; - if *old_idx > mat.pattern_index { - if let Some(new_tag) = tag_from_node(tag_node, tag_kind) { - *tag = new_tag; - *old_idx = mat.pattern_index; - } - } - } - Err(i) => { - if let Some(tag) = tag_from_node(tag_node, tag_kind) { - self.tag_queue.insert(i, (tag_node, mat.pattern_index, tag)) + let (_, pattern_index, tag) = &mut self.tag_queue[i]; + if *pattern_index > mat.pattern_index { + *pattern_index = mat.pattern_index; + *tag = Tag { + line_range: line_range(self.source, range.start, MAX_LINE_LEN), + span: tag_node.start_position()..tag_node.start_position(), + kind, + range, + name_range, + docs, + }; } } + Err(i) => self.tag_queue.insert( + i, + ( + tag_node, + mat.pattern_index, + Tag { + line_range: line_range(self.source, range.start, MAX_LINE_LEN), + span: tag_node.start_position()..tag_node.start_position(), + kind, + range, + name_range, + docs, + }, + ), + ), } } } @@ -341,35 +333,19 @@ where } } -impl Serialize for TagKind { - fn serialize(&self, s: S) -> Result - where - S: Serializer, - { - match self { - TagKind::Call => "Call", - TagKind::Module => "Module", - TagKind::Class => "Class", - TagKind::Method => "Method", - TagKind::Function => "Function", - } - .serialize(s) - } -} - -fn loc_for_node(node: Node) -> Loc { - Loc { - byte_range: node.byte_range(), - span: 
node.start_position().into()..node.start_position().into(), - } -} - -impl From for Pos { - fn from(point: tree_sitter::Point) -> Self { - return Pos { - line: point.row as i64, - column: point.column as i64, - }; +impl fmt::Display for TagKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + TagKind::Call => "Call", + TagKind::Module => "Module", + TagKind::Class => "Class", + TagKind::Method => "Method", + TagKind::Function => "Function", + } + ) } } @@ -384,3 +360,28 @@ impl From for Error { Error::Query(error) } } + +fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { + let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1); + let max_line_len = max_line_len.min(text.len() - start); + let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len); + start..end +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_line() { + let text = b"abc\ndefg\nhijkl"; + assert_eq!(line_range(text, 0, 10), 0..3); + assert_eq!(line_range(text, 1, 10), 0..3); + assert_eq!(line_range(text, 2, 10), 0..3); + assert_eq!(line_range(text, 3, 10), 0..3); + assert_eq!(line_range(text, 1, 2), 0..2); + assert_eq!(line_range(text, 4, 10), 4..8); + assert_eq!(line_range(text, 5, 10), 4..8); + assert_eq!(line_range(text, 11, 10), 9..14); + } +} From 591e066226524ff1237525e87ceb0c97ad52def3 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 17 Mar 2020 12:05:09 -0700 Subject: [PATCH 29/42] tags: Make cli output more human readable --- cli/src/tags.rs | 16 +++++++++++++--- tags/src/lib.rs | 19 ++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index de9eac1c..86eb2e33 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -32,17 +32,27 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> }; if let Some(tags_config) = language_config.tags_config(language)? 
{ + let path_str = format!("{:?}", path); + writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; + let source = fs::read(path)?; for tag in context.generate_tags(tags_config, &source) { - writeln!( + write!( &mut stdout, - "{}\t{}\t{} - {}\tdocs:{}", + " {:<8}\t{:<40}\t{:>9}-{:<9}", tag.kind, str::from_utf8(&source[tag.name_range]).unwrap_or(""), tag.span.start, tag.span.end, - tag.docs.unwrap_or(String::new()), )?; + if let Some(docs) = tag.docs { + if docs.len() > 120 { + write!(&mut stdout, "\t{:?}...", &docs[0..120])?; + } else { + write!(&mut stdout, "\t{:?}", &docs)?; + } + } + writeln!(&mut stdout, "")?; } } else { eprintln!("No tags config found for path {:?}", path); diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 89eafa14..8e3625e5 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -335,17 +335,14 @@ where impl fmt::Display for TagKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{}", - match self { - TagKind::Call => "Call", - TagKind::Module => "Module", - TagKind::Class => "Class", - TagKind::Method => "Method", - TagKind::Function => "Function", - } - ) + match self { + TagKind::Call => "Call", + TagKind::Module => "Module", + TagKind::Class => "Class", + TagKind::Method => "Method", + TagKind::Function => "Function", + } + .fmt(f) } } From e3e1bdba759f3ac4a4a03e891a930d4629518c0e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 17 Mar 2020 13:15:34 -0700 Subject: [PATCH 30/42] tags: Start work on C API --- Cargo.lock | 6 - tags/Cargo.toml | 8 +- tags/include/tree_sitter/tags.h | 92 ++++++++++++++++ tags/src/c_lib.rs | 188 ++++++++++++++++++++++++++++++++ tags/src/lib.rs | 2 + 5 files changed, 283 insertions(+), 13 deletions(-) create mode 100644 tags/include/tree_sitter/tags.h create mode 100644 tags/src/c_lib.rs diff --git a/Cargo.lock b/Cargo.lock index 3c8da16a..2c298eed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,9 +603,6 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" name = "serde" version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", -] [[package]] name = "serde_derive" @@ -791,9 +788,6 @@ version = "0.1.6" dependencies = [ "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.6.3", ] diff --git a/tags/Cargo.toml b/tags/Cargo.toml index e6fc2425..43557bb2 100644 --- a/tags/Cargo.toml +++ b/tags/Cargo.toml @@ -9,7 +9,7 @@ authors = [ license = "MIT" readme = "README.md" edition = "2018" -keywords = ["incremental", "parsing", "syntax", "highlighting"] +keywords = ["incremental", "parsing", "syntax", "tagging"] categories = ["parsing", "text-editors"] repository = "https://github.com/tree-sitter/tree-sitter" @@ -18,14 +18,8 @@ crate-type = ["lib", "staticlib"] [dependencies] regex = "1" -serde_json = "1.0" -serde_derive = "1.0" memchr = "2.3" -[dependencies.serde] -version = "1.0" -features = ["derive"] - [dependencies.tree-sitter] version = ">= 0.3.7" path = "../lib" diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h new file mode 100644 index 00000000..d492ad31 --- /dev/null +++ b/tags/include/tree_sitter/tags.h @@ -0,0 +1,92 @@ +#ifndef TREE_SITTER_TAGS_H_ +#define TREE_SITTER_TAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "tree_sitter/api.h" + +typedef enum { + TSTagsOk, + TSTagsUnknownScope, + TSTagsTimeout, + TSTagsInvalidLanguage, + TSTagsInvalidUtf8, + TSTagsInvalidRegex, + TSTagsInvalidQuery, +} TSTagsError; + +typedef enum { + TSTagKindFunction, + 
TSTagKindMethod, + TSTagKindClass, + TSTagKindModule, + TSTagKindCall, +} TSTagKind; + +typedef struct { + TSTagKind kind; + uint32_t start_byte; + uint32_t end_byte; + uint32_t name_start_byte; + uint32_t name_end_byte; + uint32_t line_start_byte; + uint32_t line_end_byte; + TSPoint start_point; + TSPoint end_point; + const char *docs; + uint32_t docs_length; +} TSTag; + +typedef struct TSTagger TSTagger; +typedef struct TSTagsBuffer TSTagsBuffer; + +// Construct a tagger. +TSTagger *ts_tagger_new(); + +// Delete a tagger. +void ts_tagger_delete(TSTagger *); + +// Add a `TSLanguage` to a tagger. The language is associated with a scope name, +// which can be used later to select a language for tagging. Along with the language, +// you must provide two tree query strings, one for matching tags themselves, and one +// specifying local variable definitions. +TSTagsError ts_tagger_add_language( + TSTagger *self, + const char *scope_name, + const TSLanguage *language, + const char *tags_query, + const char *locals_query, + uint32_t tags_query_len, + uint32_t locals_query_len +); + +// Compute syntax highlighting for a given document. You must first +// create a `TSTagsBuffer` to hold the output. +TSTagsError ts_tagger_tag( + const TSTagger *self, + const char *scope_name, + const char *source_code, + uint32_t source_code_len, + TSTagsBuffer *output, + const size_t *cancellation_flag +); + +// A tags buffer stores the results produced by a tagging call. It can be reused +// for multiple calls. +TSTagsBuffer *ts_tags_buffer_new(); + +// Delete a tags buffer. +void ts_tags_buffer_delete(TSTagsBuffer *); + +// Access the tags within a tag buffer. 
+const TSTag *ts_tags_buffer_line_offsets(const TSTagsBuffer *); +uint32_t ts_tags_buffer_len(const TSTagsBuffer *); + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_TAGS_H_ diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs new file mode 100644 index 00000000..714d956e --- /dev/null +++ b/tags/src/c_lib.rs @@ -0,0 +1,188 @@ +use super::{Error, TagKind, TagsConfiguration, TagsContext}; +use std::collections::HashMap; +use std::ffi::CStr; +use std::process::abort; +use std::{fmt, slice, str}; +use tree_sitter::Language; + +#[repr(C)] +enum TSTagsError { + Ok, + UnknownScope, + Timeout, + InvalidLanguage, + InvalidUtf8, + InvalidRegex, + InvalidQuery, +} + +#[repr(C)] +enum TSTagKind { + Function, + Method, + Class, + Module, + Call, +} + +#[repr(C)] +struct TSPoint { + row: u32, + column: u32, +} + +#[repr(C)] +struct TSTag { + kind: TSTagKind, + start_byte: u32, + end_byte: u32, + name_start_byte: u32, + name_end_byte: u32, + line_start_byte: u32, + line_end_byte: u32, + start_point: TSPoint, + end_point: TSPoint, + docs: *const u8, + docs_length: u32, +} + +struct TSTagger { + languages: HashMap, +} + +struct TSTagsBuffer { + context: TagsContext, + tags: Vec, + docs: Vec, +} + +#[no_mangle] +unsafe extern "C" fn ts_tagger_add_language( + this: *mut TSTagger, + scope_name: *const i8, + language: Language, + tags_query: *const u8, + locals_query: *const u8, + tags_query_len: u32, + locals_query_len: u32, +) -> TSTagsError { + let tagger = unwrap_mut_ptr(this); + let scope_name = unwrap(CStr::from_ptr(scope_name).to_str()); + let tags_query = slice::from_raw_parts(tags_query, tags_query_len as usize); + let locals_query = slice::from_raw_parts(locals_query, locals_query_len as usize); + let tags_query = match str::from_utf8(tags_query) { + Ok(e) => e, + Err(_) => return TSTagsError::InvalidUtf8, + }; + let locals_query = match str::from_utf8(locals_query) { + Ok(e) => e, + Err(_) => return TSTagsError::InvalidUtf8, + }; + match 
TagsConfiguration::new(language, tags_query, locals_query) { + Ok(c) => { + tagger.languages.insert(scope_name.to_string(), c); + TSTagsError::Ok + } + Err(Error::Query(_)) => TSTagsError::InvalidQuery, + Err(Error::Regex(_)) => TSTagsError::InvalidRegex, + } +} + +#[no_mangle] +unsafe extern "C" fn ts_tagger_tag( + this: *mut TSTagger, + scope_name: *const i8, + source_code: *const u8, + source_code_len: u32, + output: *mut TSTagsBuffer, + cancellation_flag: *const usize, +) -> TSTagsError { + let tagger = unwrap_mut_ptr(this); + let buffer = unwrap_mut_ptr(output); + let scope_name = unwrap(CStr::from_ptr(scope_name).to_str()); + if let Some(config) = tagger.languages.get(scope_name) { + let source_code = slice::from_raw_parts(source_code, source_code_len as usize); + for tag in buffer.context.generate_tags(config, source_code) { + let prev_docs_len = buffer.docs.len(); + if let Some(docs) = tag.docs { + buffer.docs.extend_from_slice(docs.as_bytes()); + } + let docs = &buffer.docs[prev_docs_len..]; + buffer.tags.push(TSTag { + kind: match tag.kind { + TagKind::Function => TSTagKind::Function, + TagKind::Method => TSTagKind::Method, + TagKind::Class => TSTagKind::Class, + TagKind::Module => TSTagKind::Module, + TagKind::Call => TSTagKind::Call, + }, + start_byte: tag.range.start as u32, + end_byte: tag.range.end as u32, + name_start_byte: tag.name_range.start as u32, + name_end_byte: tag.name_range.end as u32, + line_start_byte: tag.line_range.start as u32, + line_end_byte: tag.line_range.end as u32, + start_point: TSPoint { + row: tag.span.start.row as u32, + column: tag.span.start.column as u32, + }, + end_point: TSPoint { + row: tag.span.end.row as u32, + column: tag.span.end.column as u32, + }, + docs: docs.as_ptr(), + docs_length: docs.len() as u32, + }); + } + TSTagsError::Ok + } else { + TSTagsError::UnknownScope + } +} + +#[no_mangle] +extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { + Box::into_raw(Box::new(TSTagsBuffer { + context: 
TagsContext::new(), + tags: Vec::new(), + docs: Vec::new(), + })) +} + +#[no_mangle] +extern "C" fn ts_tags_buffer_delete(this: *mut TSTagsBuffer) { + drop(unsafe { Box::from_raw(this) }) +} + +#[no_mangle] +extern "C" fn ts_tags_buffer_line_offsets(this: *const TSTagsBuffer) -> *const TSTag { + let buffer = unwrap_ptr(this); + buffer.tags.as_ptr() +} + +#[no_mangle] +extern "C" fn ts_tags_buffer_len(this: *const TSTagsBuffer) -> u32 { + let buffer = unwrap_ptr(this); + buffer.tags.len() as u32 +} + +fn unwrap_ptr<'a, T>(result: *const T) -> &'a T { + unsafe { result.as_ref() }.unwrap_or_else(|| { + eprintln!("{}:{} - pointer must not be null", file!(), line!()); + abort(); + }) +} + +fn unwrap_mut_ptr<'a, T>(result: *mut T) -> &'a mut T { + unsafe { result.as_mut() }.unwrap_or_else(|| { + eprintln!("{}:{} - pointer must not be null", file!(), line!()); + abort(); + }) +} + +fn unwrap(result: Result) -> T { + result.unwrap_or_else(|error| { + eprintln!("tree-sitter tag error: {}", error); + abort(); + }) +} diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 8e3625e5..5f579d1d 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,3 +1,5 @@ +mod c_lib; + use memchr::{memchr, memrchr}; use regex::Regex; use std::ops::Range; From 651fa38c93636acc9353d9ee837248b76a918c2f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 18 Mar 2020 10:38:20 -0700 Subject: [PATCH 31/42] Add unit test for tagging via C API. 
Fix docs handling --- cli/src/tests/tags_test.rs | 187 +++++++++++++++++++++++++------- tags/include/tree_sitter/tags.h | 8 +- tags/src/c_lib.rs | 90 +++++++++------ tags/src/lib.rs | 2 +- 4 files changed, 211 insertions(+), 76 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index bca35f71..d4cbc687 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -1,30 +1,56 @@ +use super::helpers::allocations; use super::helpers::fixtures::get_language; +use std::ffi::CString; +use std::{ptr, slice, str}; +use tree_sitter_tags::c_lib as c; use tree_sitter_tags::{TagKind, TagsConfiguration, TagsContext}; +const PYTHON_TAG_QUERY: &'static str = r#" +((function_definition + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @function + (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) +(function_definition + name: (identifier) @name) @function +((class_definition + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @class + (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) +(class_definition + name: (identifier) @name) @class +(call + function: (identifier) @name) @call +"#; + +const JS_TAG_QUERY: &'static str = r#" +((* + (comment)+ @doc . + (class_declaration + name: (identifier) @name) @class) + (select-adjacent! @doc @class) + (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + +((* + (comment)+ @doc . + (method_definition + name: (property_identifier) @name) @method) + (select-adjacent! @doc @method) + (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + +((* + (comment)+ @doc . + (function_declaration + name: (identifier) @name) @function) + (select-adjacent! @doc @function) + (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) + +(call_expression function: (identifier) @name) @call + "#; + #[test] fn test_tags_python() { let language = get_language("python"); - let tags_config = TagsConfiguration::new( - language, - r#" - ((function_definition - name: (identifier) @name - body: (block . 
(expression_statement (string) @doc))) @function - (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) - (function_definition - name: (identifier) @name) @function - ((class_definition - name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @class - (strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)")) - (class_definition - name: (identifier) @name) @class - (call - function: (identifier) @name) @call - "#, - "", - ) - .unwrap(); + let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); let source = br#" class Customer: @@ -68,27 +94,7 @@ fn test_tags_python() { #[test] fn test_tags_javascript() { let language = get_language("javascript"); - let tags_config = TagsConfiguration::new( - language, - r#" - ((* - (comment)+ @doc - . - (class_declaration - name: (identifier) @name) @class) - (select-adjacent! @doc @class) - (strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) - ((* - (comment)+ @doc - . - (method_definition - name: (property_identifier) @name) @method) -; (select-adjacent! @doc @method) - (strip! 
@doc "(^[/\\*\\s]*)|([/\\*\\s]*$)")) - "#, - "", - ) - .unwrap(); + let tags_config = TagsConfiguration::new(language, JS_TAG_QUERY, "").unwrap(); let mut tag_context = TagsContext::new(); let source = br#" @@ -132,6 +138,103 @@ fn test_tags_javascript() { assert_eq!(tags[2].docs, None); } +#[test] +fn test_tags_via_c_api() { + allocations::record(|| { + let tagger = c::ts_tagger_new(); + let buffer = c::ts_tags_buffer_new(); + let scope_name = "source.js"; + let language = get_language("javascript"); + + let source_code = " + var a = 1; + + // one + // two + // three + function b() { + } + + // four + // five + class C extends D { + + } + + b(a);" + .lines() + .skip(1) + // remove extra indentation + .map(|line| &line[line.len().min(12)..]) + .collect::>() + .join("\n"); + + let c_scope_name = CString::new(scope_name).unwrap(); + let result = c::ts_tagger_add_language( + tagger, + c_scope_name.as_ptr(), + language, + JS_TAG_QUERY.as_ptr(), + ptr::null(), + JS_TAG_QUERY.len() as u32, + 0, + ); + assert_eq!(result, c::TSTagsError::Ok); + + let result = c::ts_tagger_tag( + tagger, + c_scope_name.as_ptr(), + source_code.as_ptr(), + source_code.len() as u32, + buffer, + ptr::null(), + ); + assert_eq!(result, c::TSTagsError::Ok); + let tags = unsafe { + slice::from_raw_parts( + c::ts_tags_buffer_tags(buffer), + c::ts_tags_buffer_tags_len(buffer) as usize, + ) + }; + let docs = str::from_utf8(unsafe { + slice::from_raw_parts( + c::ts_tags_buffer_docs(buffer) as *const u8, + c::ts_tags_buffer_docs_len(buffer) as usize, + ) + }) + .unwrap(); + + assert_eq!( + tags.iter() + .map(|tag| ( + tag.kind, + &source_code[tag.name_start_byte as usize..tag.name_end_byte as usize], + &source_code[tag.line_start_byte as usize..tag.line_end_byte as usize], + &docs[tag.docs_start_byte as usize..tag.docs_end_byte as usize], + )) + .collect::>(), + &[ + ( + c::TSTagKind::Function, + "b", + "function b() {", + "one\ntwo\nthree" + ), + ( + c::TSTagKind::Class, + "C", + "class C extends D {", 
+ "four\nfive" + ), + (c::TSTagKind::Call, "b", "b(a);", "") + ] + ); + + c::ts_tags_buffer_delete(buffer); + c::ts_tagger_delete(tagger); + }); +} + fn substr<'a>(source: &'a [u8], range: &std::ops::Range) -> &'a str { std::str::from_utf8(&source[range.clone()]).unwrap() } diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index d492ad31..6054edc4 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -82,8 +82,12 @@ TSTagsBuffer *ts_tags_buffer_new(); void ts_tags_buffer_delete(TSTagsBuffer *); // Access the tags within a tag buffer. -const TSTag *ts_tags_buffer_line_offsets(const TSTagsBuffer *); -uint32_t ts_tags_buffer_len(const TSTagsBuffer *); +const TSTag *ts_tags_buffer_tags(const TSTagsBuffer *); +uint32_t ts_tags_buffer_tags_len(const TSTagsBuffer *); + +// Access the string containing all of the docs +const char *ts_tags_buffer_docs(const TSTagsBuffer *); +uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); #ifdef __cplusplus } diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 714d956e..83ef9c5f 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -6,7 +6,8 @@ use std::{fmt, slice, str}; use tree_sitter::Language; #[repr(C)] -enum TSTagsError { +#[derive(Debug, PartialEq, Eq)] +pub enum TSTagsError { Ok, UnknownScope, Timeout, @@ -17,7 +18,8 @@ enum TSTagsError { } #[repr(C)] -enum TSTagKind { +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TSTagKind { Function, Method, Class, @@ -26,38 +28,49 @@ enum TSTagKind { } #[repr(C)] -struct TSPoint { +pub struct TSPoint { row: u32, column: u32, } #[repr(C)] -struct TSTag { - kind: TSTagKind, - start_byte: u32, - end_byte: u32, - name_start_byte: u32, - name_end_byte: u32, - line_start_byte: u32, - line_end_byte: u32, - start_point: TSPoint, - end_point: TSPoint, - docs: *const u8, - docs_length: u32, +pub struct TSTag { + pub kind: TSTagKind, + pub start_byte: u32, + pub end_byte: u32, + pub name_start_byte: u32, + pub 
name_end_byte: u32, + pub line_start_byte: u32, + pub line_end_byte: u32, + pub start_point: TSPoint, + pub end_point: TSPoint, + pub docs_start_byte: u32, + pub docs_end_byte: u32, } -struct TSTagger { +pub struct TSTagger { languages: HashMap, } -struct TSTagsBuffer { +pub struct TSTagsBuffer { context: TagsContext, tags: Vec, docs: Vec, } #[no_mangle] -unsafe extern "C" fn ts_tagger_add_language( +pub extern "C" fn ts_tagger_new() -> *mut TSTagger { + Box::into_raw(Box::new(TSTagger { + languages: HashMap::new(), + })) +} + +pub extern "C" fn ts_tagger_delete(this: *mut TSTagger) { + drop(unsafe { Box::from_raw(this) }) +} + +#[no_mangle] +pub extern "C" fn ts_tagger_add_language( this: *mut TSTagger, scope_name: *const i8, language: Language, @@ -67,9 +80,9 @@ unsafe extern "C" fn ts_tagger_add_language( locals_query_len: u32, ) -> TSTagsError { let tagger = unwrap_mut_ptr(this); - let scope_name = unwrap(CStr::from_ptr(scope_name).to_str()); - let tags_query = slice::from_raw_parts(tags_query, tags_query_len as usize); - let locals_query = slice::from_raw_parts(locals_query, locals_query_len as usize); + let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; + let tags_query = unsafe { slice::from_raw_parts(tags_query, tags_query_len as usize) }; + let locals_query = unsafe { slice::from_raw_parts(locals_query, locals_query_len as usize) }; let tags_query = match str::from_utf8(tags_query) { Ok(e) => e, Err(_) => return TSTagsError::InvalidUtf8, @@ -89,7 +102,7 @@ unsafe extern "C" fn ts_tagger_add_language( } #[no_mangle] -unsafe extern "C" fn ts_tagger_tag( +pub extern "C" fn ts_tagger_tag( this: *mut TSTagger, scope_name: *const i8, source_code: *const u8, @@ -99,15 +112,17 @@ unsafe extern "C" fn ts_tagger_tag( ) -> TSTagsError { let tagger = unwrap_mut_ptr(this); let buffer = unwrap_mut_ptr(output); - let scope_name = unwrap(CStr::from_ptr(scope_name).to_str()); + let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; 
if let Some(config) = tagger.languages.get(scope_name) { - let source_code = slice::from_raw_parts(source_code, source_code_len as usize); + buffer.tags.clear(); + buffer.docs.clear(); + let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) }; + for tag in buffer.context.generate_tags(config, source_code) { let prev_docs_len = buffer.docs.len(); if let Some(docs) = tag.docs { buffer.docs.extend_from_slice(docs.as_bytes()); } - let docs = &buffer.docs[prev_docs_len..]; buffer.tags.push(TSTag { kind: match tag.kind { TagKind::Function => TSTagKind::Function, @@ -130,10 +145,11 @@ unsafe extern "C" fn ts_tagger_tag( row: tag.span.end.row as u32, column: tag.span.end.column as u32, }, - docs: docs.as_ptr(), - docs_length: docs.len() as u32, + docs_start_byte: prev_docs_len as u32, + docs_end_byte: buffer.docs.len() as u32, }); } + TSTagsError::Ok } else { TSTagsError::UnknownScope @@ -141,7 +157,7 @@ unsafe extern "C" fn ts_tagger_tag( } #[no_mangle] -extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { +pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { Box::into_raw(Box::new(TSTagsBuffer { context: TagsContext::new(), tags: Vec::new(), @@ -150,22 +166,34 @@ extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { } #[no_mangle] -extern "C" fn ts_tags_buffer_delete(this: *mut TSTagsBuffer) { +pub extern "C" fn ts_tags_buffer_delete(this: *mut TSTagsBuffer) { drop(unsafe { Box::from_raw(this) }) } #[no_mangle] -extern "C" fn ts_tags_buffer_line_offsets(this: *const TSTagsBuffer) -> *const TSTag { +pub extern "C" fn ts_tags_buffer_tags(this: *const TSTagsBuffer) -> *const TSTag { let buffer = unwrap_ptr(this); buffer.tags.as_ptr() } #[no_mangle] -extern "C" fn ts_tags_buffer_len(this: *const TSTagsBuffer) -> u32 { +pub extern "C" fn ts_tags_buffer_tags_len(this: *const TSTagsBuffer) -> u32 { let buffer = unwrap_ptr(this); buffer.tags.len() as u32 } +#[no_mangle] +pub extern "C" fn ts_tags_buffer_docs(this: *const 
TSTagsBuffer) -> *const i8 { + let buffer = unwrap_ptr(this); + buffer.docs.as_ptr() as *const i8 +} + +#[no_mangle] +pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 { + let buffer = unwrap_ptr(this); + buffer.docs.len() as u32 +} + fn unwrap_ptr<'a, T>(result: *const T) -> &'a T { unsafe { result.as_ref() }.unwrap_or_else(|| { eprintln!("{}:{} - pointer must not be null", file!(), line!()); diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 5f579d1d..e5695845 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,4 +1,4 @@ -mod c_lib; +pub mod c_lib; use memchr::{memchr, memrchr}; use regex::Regex; From aedab72afa969423d471dabd62d5fe42ec854372 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Mar 2020 10:13:03 -0700 Subject: [PATCH 32/42] tags: Start work on handling local variables for ruby support --- cli/src/tests/tags_test.rs | 94 +++++++++++++++++++++---- tags/src/lib.rs | 139 +++++++++++++++++++++++++++++++------ 2 files changed, 197 insertions(+), 36 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index d4cbc687..756f63e7 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -1,7 +1,7 @@ use super::helpers::allocations; -use super::helpers::fixtures::get_language; +use super::helpers::fixtures::{get_language, get_language_queries_path}; use std::ffi::CString; -use std::{ptr, slice, str}; +use std::{fs, ptr, slice, str}; use tree_sitter_tags::c_lib as c; use tree_sitter_tags::{TagKind, TagsConfiguration, TagsContext}; @@ -47,10 +47,22 @@ const JS_TAG_QUERY: &'static str = r#" (call_expression function: (identifier) @name) @call "#; +const RUBY_TAG_QUERY: &'static str = r#" +(method + name: (identifier) @name) @method + +(method_call + method: (identifier) @name) @call + +((identifier) @name @call + (is-not? 
local)) +"#; + #[test] fn test_tags_python() { let language = get_language("python"); let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); + let mut tag_context = TagsContext::new(); let source = br#" class Customer: @@ -66,7 +78,6 @@ fn test_tags_python() { } "#; - let mut tag_context = TagsContext::new(); let tags = tag_context .generate_tags(&tags_config, source) .collect::>(); @@ -95,8 +106,6 @@ fn test_tags_python() { fn test_tags_javascript() { let language = get_language("javascript"); let tags_config = TagsConfiguration::new(language, JS_TAG_QUERY, "").unwrap(); - - let mut tag_context = TagsContext::new(); let source = br#" // hi @@ -116,6 +125,8 @@ fn test_tags_javascript() { } "#; + + let mut tag_context = TagsContext::new(); let tags = tag_context .generate_tags(&tags_config, source) .collect::>(); @@ -138,6 +149,58 @@ fn test_tags_javascript() { assert_eq!(tags[2].docs, None); } +#[test] +fn test_tags_ruby() { + let language = get_language("ruby"); + let locals_query = + fs::read_to_string(get_language_queries_path("ruby").join("locals.scm")).unwrap(); + let tags_config = TagsConfiguration::new(language, RUBY_TAG_QUERY, &locals_query).unwrap(); + let source = strip_whitespace( + 8, + " + b = 1 + + def foo() + c = 1 + + # a is a method because it is not in scope + # b is a method because `b` doesn't capture variables from its containing scope + bar a, b, c + + [1, 2, 3].each do |a| + # a is a parameter + # b is a method + # c is a variable, because the block captures variables from its containing scope. 
+ baz a, b, c + end + end", + ); + + let mut tag_context = TagsContext::new(); + let tags = tag_context + .generate_tags(&tags_config, source.as_bytes()) + .collect::>(); + + assert_eq!( + tags.iter() + .map(|t| ( + substr(source.as_bytes(), &t.name_range), + t.kind, + (t.span.start.row, t.span.start.column), + )) + .collect::>(), + &[ + ("foo", TagKind::Method, (2, 0)), + ("bar", TagKind::Call, (7, 4)), + ("a", TagKind::Call, (7, 8)), + ("b", TagKind::Call, (7, 11)), + ("each", TagKind::Call, (9, 14)), + ("baz", TagKind::Call, (13, 8)), + ("b", TagKind::Call, (13, 15),), + ] + ); +} + #[test] fn test_tags_via_c_api() { allocations::record(|| { @@ -146,7 +209,9 @@ fn test_tags_via_c_api() { let scope_name = "source.js"; let language = get_language("javascript"); - let source_code = " + let source_code = strip_whitespace( + 12, + " var a = 1; // one @@ -161,13 +226,8 @@ fn test_tags_via_c_api() { } - b(a);" - .lines() - .skip(1) - // remove extra indentation - .map(|line| &line[line.len().min(12)..]) - .collect::>() - .join("\n"); + b(a);", + ); let c_scope_name = CString::new(scope_name).unwrap(); let result = c::ts_tagger_add_language( @@ -238,3 +298,11 @@ fn test_tags_via_c_api() { fn substr<'a>(source: &'a [u8], range: &std::ops::Range) -> &'a str { std::str::from_utf8(&source[range.clone()]).unwrap() } + +fn strip_whitespace(indent: usize, s: &str) -> String { + s.lines() + .skip(1) + .map(|line| &line[line.len().min(indent)..]) + .collect::>() + .join("\n") +} diff --git a/tags/src/lib.rs b/tags/src/lib.rs index e5695845..c3a52303 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -5,7 +5,7 @@ use regex::Regex; use std::ops::Range; use std::{fmt, mem, str}; use tree_sitter::{ - Language, Node, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, + Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; const MAX_LINE_LEN: usize = 180; @@ -23,8 +23,10 @@ pub struct TagsConfiguration { method_capture_index: 
Option, module_capture_index: Option, name_capture_index: Option, + local_scope_capture_index: Option, + local_definition_capture_index: Option, + tags_pattern_index: usize, pattern_info: Vec, - _locals_pattern_index: usize, } pub struct TagsContext { @@ -60,9 +62,24 @@ pub enum Error { #[derive(Debug, Default)] struct PatternInfo { docs_adjacent_capture: Option, + local_scope_inherits: bool, + name_must_be_non_local: bool, doc_strip_regex: Option, } +#[derive(Debug)] +struct LocalDef<'a> { + name: &'a [u8], + value_range: Range, +} + +#[derive(Debug)] +struct LocalScope<'a> { + inherits: bool, + range: Range, + local_defs: Vec>, +} + struct TagsIter<'a, I> where I: Iterator>, @@ -71,19 +88,20 @@ where _tree: Tree, source: &'a [u8], config: &'a TagsConfiguration, - tag_queue: Vec<(Node<'a>, usize, Tag)>, + tag_queue: Vec<(Tag, usize)>, + scopes: Vec>, } impl TagsConfiguration { pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result { - let query = Query::new(language, &format!("{}{}", tags_query, locals_query))?; + let query = Query::new(language, &format!("{}{}", locals_query, tags_query))?; - let locals_query_offset = tags_query.len(); - let mut locals_pattern_index = 0; + let tags_query_offset = locals_query.len(); + let mut tags_pattern_index = 0; for i in 0..(query.pattern_count()) { let pattern_offset = query.start_byte_for_pattern(i); - if pattern_offset < locals_query_offset { - locals_pattern_index += 1; + if pattern_offset < tags_query_offset { + tags_pattern_index += 1; } } @@ -94,6 +112,8 @@ impl TagsConfiguration { let mut method_capture_index = None; let mut module_capture_index = None; let mut name_capture_index = None; + let mut local_scope_capture_index = None; + let mut local_definition_capture_index = None; for (i, name) in query.capture_names().iter().enumerate() { let index = match name.as_str() { "call" => &mut call_capture_index, @@ -103,6 +123,8 @@ impl TagsConfiguration { "method" => &mut method_capture_index, 
"module" => &mut module_capture_index, "name" => &mut name_capture_index, + "local.scope" => &mut local_scope_capture_index, + "local.definition" => &mut local_definition_capture_index, _ => continue, }; *index = Some(i as u32); @@ -111,6 +133,22 @@ impl TagsConfiguration { let pattern_info = (0..query.pattern_count()) .map(|pattern_index| { let mut info = PatternInfo::default(); + for (property, is_positive) in query.property_predicates(pattern_index) { + if !is_positive && property.key.as_ref() == "local" { + info.name_must_be_non_local = true; + } + } + info.local_scope_inherits = true; + for property in query.property_settings(pattern_index) { + if property.key.as_ref() == "local.scope-inherits" + && property + .value + .as_ref() + .map_or(false, |v| v.as_ref() == "false") + { + info.local_scope_inherits = false; + } + } if let Some(doc_capture_index) = doc_capture_index { for predicate in query.general_predicates(pattern_index) { if predicate.args.get(0) @@ -143,8 +181,10 @@ impl TagsConfiguration { doc_capture_index, call_capture_index, name_capture_index, + tags_pattern_index, + local_scope_capture_index, + local_definition_capture_index, pattern_info, - _locals_pattern_index: locals_pattern_index, }) } } @@ -179,11 +219,16 @@ impl TagsContext { &source[node.byte_range()] }); TagsIter { + _tree: tree, matches, source, config, tag_queue: Vec::new(), - _tree: tree, + scopes: vec![LocalScope { + range: 0..source.len(), + inherits: false, + local_defs: Vec::new(), + }], } } } @@ -200,15 +245,41 @@ where // it off of the queue and return it. if let Some(last_entry) = self.tag_queue.last() { if self.tag_queue.len() > 1 - && self.tag_queue[0].0.end_byte() < last_entry.0.start_byte() + && self.tag_queue[0].0.name_range.end < last_entry.0.name_range.start { - return Some(self.tag_queue.remove(0).2); + return Some(self.tag_queue.remove(0).0); } } // If there is another match, then compute its tag and add it to the // tag queue. 
if let Some(mat) = self.matches.next() { + let pattern_info = &self.config.pattern_info[mat.pattern_index]; + + if mat.pattern_index < self.config.tags_pattern_index { + for capture in mat.captures { + let index = Some(capture.index); + let range = capture.node.byte_range(); + if index == self.config.local_scope_capture_index { + self.scopes.push(LocalScope { + range, + inherits: pattern_info.local_scope_inherits, + local_defs: Vec::new(), + }); + } else if index == self.config.local_definition_capture_index { + if let Some(scope) = self.scopes.iter_mut().rev().find(|scope| { + scope.range.start <= range.start && scope.range.end >= range.end + }) { + scope.local_defs.push(LocalDef { + name: &self.source[range.clone()], + value_range: range, + }); + } + } + } + continue; + } + let mut name_range = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; @@ -245,6 +316,30 @@ where } if let (Some(tag_node), Some(name_range)) = (tag_node, name_range) { + if pattern_info.name_must_be_non_local { + let mut is_local = false; + for scope in self.scopes.iter().rev() { + if scope.range.start <= name_range.start + && scope.range.end >= name_range.end + { + if scope + .local_defs + .iter() + .any(|d| d.name == &self.source[name_range.clone()]) + { + is_local = true; + break; + } + if !scope.inherits { + break; + } + } + } + if is_local { + continue; + } + } + // If needed, filter the doc nodes based on their ranges, selecting // only the slice that are adjacent to some specified node. let mut docs_start_index = 0; @@ -269,9 +364,7 @@ where let mut docs = None; for doc_node in &doc_nodes[docs_start_index..] 
{ if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) { - let content = if let Some(regex) = - &self.config.pattern_info[mat.pattern_index].doc_strip_regex - { + let content = if let Some(regex) = &pattern_info.doc_strip_regex { regex.replace_all(content, "").to_string() } else { content.to_string() @@ -289,12 +382,13 @@ where // Only create one tag per node. The tag queue is sorted by node position // to allow for fast lookup. let range = tag_node.byte_range(); - match self.tag_queue.binary_search_by_key( - &(range.end, range.start, tag_node.id()), - |(node, _, _)| (node.end_byte(), node.start_byte(), node.id()), - ) { + match self + .tag_queue + .binary_search_by_key(&(name_range.end, name_range.start), |(tag, _)| { + (tag.name_range.end, tag.name_range.start) + }) { Ok(i) => { - let (_, pattern_index, tag) = &mut self.tag_queue[i]; + let (tag, pattern_index) = &mut self.tag_queue[i]; if *pattern_index > mat.pattern_index { *pattern_index = mat.pattern_index; *tag = Tag { @@ -310,8 +404,6 @@ where Err(i) => self.tag_queue.insert( i, ( - tag_node, - mat.pattern_index, Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), span: tag_node.start_position()..tag_node.start_position(), @@ -320,6 +412,7 @@ where name_range, docs, }, + mat.pattern_index, ), ), } @@ -327,7 +420,7 @@ where } // If there are no more matches, then drain the queue. 
else if !self.tag_queue.is_empty() { - return Some(self.tag_queue.remove(0).2); + return Some(self.tag_queue.remove(0).0); } else { return None; } From 9665f1ba70765719b2a5871e0feacbff2db2d1f2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Mar 2020 10:29:20 -0700 Subject: [PATCH 33/42] Create tags readme --- tags/README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tags/README.md diff --git a/tags/README.md b/tags/README.md new file mode 100644 index 00000000..7a55c254 --- /dev/null +++ b/tags/README.md @@ -0,0 +1,60 @@ +Tree-sitter Tags +========================= + +### Usage + +Compile some languages into your app, and declare them: + +```rust +extern "C" tree_sitter_python(); +extern "C" tree_sitter_javascript(); +``` + +Create a tag context. You need one of these for each thread that you're using for tag computation: + +```rust +use tree_sitter_tags::TagsContext; + +let context = TagsContext::new(); +``` + +Load some tagging queries from the `queries` directory of some language repositories: + +```rust +use tree_sitter_highlight::TagsConfiguration; + +let python_language = unsafe { tree_sitter_python() }; +let javascript_language = unsafe { tree_sitter_javascript() }; + +let python_config = HighlightConfiguration::new( + python_language, + &fs::read_to_string("./tree-sitter-python/queries/tags.scm").unwrap(), + &fs::read_to_string("./tree-sitter-python/queries/locals.scm").unwrap(), +).unwrap(); + +let javascript_config = HighlightConfiguration::new( + javascript_language, + &fs::read_to_string("./tree-sitter-javascript/queries/tags.scm").unwrap(), + &fs::read_to_string("./tree-sitter-javascript/queries/locals.scm").unwrap(), +).unwrap(); +``` + +Compute code navigation tags for some source code: + +```rust +use tree_sitter_highlight::HighlightEvent; + +let tags = context.generate_tags( + &javascript_config, + b"class A { getB() { return c(); } }", + None, + |_| None +); + +for tag in 
tags { + println!("kind: {:?}", tag.kind); + println!("range: {:?}", tag.range); + println!("name_range: {:?}", tag.name_range); + println!("docs: {:?}", tag.docs); +} +``` From ae1c51051acebfa01c38c54f1555c27f8574fed9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Mar 2020 11:34:33 -0700 Subject: [PATCH 34/42] Fix tag order in JS tags test --- cli/src/tests/tags_test.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 756f63e7..1b6cb2f3 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -136,16 +136,16 @@ fn test_tags_javascript() { .map(|t| (substr(source, &t.name_range), t.kind)) .collect::>(), &[ - ("getAge", TagKind::Method), ("Customer", TagKind::Class), + ("getAge", TagKind::Method), ("Agent", TagKind::Class) ] ); - assert_eq!(tags[0].docs.as_ref().unwrap(), "Get the customer's age"); assert_eq!( - tags[1].docs.as_ref().unwrap(), + tags[0].docs.as_ref().unwrap(), "Data about a customer.\nbla bla bla" ); + assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); assert_eq!(tags[2].docs, None); } From a003e5f6bd2c13685281beaef42b932929e1bc54 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Mar 2020 11:35:11 -0700 Subject: [PATCH 35/42] generate: Avoid duplicate string tokens in unique symbol map --- cli/src/generate/render.rs | 17 ++++++++++++++++- cli/src/tests/query_test.rs | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 824c3bcf..e8c59d07 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -325,12 +325,13 @@ impl Generator { add_line!(self, "static TSSymbol ts_symbol_map[] = {{"); indent!(self); for symbol in &self.parse_table.symbols { + let mut mapping = symbol; + // There can be multiple symbols in the grammar that have the same name and kind, // due to simple aliases. 
When that happens, ensure that they map to the same // public-facing symbol. If one of the symbols is not aliased, choose that one // to be the public-facing symbol. Otherwise, pick the symbol with the lowest // numeric value. - let mut mapping = symbol; if let Some(alias) = self.simple_aliases.get(symbol) { let kind = alias.kind(); for other_symbol in &self.parse_table.symbols { @@ -344,6 +345,20 @@ impl Generator { } } } + // Two anonymous tokens with different flags but the same string value + // should be represented with the same symbol in the public API. Examples: + // * "<" and token(prec(1, "<")) + // * "(" and token.immediate("(") + else if symbol.is_terminal() { + let metadata = self.metadata_for_symbol(*symbol); + for other_symbol in &self.parse_table.symbols { + let other_metadata = self.metadata_for_symbol(*other_symbol); + if other_metadata == metadata { + mapping = other_symbol; + break; + } + } + } add_line!( self, diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 0daa4d5a..f69074a8 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -491,7 +491,7 @@ fn test_query_matches_with_wildcard_at_the_root() { } #[test] -fn test_query_with_immediate_siblings() { +fn test_query_matches_with_immediate_siblings() { allocations::record(|| { let language = get_language("python"); @@ -677,6 +677,41 @@ fn test_query_matches_in_language_with_simple_aliases() { }); } +#[test] +fn test_query_matches_with_different_tokens_with_the_same_string_value() { + allocations::record(|| { + let language = get_language("rust"); + let query = Query::new( + language, + r#" + "<" @less + ">" @greater + "#, + ) + .unwrap(); + + // In Rust, there are two '<' tokens: one for the binary operator, + // and one with higher precedence for generics. 
+ let source = "const A: B = d < e || f > g;"; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + let mut cursor = QueryCursor::new(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(source)); + + assert_eq!( + collect_matches(matches, &query, source), + &[ + (0, vec![("less", "<")]), + (1, vec![("greater", ">")]), + (0, vec![("less", "<")]), + (1, vec![("greater", ">")]), + ] + ); + }); +} + #[test] fn test_query_matches_with_too_many_permutations_to_track() { allocations::record(|| { From 59c457c5cffb01f15cbfa64e8d347defb7782e0f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 20 Mar 2020 11:35:27 -0700 Subject: [PATCH 36/42] tags: Fix typo in tag ranges --- cli/src/tags.rs | 2 +- tags/src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 86eb2e33..c65d5479 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -39,7 +39,7 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> for tag in context.generate_tags(tags_config, &source) { write!( &mut stdout, - " {:<8}\t{:<40}\t{:>9}-{:<9}", + " {:<8} {:<40}\t{:>9}-{:<9}", tag.kind, str::from_utf8(&source[tag.name_range]).unwrap_or(""), tag.span.start, diff --git a/tags/src/lib.rs b/tags/src/lib.rs index c3a52303..566efe52 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -393,7 +393,7 @@ where *pattern_index = mat.pattern_index; *tag = Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.start_position(), + span: tag_node.start_position()..tag_node.end_position(), kind, range, name_range, @@ -406,7 +406,7 @@ where ( Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.start_position(), + span: tag_node.start_position()..tag_node.end_position(), kind, range, name_range, From 
9f0bd33429782d268c8c3a114b0176007a8f25ce Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 23 Mar 2020 11:52:24 -0700 Subject: [PATCH 37/42] Add missing no_mangle attribute on ts_tagger_delete --- tags/src/c_lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 83ef9c5f..a4e54151 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -65,6 +65,7 @@ pub extern "C" fn ts_tagger_new() -> *mut TSTagger { })) } +#[no_mangle] pub extern "C" fn ts_tagger_delete(this: *mut TSTagger) { drop(unsafe { Box::from_raw(this) }) } From ae075e75f08482903be0bdb0cd2b812a6aaf46db Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 23 Mar 2020 14:29:29 -0700 Subject: [PATCH 38/42] tags: Avoid returning garbage pointer when length is zero --- tags/include/tree_sitter/tags.h | 4 ++-- tags/src/c_lib.rs | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 6054edc4..946dc6f1 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -36,8 +36,8 @@ typedef struct { uint32_t line_end_byte; TSPoint start_point; TSPoint end_point; - const char *docs; - uint32_t docs_length; + uint32_t docs_start_byte; + uint32_t docs_end_byte; } TSTag; typedef struct TSTagger TSTagger; diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index a4e54151..0d61fb46 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -2,7 +2,7 @@ use super::{Error, TagKind, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; -use std::{fmt, slice, str}; +use std::{fmt, ptr, slice, str}; use tree_sitter::Language; #[repr(C)] @@ -92,6 +92,7 @@ pub extern "C" fn ts_tagger_add_language( Ok(e) => e, Err(_) => return TSTagsError::InvalidUtf8, }; + match TagsConfiguration::new(language, tags_query, locals_query) { Ok(c) => { tagger.languages.insert(scope_name.to_string(), c); @@ -114,6 
+115,7 @@ pub extern "C" fn ts_tagger_tag( let tagger = unwrap_mut_ptr(this); let buffer = unwrap_mut_ptr(output); let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; + if let Some(config) = tagger.languages.get(scope_name) { buffer.tags.clear(); buffer.docs.clear(); @@ -174,7 +176,11 @@ pub extern "C" fn ts_tags_buffer_delete(this: *mut TSTagsBuffer) { #[no_mangle] pub extern "C" fn ts_tags_buffer_tags(this: *const TSTagsBuffer) -> *const TSTag { let buffer = unwrap_ptr(this); - buffer.tags.as_ptr() + if buffer.tags.is_empty() { + ptr::null() + } else { + buffer.tags.as_ptr() + } } #[no_mangle] @@ -186,7 +192,11 @@ pub extern "C" fn ts_tags_buffer_tags_len(this: *const TSTagsBuffer) -> u32 { #[no_mangle] pub extern "C" fn ts_tags_buffer_docs(this: *const TSTagsBuffer) -> *const i8 { let buffer = unwrap_ptr(this); - buffer.docs.as_ptr() as *const i8 + if buffer.docs.is_empty() { + ptr::null() + } else { + buffer.docs.as_ptr() as *const i8 + } } #[no_mangle] From 783c087aecf9f2bffd57abd5f4562fc9d108f00e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 25 Mar 2020 11:26:52 -0700 Subject: [PATCH 39/42] tags: Handle cancellation --- cli/src/highlight.rs | 22 +++------------- cli/src/tags.rs | 5 +++- cli/src/tests/tags_test.rs | 53 +++++++++++++++++++++++++++++++++----- cli/src/util.rs | 19 +++++++++++++- tags/src/c_lib.rs | 28 ++++++++++++++++++-- tags/src/lib.rs | 45 ++++++++++++++++++++++---------- 6 files changed, 130 insertions(+), 42 deletions(-) diff --git a/cli/src/highlight.rs b/cli/src/highlight.rs index c80e6083..c6b1193d 100644 --- a/cli/src/highlight.rs +++ b/cli/src/highlight.rs @@ -1,3 +1,4 @@ +use super::util; use crate::error::Result; use crate::loader::Loader; use ansi_term::Color; @@ -6,10 +7,8 @@ use serde::ser::SerializeMap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_json::{json, Value}; use std::collections::HashMap; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; use 
std::time::Instant; -use std::{fs, io, path, str, thread, usize}; +use std::{fs, io, path, str, usize}; use tree_sitter_highlight::{HighlightConfiguration, HighlightEvent, Highlighter, HtmlRenderer}; pub const HTML_HEADER: &'static str = " @@ -273,19 +272,6 @@ fn color_to_css(color: Color) -> &'static str { } } -fn cancel_on_stdin() -> Arc { - let result = Arc::new(AtomicUsize::new(0)); - thread::spawn({ - let flag = result.clone(); - move || { - let mut line = String::new(); - io::stdin().read_line(&mut line).unwrap(); - flag.store(1, Ordering::Relaxed); - } - }); - result -} - pub fn ansi( loader: &Loader, theme: &Theme, @@ -296,7 +282,7 @@ pub fn ansi( let stdout = io::stdout(); let mut stdout = stdout.lock(); let time = Instant::now(); - let cancellation_flag = cancel_on_stdin(); + let cancellation_flag = util::cancel_on_stdin(); let mut highlighter = Highlighter::new(); let events = highlighter.highlight(config, source, Some(&cancellation_flag), |string| { @@ -341,7 +327,7 @@ pub fn html( let stdout = io::stdout(); let mut stdout = stdout.lock(); let time = Instant::now(); - let cancellation_flag = cancel_on_stdin(); + let cancellation_flag = util::cancel_on_stdin(); let mut highlighter = Highlighter::new(); let events = highlighter.highlight(config, source, Some(&cancellation_flag), |string| { diff --git a/cli/src/tags.rs b/cli/src/tags.rs index c65d5479..d6704ec5 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -1,4 +1,5 @@ use super::loader::Loader; +use super::util; use crate::error::{Error, Result}; use std::io::{self, Write}; use std::path::Path; @@ -15,6 +16,7 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> } let mut context = TagsContext::new(); + let cancellation_flag = util::cancel_on_stdin(); let stdout = io::stdout(); let mut stdout = stdout.lock(); @@ -36,7 +38,8 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; 
let source = fs::read(path)?; - for tag in context.generate_tags(tags_config, &source) { + for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? { + let tag = tag?; write!( &mut stdout, " {:<8} {:<40}\t{:>9}-{:<9}", diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 1b6cb2f3..41907a3c 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -3,7 +3,7 @@ use super::helpers::fixtures::{get_language, get_language_queries_path}; use std::ffi::CString; use std::{fs, ptr, slice, str}; use tree_sitter_tags::c_lib as c; -use tree_sitter_tags::{TagKind, TagsConfiguration, TagsContext}; +use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext}; const PYTHON_TAG_QUERY: &'static str = r#" ((function_definition @@ -79,8 +79,10 @@ fn test_tags_python() { "#; let tags = tag_context - .generate_tags(&tags_config, source) - .collect::>(); + .generate_tags(&tags_config, source, None) + .unwrap() + .collect::, _>>() + .unwrap(); assert_eq!( tags.iter() @@ -128,8 +130,10 @@ fn test_tags_javascript() { let mut tag_context = TagsContext::new(); let tags = tag_context - .generate_tags(&tags_config, source) - .collect::>(); + .generate_tags(&tags_config, source, None) + .unwrap() + .collect::, _>>() + .unwrap(); assert_eq!( tags.iter() @@ -178,8 +182,10 @@ fn test_tags_ruby() { let mut tag_context = TagsContext::new(); let tags = tag_context - .generate_tags(&tags_config, source.as_bytes()) - .collect::>(); + .generate_tags(&tags_config, source.as_bytes(), None) + .unwrap() + .collect::, _>>() + .unwrap(); assert_eq!( tags.iter() @@ -201,6 +207,39 @@ fn test_tags_ruby() { ); } +#[test] +fn test_tags_cancellation() { + use std::sync::atomic::{AtomicUsize, Ordering}; + + allocations::record(|| { + // Large javascript document + let source = (0..500) + .map(|_| "/* hi */ class A { /* ok */ b() {} }\n") + .collect::(); + + let cancellation_flag = AtomicUsize::new(0); + let language = 
get_language("javascript"); + let tags_config = TagsConfiguration::new(language, JS_TAG_QUERY, "").unwrap(); + + let mut tag_context = TagsContext::new(); + let tags = tag_context + .generate_tags(&tags_config, source.as_bytes(), Some(&cancellation_flag)) + .unwrap(); + + for (i, tag) in tags.enumerate() { + if i == 150 { + cancellation_flag.store(1, Ordering::SeqCst); + } + if let Err(e) = tag { + assert_eq!(e, Error::Cancelled); + return; + } + } + + panic!("Expected to halt tagging with an error"); + }); +} + #[test] fn test_tags_via_c_api() { allocations::record(|| { diff --git a/cli/src/util.rs b/cli/src/util.rs index e880bea1..8978ecc1 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -1,12 +1,29 @@ +use std::io; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::thread; +use tree_sitter::Parser; + #[cfg(unix)] use std::path::PathBuf; #[cfg(unix)] use std::process::{Child, ChildStdin, Command, Stdio}; -use tree_sitter::Parser; #[cfg(unix)] const HTML_HEADER: &[u8] = b"\n\n\n"; +pub fn cancel_on_stdin() -> Arc { + let result = Arc::new(AtomicUsize::new(0)); + thread::spawn({ + let flag = result.clone(); + move || { + let mut line = String::new(); + io::stdin().read_line(&mut line).unwrap(); + flag.store(1, Ordering::Relaxed); + } + }); + result +} #[cfg(windows)] pub struct LogSession(); diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 0d61fb46..df785aa7 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -2,6 +2,7 @@ use super::{Error, TagKind, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; +use std::sync::atomic::AtomicUsize; use std::{fmt, ptr, slice, str}; use tree_sitter::Language; @@ -15,6 +16,7 @@ pub enum TSTagsError { InvalidUtf8, InvalidRegex, InvalidQuery, + Unknown, } #[repr(C)] @@ -100,6 +102,7 @@ pub extern "C" fn ts_tagger_add_language( } Err(Error::Query(_)) => TSTagsError::InvalidQuery, Err(Error::Regex(_)) => TSTagsError::InvalidRegex, 
+ Err(_) => TSTagsError::Unknown, } } @@ -110,7 +113,7 @@ pub extern "C" fn ts_tagger_tag( source_code: *const u8, source_code_len: u32, output: *mut TSTagsBuffer, - cancellation_flag: *const usize, + cancellation_flag: *const AtomicUsize, ) -> TSTagsError { let tagger = unwrap_mut_ptr(this); let buffer = unwrap_mut_ptr(output); @@ -120,8 +123,29 @@ pub extern "C" fn ts_tagger_tag( buffer.tags.clear(); buffer.docs.clear(); let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) }; + let cancellation_flag = unsafe { cancellation_flag.as_ref() }; + + let tags = match buffer + .context + .generate_tags(config, source_code, cancellation_flag) + { + Ok(tags) => tags, + Err(e) => { + return match e { + Error::InvalidLanguage => TSTagsError::InvalidLanguage, + Error::Cancelled => TSTagsError::Timeout, + _ => TSTagsError::Timeout, + } + } + }; + + for tag in tags { + let tag = if let Ok(tag) = tag { + tag + } else { + return TSTagsError::Timeout; + }; - for tag in buffer.context.generate_tags(config, source_code) { let prev_docs_len = buffer.docs.len(); if let Some(docs) = tag.docs { buffer.docs.extend_from_slice(docs.as_bytes()); diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 566efe52..c3642c8f 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -3,12 +3,14 @@ pub mod c_lib; use memchr::{memchr, memrchr}; use regex::Regex; use std::ops::Range; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::{fmt, mem, str}; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; const MAX_LINE_LEN: usize = 180; +const CANCELLATION_CHECK_INTERVAL: usize = 100; /// Contains the data neeeded to compute tags for code written in a /// particular language. 
@@ -53,10 +55,12 @@ pub enum TagKind { Call, } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Error { Query(QueryError), Regex(regex::Error), + Cancelled, + InvalidLanguage, } #[derive(Debug, Default)] @@ -88,6 +92,8 @@ where _tree: Tree, source: &'a [u8], config: &'a TagsConfiguration, + cancellation_flag: Option<&'a AtomicUsize>, + iter_count: usize, tag_queue: Vec<(Tag, usize)>, scopes: Vec>, } @@ -201,14 +207,13 @@ impl TagsContext { &'a mut self, config: &'a TagsConfiguration, source: &'a [u8], - ) -> impl Iterator + 'a { + cancellation_flag: Option<&'a AtomicUsize>, + ) -> Result> + 'a, Error> { self.parser .set_language(config.language) - .expect("Incompatible language"); - let tree = self - .parser - .parse(source, None) - .expect("Parsing failed unexpectedly"); + .map_err(|_| Error::InvalidLanguage)?; + unsafe { self.parser.set_cancellation_flag(cancellation_flag) }; + let tree = self.parser.parse(source, None).ok_or(Error::Cancelled)?; // The `matches` iterator borrows the `Tree`, which prevents it from being moved. // But the tree is really just a pointer, so it's actually ok to move it. @@ -218,18 +223,20 @@ impl TagsContext { .matches(&config.query, tree_ref.root_node(), move |node| { &source[node.byte_range()] }); - TagsIter { + Ok(TagsIter { _tree: tree, matches, source, config, + cancellation_flag, tag_queue: Vec::new(), + iter_count: 0, scopes: vec![LocalScope { range: 0..source.len(), inherits: false, local_defs: Vec::new(), }], - } + }) } } @@ -237,17 +244,29 @@ impl<'a, I> Iterator for TagsIter<'a, I> where I: Iterator>, { - type Item = Tag; + type Item = Result; - fn next(&mut self) -> Option { + fn next(&mut self) -> Option { loop { + // Periodically check for cancellation, returning `Cancelled` error if the + // cancellation flag was flipped. 
+ if let Some(cancellation_flag) = self.cancellation_flag { + self.iter_count += 1; + if self.iter_count >= CANCELLATION_CHECK_INTERVAL { + self.iter_count = 0; + if cancellation_flag.load(Ordering::Relaxed) != 0 { + return Some(Err(Error::Cancelled)); + } + } + } + // If there is a queued tag for an earlier node in the syntax tree, then pop // it off of the queue and return it. if let Some(last_entry) = self.tag_queue.last() { if self.tag_queue.len() > 1 && self.tag_queue[0].0.name_range.end < last_entry.0.name_range.start { - return Some(self.tag_queue.remove(0).0); + return Some(Ok(self.tag_queue.remove(0).0)); } } @@ -420,7 +439,7 @@ where } // If there are no more matches, then drain the queue. else if !self.tag_queue.is_empty() { - return Some(self.tag_queue.remove(0).0); + return Some(Ok(self.tag_queue.remove(0).0)); } else { return None; } From 9dde6c44ed2ad399dc38a969bf9931d56ddfb5e6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 25 Mar 2020 12:20:02 -0700 Subject: [PATCH 40/42] tags: Always return non-null pointers from C APIs --- tags/src/c_lib.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index df785aa7..1c255da0 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; use std::sync::atomic::AtomicUsize; -use std::{fmt, ptr, slice, str}; +use std::{fmt, slice, str}; use tree_sitter::Language; #[repr(C)] @@ -187,8 +187,8 @@ pub extern "C" fn ts_tagger_tag( pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { Box::into_raw(Box::new(TSTagsBuffer { context: TagsContext::new(), - tags: Vec::new(), - docs: Vec::new(), + tags: Vec::with_capacity(64), + docs: Vec::with_capacity(64), })) } @@ -200,11 +200,7 @@ pub extern "C" fn ts_tags_buffer_delete(this: *mut TSTagsBuffer) { #[no_mangle] pub extern "C" fn ts_tags_buffer_tags(this: *const TSTagsBuffer) -> *const TSTag { let 
buffer = unwrap_ptr(this); - if buffer.tags.is_empty() { - ptr::null() - } else { - buffer.tags.as_ptr() - } + buffer.tags.as_ptr() } #[no_mangle] @@ -216,11 +212,7 @@ pub extern "C" fn ts_tags_buffer_tags_len(this: *const TSTagsBuffer) -> u32 { #[no_mangle] pub extern "C" fn ts_tags_buffer_docs(this: *const TSTagsBuffer) -> *const i8 { let buffer = unwrap_ptr(this); - if buffer.docs.is_empty() { - ptr::null() - } else { - buffer.docs.as_ptr() as *const i8 - } + buffer.docs.as_ptr() as *const i8 } #[no_mangle] From 4dc82d8b8b9ec10716865eecca88d9d1405d13d2 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 25 Mar 2020 12:47:45 -0700 Subject: [PATCH 41/42] Remove unused serde dependency from highlight crate --- Cargo.lock | 3 --- highlight/Cargo.toml | 3 --- 2 files changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c298eed..d5ea9e15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -776,9 +776,6 @@ name = "tree-sitter-highlight" version = "0.1.6" dependencies = [ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_json 1.0.33 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter 0.6.3", ] diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index b0d32c02..94a4e032 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -18,9 +18,6 @@ crate-type = ["lib", "staticlib"] [dependencies] regex = "1" -serde = "1.0" -serde_json = "1.0" -serde_derive = "1.0" [dependencies.tree-sitter] version = ">= 0.3.7" From 322b311c2c2db820d9ca923e8035c85fa1a28340 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 26 Mar 2020 16:10:39 -0700 Subject: [PATCH 42/42] Clear QueryCursor state between exec calls --- lib/src/query.c | 3 +++ tags/src/c_lib.rs | 2 ++ tags/src/lib.rs | 1 + 3 files changed, 6 insertions(+) diff --git a/lib/src/query.c b/lib/src/query.c 
index 20c44fbb..87ab05b5 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -248,6 +248,9 @@ static CaptureListPool capture_list_pool_new() { static void capture_list_pool_reset(CaptureListPool *self) { self->usage_map = UINT32_MAX; + for (unsigned i = 0; i < 32; i++) { + array_clear(&self->list[i]); + } } static void capture_list_pool_delete(CaptureListPool *self) { diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 1c255da0..0c367977 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -143,6 +143,8 @@ pub extern "C" fn ts_tagger_tag( let tag = if let Ok(tag) = tag { tag } else { + buffer.tags.clear(); + buffer.docs.clear(); return TSTagsError::Timeout; }; diff --git a/tags/src/lib.rs b/tags/src/lib.rs index c3642c8f..8d1853bb 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -212,6 +212,7 @@ impl TagsContext { self.parser .set_language(config.language) .map_err(|_| Error::InvalidLanguage)?; + self.parser.reset(); unsafe { self.parser.set_cancellation_flag(cancellation_flag) }; let tree = self.parser.parse(source, None).ok_or(Error::Cancelled)?;