From fa199e3a1a1f300e6acabe3546e92ba180167f65 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 16 Jun 2020 16:04:02 -0700 Subject: [PATCH 01/71] Allow most tags to be arbitrarily named, remove hardcoded kinds --- tags/src/lib.rs | 104 ++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 69 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 8d1853bb..296ac9ba 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -4,7 +4,8 @@ use memchr::{memchr, memrchr}; use regex::Regex; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{fmt, mem, str}; +use std::{mem, str}; +use std::collections::HashMap; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; @@ -18,12 +19,8 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100; pub struct TagsConfiguration { pub language: Language, pub query: Query, - call_capture_index: Option, - class_capture_index: Option, + capture_map: HashMap, doc_capture_index: Option, - function_capture_index: Option, - method_capture_index: Option, - module_capture_index: Option, name_capture_index: Option, local_scope_capture_index: Option, local_definition_capture_index: Option, @@ -38,21 +35,13 @@ pub struct TagsContext { #[derive(Debug, Clone)] pub struct Tag { - pub kind: TagKind, pub range: Range, pub name_range: Range, pub line_range: Range, pub span: Range, pub docs: Option, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum TagKind { - Function, - Method, - Class, - Module, - Call, + pub is_definition: bool, + pub kind: String, } #[derive(Debug, PartialEq)] @@ -111,29 +100,23 @@ impl TagsConfiguration { } } - let mut call_capture_index = None; - let mut class_capture_index = None; + let mut capture_map: HashMap = HashMap::new(); let mut doc_capture_index = None; - let mut function_capture_index = None; - let mut method_capture_index = None; - let mut module_capture_index = None; let mut name_capture_index = 
None; let mut local_scope_capture_index = None; let mut local_definition_capture_index = None; for (i, name) in query.capture_names().iter().enumerate() { - let index = match name.as_str() { - "call" => &mut call_capture_index, - "class" => &mut class_capture_index, - "doc" => &mut doc_capture_index, - "function" => &mut function_capture_index, - "method" => &mut method_capture_index, - "module" => &mut module_capture_index, - "name" => &mut name_capture_index, - "local.scope" => &mut local_scope_capture_index, - "local.definition" => &mut local_definition_capture_index, - _ => continue, - }; - *index = Some(i as u32); + match name.as_str() { + "" => continue, + "name" => name_capture_index = Some(i as u32), + "doc" => doc_capture_index = Some(i as u32), + "local.scope" => local_scope_capture_index = Some(i as u32), + "local.definition" => local_definition_capture_index = Some(i as u32), + _ => { + capture_map.insert(i as u32, name.to_string()); + continue; + } + } } let pattern_info = (0..query.pattern_count()) @@ -180,12 +163,8 @@ impl TagsConfiguration { Ok(TagsConfiguration { language, query, - function_capture_index, - class_capture_index, - method_capture_index, - module_capture_index, + capture_map, doc_capture_index, - call_capture_index, name_capture_index, tags_pattern_index, local_scope_capture_index, @@ -303,7 +282,8 @@ where let mut name_range = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; - let mut kind = TagKind::Call; + let mut kind = "unknown"; + let mut is_definition = false; let mut docs_adjacent_node = None; for capture in mat.captures { @@ -317,21 +297,18 @@ where name_range = Some(capture.node.byte_range()); } else if index == self.config.doc_capture_index { doc_nodes.push(capture.node); - } else if index == self.config.call_capture_index { + } + + if let Some(name) = self.config.capture_map.get(&capture.index) { tag_node = Some(capture.node); - kind = TagKind::Call; - } else if index == self.config.class_capture_index { - 
tag_node = Some(capture.node); - kind = TagKind::Class; - } else if index == self.config.function_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Function; - } else if index == self.config.method_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Method; - } else if index == self.config.module_capture_index { - tag_node = Some(capture.node); - kind = TagKind::Module; + kind = if name.starts_with("definition.") { + is_definition = true; + name.trim_start_matches("definition.") + } else if name.starts_with("reference.") { + name.trim_start_matches("reference.") + } else { + name + } } } @@ -414,10 +391,11 @@ where *tag = Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), span: tag_node.start_position()..tag_node.end_position(), - kind, range, name_range, docs, + kind: kind.to_string(), + is_definition, }; } } @@ -427,10 +405,11 @@ where Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), span: tag_node.start_position()..tag_node.end_position(), - kind, range, name_range, docs, + kind: kind.to_string(), + is_definition, }, mat.pattern_index, ), @@ -448,19 +427,6 @@ where } } -impl fmt::Display for TagKind { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - TagKind::Call => "Call", - TagKind::Module => "Module", - TagKind::Class => "Class", - TagKind::Method => "Method", - TagKind::Function => "Function", - } - .fmt(f) - } -} - impl From for Error { fn from(error: regex::Error) -> Self { Error::Regex(error) From 8d7459ed578b8f66bde36624c3f91e40d54d79a2 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 16 Jun 2020 16:04:13 -0700 Subject: [PATCH 02/71] Bring c_lib inline --- tags/src/c_lib.rs | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 0c367977..c8ca8ed5 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -1,4 +1,4 @@ -use super::{Error, TagKind, TagsConfiguration, 
TagsContext}; +use super::{Error, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; @@ -19,16 +19,6 @@ pub enum TSTagsError { Unknown, } -#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum TSTagKind { - Function, - Method, - Class, - Module, - Call, -} - #[repr(C)] pub struct TSPoint { row: u32, @@ -37,7 +27,6 @@ pub struct TSPoint { #[repr(C)] pub struct TSTag { - pub kind: TSTagKind, pub start_byte: u32, pub end_byte: u32, pub name_start_byte: u32, @@ -48,6 +37,8 @@ pub struct TSTag { pub end_point: TSPoint, pub docs_start_byte: u32, pub docs_end_byte: u32, + pub kind: String, + pub is_definition: bool, } pub struct TSTagger { @@ -153,13 +144,6 @@ pub extern "C" fn ts_tagger_tag( buffer.docs.extend_from_slice(docs.as_bytes()); } buffer.tags.push(TSTag { - kind: match tag.kind { - TagKind::Function => TSTagKind::Function, - TagKind::Method => TSTagKind::Method, - TagKind::Class => TSTagKind::Class, - TagKind::Module => TSTagKind::Module, - TagKind::Call => TSTagKind::Call, - }, start_byte: tag.range.start as u32, end_byte: tag.range.end as u32, name_start_byte: tag.name_range.start as u32, @@ -176,6 +160,8 @@ pub extern "C" fn ts_tagger_tag( }, docs_start_byte: prev_docs_len as u32, docs_end_byte: buffer.docs.len() as u32, + kind: tag.kind, + is_definition: tag.is_definition, }); } From 9bf4939b9a1093f6c42d0bdcf268fef8a4e04d8f Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 16 Jun 2020 16:04:22 -0700 Subject: [PATCH 03/71] Show if tag is a def/ref in the cli --- cli/src/tags.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index d6704ec5..6308d396 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -42,9 +42,10 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> let tag = tag?; write!( &mut stdout, - " {:<8} {:<40}\t{:>9}-{:<9}", + " {:<8} {:<40}\t [{}] {:>9}-{:<9}", tag.kind, 
str::from_utf8(&source[tag.name_range]).unwrap_or(""), + if tag.is_definition { "definition" } else { "reference" }, tag.span.start, tag.span.end, )?; From d802b3779145d833dc16e3e075f8e34dd684504a Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 16 Jun 2020 17:09:34 -0700 Subject: [PATCH 04/71] Bring back a SyntaxType enum --- cli/src/tags.rs | 2 +- tags/src/c_lib.rs | 28 ++++++++++++-- tags/src/lib.rs | 98 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 105 insertions(+), 23 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 6308d396..06f4f4fa 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -43,7 +43,7 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> write!( &mut stdout, " {:<8} {:<40}\t [{}] {:>9}-{:<9}", - tag.kind, + tag.syntax_type, str::from_utf8(&source[tag.name_range]).unwrap_or(""), if tag.is_definition { "definition" } else { "reference" }, tag.span.start, diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index c8ca8ed5..72c708d0 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -1,4 +1,4 @@ -use super::{Error, TagsConfiguration, TagsContext}; +use super::{Error, SyntaxType, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; @@ -19,6 +19,19 @@ pub enum TSTagsError { Unknown, } +#[repr(C)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TSSyntaxType { + Function, + Method, + Class, + Module, + Call, + Type, + Interface, + Implementation, +} + #[repr(C)] pub struct TSPoint { row: u32, @@ -37,7 +50,7 @@ pub struct TSTag { pub end_point: TSPoint, pub docs_start_byte: u32, pub docs_end_byte: u32, - pub kind: String, + pub syntax_type: TSSyntaxType, pub is_definition: bool, } @@ -160,7 +173,16 @@ pub extern "C" fn ts_tagger_tag( }, docs_start_byte: prev_docs_len as u32, docs_end_byte: buffer.docs.len() as u32, - kind: tag.kind, + syntax_type: match tag.syntax_type { + SyntaxType::Function => 
TSSyntaxType::Function, + SyntaxType::Method => TSSyntaxType::Method, + SyntaxType::Class => TSSyntaxType::Class, + SyntaxType::Module => TSSyntaxType::Module, + SyntaxType::Call => TSSyntaxType::Call, + SyntaxType::Type => TSSyntaxType::Type, + SyntaxType::Interface => TSSyntaxType::Interface, + SyntaxType::Implementation => TSSyntaxType::Implementation, + }, is_definition: tag.is_definition, }); } diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 296ac9ba..e6179b8b 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -4,7 +4,7 @@ use memchr::{memchr, memrchr}; use regex::Regex; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{mem, str}; +use std::{fmt, mem, str}; use std::collections::HashMap; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, @@ -19,7 +19,7 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100; pub struct TagsConfiguration { pub language: Language, pub query: Query, - capture_map: HashMap, + capture_map: HashMap, doc_capture_index: Option, name_capture_index: Option, local_scope_capture_index: Option, @@ -28,6 +28,27 @@ pub struct TagsConfiguration { pattern_info: Vec, } + +#[derive(Debug)] +pub struct NamedCapture { + pub syntax_type: SyntaxType, + pub is_definition: bool, +} + +// Should stay in sync with list of valid syntax types in semantic. 
+// See: https://github.com/github/semantic/blob/621696f5bc523a651f1cf9fc2ac58c557ea02d07/proto/semantic.proto#L165-L174 +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum SyntaxType { + Function, + Method, + Class, + Module, + Call, + Type, + Interface, + Implementation, +} + pub struct TagsContext { parser: Parser, cursor: QueryCursor, @@ -41,7 +62,7 @@ pub struct Tag { pub span: Range, pub docs: Option, pub is_definition: bool, - pub kind: String, + pub syntax_type: SyntaxType, } #[derive(Debug, PartialEq)] @@ -100,7 +121,7 @@ impl TagsConfiguration { } } - let mut capture_map: HashMap = HashMap::new(); + let mut capture_map: HashMap = HashMap::new(); let mut doc_capture_index = None; let mut name_capture_index = None; let mut local_scope_capture_index = None; @@ -112,9 +133,8 @@ impl TagsConfiguration { "doc" => doc_capture_index = Some(i as u32), "local.scope" => local_scope_capture_index = Some(i as u32), "local.definition" => local_definition_capture_index = Some(i as u32), - _ => { - capture_map.insert(i as u32, name.to_string()); - continue; + _ => if let Some(nc) = NamedCapture::new(name) { + capture_map.insert(i as u32, nc); } } } @@ -282,7 +302,7 @@ where let mut name_range = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; - let mut kind = "unknown"; + let mut syntax_type = SyntaxType::Function; let mut is_definition = false; let mut docs_adjacent_node = None; @@ -299,16 +319,18 @@ where doc_nodes.push(capture.node); } - if let Some(name) = self.config.capture_map.get(&capture.index) { + if let Some(named_capture) = self.config.capture_map.get(&capture.index) { tag_node = Some(capture.node); - kind = if name.starts_with("definition.") { - is_definition = true; - name.trim_start_matches("definition.") - } else if name.starts_with("reference.") { - name.trim_start_matches("reference.") - } else { - name - } + syntax_type = named_capture.syntax_type; + is_definition = named_capture.is_definition; + // kind = if 
name.starts_with("definition.") { + // is_definition = true; + // name.trim_start_matches("definition.") + // } else if name.starts_with("reference.") { + // name.trim_start_matches("reference.") + // } else { + // name + // } } } @@ -394,7 +416,7 @@ where range, name_range, docs, - kind: kind.to_string(), + syntax_type, is_definition, }; } @@ -408,7 +430,7 @@ where range, name_range, docs, - kind: kind.to_string(), + syntax_type, is_definition, }, mat.pattern_index, @@ -427,6 +449,44 @@ where } } +impl NamedCapture { + pub fn new(name: &String) -> Option { + let mut is_definition = false; + + let kind = if name.starts_with("definition.") { + is_definition = true; + name.trim_start_matches("definition.") + } else if name.starts_with("reference.") { + name.trim_start_matches("reference.") + } else { + name + }; + + let syntax_type = match kind.as_ref() { + "function" => {is_definition = true; SyntaxType::Function}, + "method" => {is_definition = true; SyntaxType::Method}, + "class" => SyntaxType::Class, + "module" => SyntaxType::Module, + "call" => SyntaxType::Call, + "type" => SyntaxType::Type, + "interface" => SyntaxType::Interface, + "implementation" => SyntaxType::Implementation, + _ => return None, + }; + + return Some(NamedCapture{ + syntax_type, + is_definition + }) + } +} + +impl fmt::Display for SyntaxType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + impl From for Error { fn from(error: regex::Error) -> Self { Error::Regex(error) From 80f5c522594de99d487aa12a756f369ae48372a3 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Tue, 16 Jun 2020 17:19:35 -0700 Subject: [PATCH 05/71] Tests compile --- cli/src/tests/tags_test.rs | 42 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index fad8ebd8..b6283507 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -3,7 +3,7 @@ use 
super::helpers::fixtures::{get_language, get_language_queries_path}; use std::ffi::CString; use std::{fs, ptr, slice, str}; use tree_sitter_tags::c_lib as c; -use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext}; +use tree_sitter_tags::{Error, SyntaxType, TagsConfiguration, TagsContext}; const PYTHON_TAG_QUERY: &'static str = r#" ( @@ -99,12 +99,12 @@ fn test_tags_python() { assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.kind)) + .map(|t| (substr(source, &t.name_range), t.syntax_type)) .collect::>(), &[ - ("Customer", TagKind::Class), - ("age", TagKind::Function), - ("compute_age", TagKind::Call), + ("Customer", SyntaxType::Class), + ("age", SyntaxType::Function), + ("compute_age", SyntaxType::Call), ] ); @@ -150,12 +150,12 @@ fn test_tags_javascript() { assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.kind)) + .map(|t| (substr(source, &t.name_range), t.syntax_type)) .collect::>(), &[ - ("Customer", TagKind::Class), - ("getAge", TagKind::Method), - ("Agent", TagKind::Class) + ("Customer", SyntaxType::Class), + ("getAge", SyntaxType::Method), + ("Agent", SyntaxType::Class) ] ); assert_eq!( @@ -204,18 +204,18 @@ fn test_tags_ruby() { tags.iter() .map(|t| ( substr(source.as_bytes(), &t.name_range), - t.kind, + t.syntax_type, (t.span.start.row, t.span.start.column), )) .collect::>(), &[ - ("foo", TagKind::Method, (2, 0)), - ("bar", TagKind::Call, (7, 4)), - ("a", TagKind::Call, (7, 8)), - ("b", TagKind::Call, (7, 11)), - ("each", TagKind::Call, (9, 14)), - ("baz", TagKind::Call, (13, 8)), - ("b", TagKind::Call, (13, 15),), + ("foo", SyntaxType::Method, (2, 0)), + ("bar", SyntaxType::Call, (7, 4)), + ("a", SyntaxType::Call, (7, 8)), + ("b", SyntaxType::Call, (7, 11)), + ("each", SyntaxType::Call, (9, 14)), + ("baz", SyntaxType::Call, (13, 8)), + ("b", SyntaxType::Call, (13, 15),), ] ); } @@ -319,7 +319,7 @@ fn test_tags_via_c_api() { assert_eq!( tags.iter() .map(|tag| ( - tag.kind, + tag.syntax_type, 
&source_code[tag.name_start_byte as usize..tag.name_end_byte as usize], &source_code[tag.line_start_byte as usize..tag.line_end_byte as usize], &docs[tag.docs_start_byte as usize..tag.docs_end_byte as usize], @@ -327,18 +327,18 @@ fn test_tags_via_c_api() { .collect::>(), &[ ( - c::TSTagKind::Function, + c::TSSyntaxType::Function, "b", "function b() {", "one\ntwo\nthree" ), ( - c::TSTagKind::Class, + c::TSSyntaxType::Class, "C", "class C extends D {", "four\nfive" ), - (c::TSTagKind::Call, "b", "b(a);", "") + (c::TSSyntaxType::Call, "b", "b(a);", "") ] ); From 929bb40adcb3678b3a229a272222bd3edab62ecf Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 10:34:55 -0700 Subject: [PATCH 06/71] Shorten to def/ref --- cli/src/tags.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 06f4f4fa..4869b8cc 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -45,7 +45,7 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> " {:<8} {:<40}\t [{}] {:>9}-{:<9}", tag.syntax_type, str::from_utf8(&source[tag.name_range]).unwrap_or(""), - if tag.is_definition { "definition" } else { "reference" }, + if tag.is_definition { "def" } else { "ref" }, tag.span.start, tag.span.end, )?; From c08333e0cdbf0cb47253abe1eb856f3f80e4a9ea Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 10:35:07 -0700 Subject: [PATCH 07/71] Defer to debug formatting take 2 --- tags/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index e6179b8b..dd74f833 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -483,7 +483,7 @@ impl NamedCapture { impl fmt::Display for SyntaxType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + format!("{:?}", self).fmt(f) } } From 3e8bf9daceb19c64cf3e84530d62594729000d1a Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 10:35:16 -0700 Subject: [PATCH 
08/71] These are always definitions --- tags/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index dd74f833..991d3cb5 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -465,8 +465,8 @@ impl NamedCapture { let syntax_type = match kind.as_ref() { "function" => {is_definition = true; SyntaxType::Function}, "method" => {is_definition = true; SyntaxType::Method}, - "class" => SyntaxType::Class, - "module" => SyntaxType::Module, + "class" => {is_definition = true; SyntaxType::Class}, + "module" => {is_definition = true; SyntaxType::Module}, "call" => SyntaxType::Call, "type" => SyntaxType::Type, "interface" => SyntaxType::Interface, From 30132c682b22b57d7f42883f2cb8480691182551 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 14:12:14 -0700 Subject: [PATCH 09/71] Bring tags.h inline --- tags/include/tree_sitter/tags.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 946dc6f1..e1ed68bd 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -19,15 +19,17 @@ typedef enum { } TSTagsError; typedef enum { - TSTagKindFunction, - TSTagKindMethod, - TSTagKindClass, - TSTagKindModule, - TSTagKindCall, -} TSTagKind; + TSSyntaxTypeFunction, + TSSyntaxTypeMethod, + TSSyntaxTypeClass, + TSSyntaxTypeModule, + TSSyntaxTypeCall, + TSSyntaxTypeType, + TSSyntaxTypeInterface, + TSSyntaxTypeImplementation, +} TSTagSyntaxType; typedef struct { - TSTagKind kind; uint32_t start_byte; uint32_t end_byte; uint32_t name_start_byte; @@ -38,6 +40,8 @@ typedef struct { TSPoint end_point; uint32_t docs_start_byte; uint32_t docs_end_byte; + TSTagSyntaxType syntax_type; + bool is_definition; } TSTag; typedef struct TSTagger TSTagger; From 15202d0b382a083ffa7d3019eec9348c5c35c7d9 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 15:11:31 -0700 Subject: [PATCH 
10/71] Remove commented code --- tags/src/lib.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 991d3cb5..8cd73457 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -323,14 +323,6 @@ where tag_node = Some(capture.node); syntax_type = named_capture.syntax_type; is_definition = named_capture.is_definition; - // kind = if name.starts_with("definition.") { - // is_definition = true; - // name.trim_start_matches("definition.") - // } else if name.starts_with("reference.") { - // name.trim_start_matches("reference.") - // } else { - // name - // } } } From 3c39b016a4c538d645a7e0f5bdfd476e4588afd9 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 15:11:42 -0700 Subject: [PATCH 11/71] Trim whitespace from tag source lines --- tags/src/lib.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 8cd73457..32eaa0d9 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -495,7 +495,16 @@ fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1); let max_line_len = max_line_len.min(text.len() - start); let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len); - start..end + trim_start(text, start..end) +} + +fn trim_start(text: &[u8], r: Range) -> Range { + for (index, c) in text[r.start..r.end].iter().enumerate() { + if !c.is_ascii_whitespace(){ + return index..r.end + } + } + return r } #[cfg(test)] @@ -514,4 +523,13 @@ mod tests { assert_eq!(line_range(text, 5, 10), 4..8); assert_eq!(line_range(text, 11, 10), 9..14); } + + #[test] + fn test_get_line_trims() { + let text = b" foo\nbar\n"; + assert_eq!(line_range(text, 0, 10), 3..6); + + let text = b"\t func foo\nbar\n"; + assert_eq!(line_range(text, 0, 10), 2..10); + } } From 7b2514a6108593f9da31b4bb6638a145bfa77b51 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 
17 Jun 2020 15:12:16 -0700 Subject: [PATCH 12/71] Whitespace --- tags/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 32eaa0d9..d0746b3d 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -28,7 +28,6 @@ pub struct TagsConfiguration { pattern_info: Vec, } - #[derive(Debug)] pub struct NamedCapture { pub syntax_type: SyntaxType, From 819b800cf973418c7dbd73e628ae26401d618580 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 15:54:29 -0700 Subject: [PATCH 13/71] Pick up the proper initial index and test --- tags/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index d0746b3d..d57e3fb5 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -500,7 +500,7 @@ fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { fn trim_start(text: &[u8], r: Range) -> Range { for (index, c) in text[r.start..r.end].iter().enumerate() { if !c.is_ascii_whitespace(){ - return index..r.end + return (r.start+index)..r.end } } return r @@ -530,5 +530,6 @@ mod tests { let text = b"\t func foo\nbar\n"; assert_eq!(line_range(text, 0, 10), 2..10); + assert_eq!(line_range(text, 11, 10), 11..14); } } From f24a952cb48706cf3134ad8da505462098b65348 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Wed, 17 Jun 2020 15:54:36 -0700 Subject: [PATCH 14/71] Minor output changes --- cli/src/tags.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 4869b8cc..3493f616 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -34,20 +34,27 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> }; if let Some(tags_config) = language_config.tags_config(language)? 
{ - let path_str = format!("{:?}", path); - writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; + let ident = if paths.len() > 1 { + let path_str = format!("{:?}", path); + writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; + "\t" + } else { + "" + }; let source = fs::read(path)?; for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? { let tag = tag?; write!( &mut stdout, - " {:<8} {:<40}\t [{}] {:>9}-{:<9}", - tag.syntax_type, + "{}{:<10}\t | {:<8}\t{} {} - {} `{}`", + ident, str::from_utf8(&source[tag.name_range]).unwrap_or(""), + tag.syntax_type, if tag.is_definition { "def" } else { "ref" }, tag.span.start, tag.span.end, + str::from_utf8(&source[tag.line_range]).unwrap_or(""), )?; if let Some(docs) = tag.docs { if docs.len() > 120 { From 016ad53a2f4f5a79ef4164eaf57a13e5147eb53a Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 07:40:48 -0700 Subject: [PATCH 15/71] Trim end of lines as well --- tags/src/lib.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index d57e3fb5..1959c753 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -494,18 +494,27 @@ fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1); let max_line_len = max_line_len.min(text.len() - start); let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len); - trim_start(text, start..end) + trim_end(text, trim_start(text, start..end)) } fn trim_start(text: &[u8], r: Range) -> Range { for (index, c) in text[r.start..r.end].iter().enumerate() { - if !c.is_ascii_whitespace(){ + if !c.is_ascii_whitespace() { return (r.start+index)..r.end } } return r } +fn trim_end(text: &[u8], r: Range) -> Range { + for (index, c) in text[r.start..r.end].iter().rev().enumerate() { + if !c.is_ascii_whitespace() { + return r.start..(r.end-index) + } + } + 
return r +} + #[cfg(test)] mod tests { use super::*; @@ -528,8 +537,15 @@ mod tests { let text = b" foo\nbar\n"; assert_eq!(line_range(text, 0, 10), 3..6); - let text = b"\t func foo\nbar\n"; + let text = b"\t func foo \nbar\n"; assert_eq!(line_range(text, 0, 10), 2..10); - assert_eq!(line_range(text, 11, 10), 11..14); + + let r = line_range(text, 0, 14); + assert_eq!(r, 2..10); + assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "func foo"); + + let r = line_range(text, 12, 14); + assert_eq!(r, 12..15); + assert_eq!(str::from_utf8(&text[r]).unwrap_or(""), "bar"); } } From 3bcb1f8c9405f77242a0c2f46dabfe4c8e59b53d Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 10:48:33 -0700 Subject: [PATCH 16/71] Assert line trimming --- cli/src/tests/tags_test.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index b6283507..02d06ff6 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -108,10 +108,10 @@ fn test_tags_python() { ] ); - assert_eq!(substr(source, &tags[0].line_range), " class Customer:"); + assert_eq!(substr(source, &tags[0].line_range), "class Customer:"); assert_eq!( substr(source, &tags[1].line_range), - " def age(self):" + "def age(self):" ); assert_eq!(tags[0].docs.as_ref().unwrap(), "Data about a customer"); assert_eq!(tags[1].docs.as_ref().unwrap(), "Get the customer's age"); From 54586c4e5bf5536bf075558b0529f4518f348676 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 14:42:30 -0700 Subject: [PATCH 17/71] Named captures are dynamic New c api for getting list of syntax_type names. 
--- tags/include/tree_sitter/tags.h | 16 ++--- tags/src/c_lib.rs | 46 +++++++------- tags/src/lib.rs | 105 +++++++++++++------------------- 3 files changed, 69 insertions(+), 98 deletions(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index e1ed68bd..f6113a0f 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -18,17 +18,6 @@ typedef enum { TSTagsInvalidQuery, } TSTagsError; -typedef enum { - TSSyntaxTypeFunction, - TSSyntaxTypeMethod, - TSSyntaxTypeClass, - TSSyntaxTypeModule, - TSSyntaxTypeCall, - TSSyntaxTypeType, - TSSyntaxTypeInterface, - TSSyntaxTypeImplementation, -} TSTagSyntaxType; - typedef struct { uint32_t start_byte; uint32_t end_byte; @@ -40,7 +29,7 @@ typedef struct { TSPoint end_point; uint32_t docs_start_byte; uint32_t docs_end_byte; - TSTagSyntaxType syntax_type; + uint32_t syntax_type_id; bool is_definition; } TSTag; @@ -93,6 +82,9 @@ uint32_t ts_tags_buffer_tags_len(const TSTagsBuffer *); const char *ts_tags_buffer_docs(const TSTagsBuffer *); uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); +// Get the syntax kinds for a scope. 
+const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len); + #ifdef __cplusplus } #endif diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 72c708d0..6dc48195 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -1,4 +1,4 @@ -use super::{Error, SyntaxType, TagsConfiguration, TagsContext}; +use super::{Error, TagsConfiguration, TagsContext}; use std::collections::HashMap; use std::ffi::CStr; use std::process::abort; @@ -19,19 +19,6 @@ pub enum TSTagsError { Unknown, } -#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum TSSyntaxType { - Function, - Method, - Class, - Module, - Call, - Type, - Interface, - Implementation, -} - #[repr(C)] pub struct TSPoint { row: u32, @@ -50,7 +37,7 @@ pub struct TSTag { pub end_point: TSPoint, pub docs_start_byte: u32, pub docs_end_byte: u32, - pub syntax_type: TSSyntaxType, + pub syntax_type_id: u32, pub is_definition: bool, } @@ -173,16 +160,7 @@ pub extern "C" fn ts_tagger_tag( }, docs_start_byte: prev_docs_len as u32, docs_end_byte: buffer.docs.len() as u32, - syntax_type: match tag.syntax_type { - SyntaxType::Function => TSSyntaxType::Function, - SyntaxType::Method => TSSyntaxType::Method, - SyntaxType::Class => TSSyntaxType::Class, - SyntaxType::Module => TSSyntaxType::Module, - SyntaxType::Call => TSSyntaxType::Call, - SyntaxType::Type => TSSyntaxType::Type, - SyntaxType::Interface => TSSyntaxType::Interface, - SyntaxType::Implementation => TSSyntaxType::Implementation, - }, + syntax_type_id: tag.syntax_type_id, is_definition: tag.is_definition, }); } @@ -231,6 +209,24 @@ pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 { buffer.docs.len() as u32 } +#[no_mangle] +pub extern "C" fn ts_tagger_syntax_kinds_for_scope_name( + this: *mut TSTagger, + scope_name: *const i8, + len: *mut u32, +) -> *const *const i8 { + let tagger = unwrap_mut_ptr(this); + let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; + let 
len = unwrap_mut_ptr(len); + + *len = 0; + if let Some(config) = tagger.languages.get(scope_name) { + *len = config.c_syntax_type_names.len() as u32; + return config.c_syntax_type_names.as_ptr() as *const *const i8 + } + std::ptr::null() +} + fn unwrap_ptr<'a, T>(result: *const T) -> &'a T { unsafe { result.as_ref() }.unwrap_or_else(|| { eprintln!("{}:{} - pointer must not be null", file!(), line!()); diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 1959c753..3d5ce770 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -5,6 +5,7 @@ use regex::Regex; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; use std::{fmt, mem, str}; +use std::ffi::CStr; use std::collections::HashMap; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, @@ -19,6 +20,8 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100; pub struct TagsConfiguration { pub language: Language, pub query: Query, + syntax_type_names: Vec>, + c_syntax_type_names: Vec<*const u8>, capture_map: HashMap, doc_capture_index: Option, name_capture_index: Option, @@ -30,24 +33,10 @@ pub struct TagsConfiguration { #[derive(Debug)] pub struct NamedCapture { - pub syntax_type: SyntaxType, + pub syntax_type_id: u32, pub is_definition: bool, } -// Should stay in sync with list of valid syntax types in semantic. 
-// See: https://github.com/github/semantic/blob/621696f5bc523a651f1cf9fc2ac58c557ea02d07/proto/semantic.proto#L165-L174 -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum SyntaxType { - Function, - Method, - Class, - Module, - Call, - Type, - Interface, - Implementation, -} - pub struct TagsContext { parser: Parser, cursor: QueryCursor, @@ -61,7 +50,7 @@ pub struct Tag { pub span: Range, pub docs: Option, pub is_definition: bool, - pub syntax_type: SyntaxType, + pub syntax_type_id: u32, } #[derive(Debug, PartialEq)] @@ -70,6 +59,7 @@ pub enum Error { Regex(regex::Error), Cancelled, InvalidLanguage, + InvalidCapture(String), } #[derive(Debug, Default)] @@ -120,11 +110,13 @@ impl TagsConfiguration { } } - let mut capture_map: HashMap = HashMap::new(); + let mut capture_map = HashMap::new(); + let mut syntax_type_names = Vec::new(); let mut doc_capture_index = None; let mut name_capture_index = None; let mut local_scope_capture_index = None; let mut local_definition_capture_index = None; + let mut syntax_type_id = 0; for (i, name) in query.capture_names().iter().enumerate() { match name.as_str() { "" => continue, @@ -132,12 +124,32 @@ impl TagsConfiguration { "doc" => doc_capture_index = Some(i as u32), "local.scope" => local_scope_capture_index = Some(i as u32), "local.definition" => local_definition_capture_index = Some(i as u32), - _ => if let Some(nc) = NamedCapture::new(name) { - capture_map.insert(i as u32, nc); + "local.reference" => continue, + _ => { + let mut is_definition = false; + + let kind = if name.starts_with("definition.") { + is_definition = true; + name.trim_start_matches("definition.") + } else if name.starts_with("reference.") { + name.trim_start_matches("reference.") + } else { + return Err(Error::InvalidCapture(name.to_string())) + }.to_string()+"\0"; + + capture_map.insert(i as u32, NamedCapture{ syntax_type_id, is_definition }); + syntax_type_id+=1; + if let Ok(cstr) = CStr::from_bytes_with_nul(kind.as_bytes()) { + 
syntax_type_names.push(cstr.to_bytes_with_nul().to_vec().into_boxed_slice()); + } } } } + let c_syntax_type_names = syntax_type_names.iter().map( |s| { + s.as_ptr() + }).collect(); + let pattern_info = (0..query.pattern_count()) .map(|pattern_index| { let mut info = PatternInfo::default(); @@ -182,6 +194,8 @@ impl TagsConfiguration { Ok(TagsConfiguration { language, query, + syntax_type_names, + c_syntax_type_names, capture_map, doc_capture_index, name_capture_index, @@ -191,6 +205,13 @@ impl TagsConfiguration { pattern_info, }) } + + pub fn syntax_type_name(&self, id: u32) -> &str { + unsafe { + let cstr = CStr::from_ptr(self.syntax_type_names[id as usize].as_ptr() as *const i8).to_bytes(); + str::from_utf8(cstr).expect("syntax type name was not valid utf-8") + } + } } impl TagsContext { @@ -301,7 +322,7 @@ where let mut name_range = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; - let mut syntax_type = SyntaxType::Function; + let mut syntax_type_id = 0; let mut is_definition = false; let mut docs_adjacent_node = None; @@ -320,7 +341,7 @@ where if let Some(named_capture) = self.config.capture_map.get(&capture.index) { tag_node = Some(capture.node); - syntax_type = named_capture.syntax_type; + syntax_type_id = named_capture.syntax_type_id; is_definition = named_capture.is_definition; } } @@ -407,7 +428,7 @@ where range, name_range, docs, - syntax_type, + syntax_type_id, is_definition, }; } @@ -421,7 +442,7 @@ where range, name_range, docs, - syntax_type, + syntax_type_id, is_definition, }, mat.pattern_index, @@ -440,44 +461,6 @@ where } } -impl NamedCapture { - pub fn new(name: &String) -> Option { - let mut is_definition = false; - - let kind = if name.starts_with("definition.") { - is_definition = true; - name.trim_start_matches("definition.") - } else if name.starts_with("reference.") { - name.trim_start_matches("reference.") - } else { - name - }; - - let syntax_type = match kind.as_ref() { - "function" => {is_definition = true; 
SyntaxType::Function}, - "method" => {is_definition = true; SyntaxType::Method}, - "class" => {is_definition = true; SyntaxType::Class}, - "module" => {is_definition = true; SyntaxType::Module}, - "call" => SyntaxType::Call, - "type" => SyntaxType::Type, - "interface" => SyntaxType::Interface, - "implementation" => SyntaxType::Implementation, - _ => return None, - }; - - return Some(NamedCapture{ - syntax_type, - is_definition - }) - } -} - -impl fmt::Display for SyntaxType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - format!("{:?}", self).fmt(f) - } -} - impl From for Error { fn from(error: regex::Error) -> Self { Error::Regex(error) From 75724698f0b668b6511b8dcf4bf718733abfffb5 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 14:42:41 -0700 Subject: [PATCH 18/71] Fix up tests --- cli/src/tests/tags_test.rs | 60 +++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 02d06ff6..cc339e0a 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -2,8 +2,9 @@ use super::helpers::allocations; use super::helpers::fixtures::{get_language, get_language_queries_path}; use std::ffi::CString; use std::{fs, ptr, slice, str}; +use std::ffi::CStr; use tree_sitter_tags::c_lib as c; -use tree_sitter_tags::{Error, SyntaxType, TagsConfiguration, TagsContext}; +use tree_sitter_tags::{Error, TagsConfiguration, TagsContext}; const PYTHON_TAG_QUERY: &'static str = r#" ( @@ -97,14 +98,15 @@ fn test_tags_python() { .collect::, _>>() .unwrap(); + assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.syntax_type)) + .map(|t| (substr(source, &t.name_range), tags_config.syntax_type_name(t.syntax_type_id))) .collect::>(), &[ - ("Customer", SyntaxType::Class), - ("age", SyntaxType::Function), - ("compute_age", SyntaxType::Call), + ("Customer", "class"), + ("age", "function"), + ("compute_age", "call"), ] ); @@ -150,12 
+152,12 @@ fn test_tags_javascript() { assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.syntax_type)) + .map(|t| (substr(source, &t.name_range), tags_config.syntax_type_name(t.syntax_type_id))) .collect::>(), &[ - ("Customer", SyntaxType::Class), - ("getAge", SyntaxType::Method), - ("Agent", SyntaxType::Class) + ("Customer", "class"), + ("getAge", "method"), + ("Agent", "class") ] ); assert_eq!( @@ -204,18 +206,18 @@ fn test_tags_ruby() { tags.iter() .map(|t| ( substr(source.as_bytes(), &t.name_range), - t.syntax_type, + tags_config.syntax_type_name(t.syntax_type_id), (t.span.start.row, t.span.start.column), )) .collect::>(), &[ - ("foo", SyntaxType::Method, (2, 0)), - ("bar", SyntaxType::Call, (7, 4)), - ("a", SyntaxType::Call, (7, 8)), - ("b", SyntaxType::Call, (7, 11)), - ("each", SyntaxType::Call, (9, 14)), - ("baz", SyntaxType::Call, (13, 8)), - ("b", SyntaxType::Call, (13, 15),), + ("foo", "method", (2, 0)), + ("bar", "call", (7, 4)), + ("a", "call", (7, 8)), + ("b", "call", (7, 11)), + ("each", "call", (9, 14)), + ("baz", "call", (13, 8)), + ("b", "call", (13, 15),), ] ); } @@ -253,6 +255,14 @@ fn test_tags_cancellation() { }); } +#[test] +fn test_invalid_cpature() { + let language = get_language("python"); + let e = TagsConfiguration::new(language, "(identifier) @method", "") + .expect_err("expected InvalidCapture error"); + assert_eq!(e, Error::InvalidCapture("method".to_string())); +} + #[test] fn test_tags_via_c_api() { allocations::record(|| { @@ -316,10 +326,18 @@ fn test_tags_via_c_api() { }) .unwrap(); + let syntax_types: Vec<&str> = unsafe { + let mut len: u32 = 0; + let ptr = c::ts_tagger_syntax_kinds_for_scope_name(tagger, c_scope_name.as_ptr(), &mut len); + slice::from_raw_parts(ptr, len as usize).iter().map(|i| { + CStr::from_ptr(*i).to_str().unwrap() + }).collect() + }; + assert_eq!( tags.iter() .map(|tag| ( - tag.syntax_type, + syntax_types[tag.syntax_type_id as usize], &source_code[tag.name_start_byte as 
usize..tag.name_end_byte as usize], &source_code[tag.line_start_byte as usize..tag.line_end_byte as usize], &docs[tag.docs_start_byte as usize..tag.docs_end_byte as usize], @@ -327,18 +345,18 @@ fn test_tags_via_c_api() { .collect::>(), &[ ( - c::TSSyntaxType::Function, + "function", "b", "function b() {", "one\ntwo\nthree" ), ( - c::TSSyntaxType::Class, + "class", "C", "class C extends D {", "four\nfive" ), - (c::TSSyntaxType::Call, "b", "b(a);", "") + ("call", "b", "b(a);", "") ] ); From b6ae67a6100a7c1fa6a249a2b4e0ff04378a41b5 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 14:43:10 -0700 Subject: [PATCH 19/71] Fix up CLI, use new syntax_type_name --- cli/src/tags.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 3493f616..515f4c52 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -50,7 +50,7 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> "{}{:<10}\t | {:<8}\t{} {} - {} `{}`", ident, str::from_utf8(&source[tag.name_range]).unwrap_or(""), - tag.syntax_type, + &tags_config.syntax_type_name(tag.syntax_type_id), if tag.is_definition { "def" } else { "ref" }, tag.span.start, tag.span.end, From 17d26c0d5a5d2b836a0b5f77414c007572589b97 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 14:43:27 -0700 Subject: [PATCH 20/71] Improved errors --- cli/src/error.rs | 2 +- tags/src/lib.rs | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cli/src/error.rs b/cli/src/error.rs index 824bd92f..d583d1b9 100644 --- a/cli/src/error.rs +++ b/cli/src/error.rs @@ -83,7 +83,7 @@ impl<'a> From for Error { impl<'a> From for Error { fn from(error: tree_sitter_tags::Error) -> Self { - Error::new(format!("{:?}", error)) + Error::new(format!("{}", error)) } } diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 3d5ce770..07fed3af 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -62,6 +62,15 @@ pub enum Error { 
InvalidCapture(String), } +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::InvalidCapture(name) => write!(f, "Invalid capture @{}. Expected one of: @definition.*, @reference.*, @doc, @name, @local.(scope|definition|reference).", name), + _ => write!(f, "{:?}", self) + } + } +} + #[derive(Debug, Default)] struct PatternInfo { docs_adjacent_capture: Option, From ef15f4df24af34f685eefc630b2af69b1ee661b2 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 15:05:08 -0700 Subject: [PATCH 21/71] Dedupe items in syntax_type_names --- tags/src/lib.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 07fed3af..128a01cf 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -5,7 +5,7 @@ use regex::Regex; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; use std::{fmt, mem, str}; -use std::ffi::CStr; +use std::ffi::{CStr, CString}; use std::collections::HashMap; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, @@ -125,7 +125,6 @@ impl TagsConfiguration { let mut name_capture_index = None; let mut local_scope_capture_index = None; let mut local_definition_capture_index = None; - let mut syntax_type_id = 0; for (i, name) in query.capture_names().iter().enumerate() { match name.as_str() { "" => continue, @@ -144,12 +143,15 @@ impl TagsConfiguration { name.trim_start_matches("reference.") } else { return Err(Error::InvalidCapture(name.to_string())) - }.to_string()+"\0"; + }; - capture_map.insert(i as u32, NamedCapture{ syntax_type_id, is_definition }); - syntax_type_id+=1; - if let Ok(cstr) = CStr::from_bytes_with_nul(kind.as_bytes()) { - syntax_type_names.push(cstr.to_bytes_with_nul().to_vec().into_boxed_slice()); + if let Ok(cstr) = CString::new(kind) { + let c_kind = cstr.to_bytes_with_nul().to_vec().into_boxed_slice(); + let syntax_type_id = 
syntax_type_names.iter().position(|n| { n == &c_kind }).unwrap_or_else(|| { + syntax_type_names.push(c_kind); + syntax_type_names.len() - 1 + }) as u32; + capture_map.insert(i as u32, NamedCapture{ syntax_type_id, is_definition }); } } } From f166947abb3fa834463dfb21b0044d30b0617795 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 15:05:27 -0700 Subject: [PATCH 22/71] Test updates, definition/reference prefix is now required --- cli/src/tests/tags_test.rs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index cc339e0a..540e2b01 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -10,65 +10,65 @@ const PYTHON_TAG_QUERY: &'static str = r#" ( (function_definition name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @function + body: (block . (expression_statement (string) @doc))) @definition.function (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (function_definition - name: (identifier) @name) @function + name: (identifier) @name) @definition.function ( (class_definition name: (identifier) @name body: (block - . (expression_statement (string) @doc))) @class + . (expression_statement (string) @doc))) @definition.class (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (class_definition - name: (identifier) @name) @class + name: (identifier) @name) @definition.class (call - function: (identifier) @name) @call + function: (identifier) @name) @reference.call "#; const JS_TAG_QUERY: &'static str = r#" ( (comment)* @doc . (class_declaration - name: (identifier) @name) @class - (#select-adjacent! @doc @class) + name: (identifier) @name) @definition.class + (#select-adjacent! @doc @definition.class) (#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) ( (comment)* @doc . (method_definition - name: (property_identifier) @name) @method - (#select-adjacent! 
@doc @method) + name: (property_identifier) @name) @definition.method + (#select-adjacent! @doc @definition.method) (#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) ( (comment)* @doc . (function_declaration - name: (identifier) @name) @function - (#select-adjacent! @doc @function) + name: (identifier) @name) @definition.function + (#select-adjacent! @doc @definition.function) (#strip! @doc "(^[/\\*\\s]*)|([/\\*\\s]*$)") ) (call_expression - function: (identifier) @name) @call + function: (identifier) @name) @reference.call "#; const RUBY_TAG_QUERY: &'static str = r#" (method - name: (identifier) @name) @method + name: (identifier) @name) @definition.method (method_call - method: (identifier) @name) @call + method: (identifier) @name) @reference.call -((identifier) @name @call +((identifier) @name @reference.call (#is-not? local)) "#; @@ -256,7 +256,7 @@ fn test_tags_cancellation() { } #[test] -fn test_invalid_cpature() { +fn test_invalid_capture() { let language = get_language("python"); let e = TagsConfiguration::new(language, "(identifier) @method", "") .expect_err("expected InvalidCapture error"); From d9d3da994218339e525925b6cfda81247a22c001 Mon Sep 17 00:00:00 2001 From: Timothy Clem Date: Thu, 18 Jun 2020 16:04:05 -0700 Subject: [PATCH 23/71] Fill out rest of c errors --- tags/include/tree_sitter/tags.h | 1 + tags/src/c_lib.rs | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index f6113a0f..58f5bbd9 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -16,6 +16,7 @@ typedef enum { TSTagsInvalidUtf8, TSTagsInvalidRegex, TSTagsInvalidQuery, + TSTagsInvalidCapture, } TSTagsError; typedef struct { diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 6dc48195..77f8aae5 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -16,6 +16,7 @@ pub enum TSTagsError { InvalidUtf8, InvalidRegex, InvalidQuery, + InvalidCapture, Unknown, } @@ -93,7 
+94,9 @@ pub extern "C" fn ts_tagger_add_language( } Err(Error::Query(_)) => TSTagsError::InvalidQuery, Err(Error::Regex(_)) => TSTagsError::InvalidRegex, - Err(_) => TSTagsError::Unknown, + Err(Error::Cancelled) => TSTagsError::Timeout, + Err(Error::InvalidLanguage) => TSTagsError::InvalidLanguage, + Err(Error::InvalidCapture(_)) => TSTagsError::InvalidCapture, } } From 0438ed03ffbb4db86283ae3fcea3529971f1715b Mon Sep 17 00:00:00 2001 From: intrigus-lgtm <60750685+intrigus-lgtm@users.noreply.github.com> Date: Mon, 6 Jul 2020 22:47:10 +0200 Subject: [PATCH 24/71] Fix wrong file name (#666) "build_fuzzers" -> "build-fuzzers". It should be a hyphen and not an underscore. --- test/fuzz/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/fuzz/README.md b/test/fuzz/README.md index 649d2d89..a02d2689 100644 --- a/test/fuzz/README.md +++ b/test/fuzz/README.md @@ -22,10 +22,10 @@ The fuzzers can then be built with: export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \ LIB_FUZZER_PATH=$HOME/src/compiler-rt/lib/fuzzer/libFuzzer.a \ - ./script/build_fuzzers + ./script/build-fuzzers ``` -This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build_fuzzers python ruby`. +This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars` and will be instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`.
The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments: ``` From 0bf2450b4aa26e79d9fcb1e2007e183ff14d2424 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 6 Jul 2020 15:56:21 -0700 Subject: [PATCH 25/71] Always enforce stack version limit during reductions Fixes #669 --- cli/src/tests/mod.rs | 1 + cli/src/tests/pathological_test.rs | 15 +++++++++++++++ lib/src/parser.c | 18 ++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 cli/src/tests/pathological_test.rs diff --git a/cli/src/tests/mod.rs b/cli/src/tests/mod.rs index ac54db00..24e8160e 100644 --- a/cli/src/tests/mod.rs +++ b/cli/src/tests/mod.rs @@ -3,6 +3,7 @@ mod helpers; mod highlight_test; mod node_test; mod parser_test; +mod pathological_test; mod query_test; mod tags_test; mod test_highlight_test; diff --git a/cli/src/tests/pathological_test.rs b/cli/src/tests/pathological_test.rs new file mode 100644 index 00000000..7ebd5439 --- /dev/null +++ b/cli/src/tests/pathological_test.rs @@ -0,0 +1,15 @@ +use super::helpers::allocations; +use super::helpers::fixtures::get_language; +use tree_sitter::Parser; + +#[test] +fn test_pathological_example_1() { + let language = "cpp"; + let source = r#"*ss(qqXstack); - uint32_t removed_version_count = 0; - StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + // Pop the given number of nodes from the given version of the parse stack. + // If stack versions have previously merged, then there may be more than one + // path back through the stack. For each path, create a new parent node to + // contain the popped children, and push it onto the stack in place of the + // children. 
+ StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + uint32_t removed_version_count = 0; for (uint32_t i = 0; i < pop.size; i++) { StackSlice slice = pop.contents[i]; StackVersion slice_version = slice.version - removed_version_count; - // Error recovery can sometimes cause lots of stack versions to merge, - // such that a single pop operation can produce a lots of slices. - // Avoid creating too many stack versions in that situation. - if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + // This is where new versions are added to the parse stack. The versions + // will all be sorted and truncated at the end of the outer parsing loop. + // Allow the maximum version count to be temporarily exceeded, but only + // by a limited threshold. + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; From 86a5dabbcbdac650c53a889183bf56d7e721e09e Mon Sep 17 00:00:00 2001 From: Jacob Gillespie Date: Tue, 7 Jul 2020 16:45:23 +0100 Subject: [PATCH 26/71] Add TypeScript definition for DSL (#658) --- cli/npm/dsl.d.ts | 356 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 cli/npm/dsl.d.ts diff --git a/cli/npm/dsl.d.ts b/cli/npm/dsl.d.ts new file mode 100644 index 00000000..b9bf1c98 --- /dev/null +++ b/cli/npm/dsl.d.ts @@ -0,0 +1,356 @@ +type AliasRule = {type: 'ALIAS'; named: boolean; content: Rule; value: string}; +type BlankRule = {type: 'BLANK'}; +type ChoiceRule = {type: 'CHOICE'; members: Rule[]}; +type FieldRule = {type: 'FIELD'; name: string; content: Rule}; +type ImmediateTokenRule = {type: 'IMMEDIATE_TOKEN'; content: Rule}; +type PatternRule = {type: 'PATTERN'; value: string}; +type PrecDynamicRule = {type: 'PREC_DYNAMIC'; content: Rule; value: number}; +type PrecLeftRule = {type: 'PREC_LEFT'; content: Rule; value: 
number}; +type PrecRightRule = {type: 'PREC_RIGHT'; content: Rule; value: number}; +type PrecRule = {type: 'PREC'; content: Rule; value: number}; +type Repeat1Rule = {type: 'REPEAT1'; content: Rule}; +type RepeatRule = {type: 'REPEAT'; content: Rule}; +type SeqRule = {type: 'SEQ'; members: Rule[]}; +type StringRule = {type: 'STRING'; value: string}; +type SymbolRule = {type: 'SYMBOL'; name: Name}; +type TokenRule = {type: 'TOKEN'; content: Rule}; + +type Rule = + | AliasRule + | BlankRule + | ChoiceRule + | FieldRule + | ImmediateTokenRule + | PatternRule + | PrecDynamicRule + | PrecLeftRule + | PrecRightRule + | PrecRule + | Repeat1Rule + | RepeatRule + | SeqRule + | StringRule + | SymbolRule + | TokenRule; + +type RuleOrLiteral = Rule | RegExp | string; + +type GrammarSymbols = { + [name in RuleName]: SymbolRule; +} & + Record>; + +type RuleBuilder = ( + $: GrammarSymbols, +) => RuleOrLiteral; + +type RuleBuilders< + RuleName extends string, + BaseGrammarRuleName extends string +> = { + [name in RuleName]: RuleBuilder; +}; + +interface Grammar< + RuleName extends string, + BaseGrammarRuleName extends string = never, + Rules extends RuleBuilders = RuleBuilders< + RuleName, + BaseGrammarRuleName + > +> { + /** + * Name of the grammar language. + */ + name: string; + + /** Mapping of grammar rule names to rule builder functions. */ + rules: Rules; + + /** + * An array of arrays of rule names. Each inner array represents a set of + * rules that's involved in an _LR(1) conflict_ that is _intended to exist_ + * in the grammar. When these conflicts occur at runtime, Tree-sitter will + * use the GLR algorithm to explore all of the possible interpretations. If + * _multiple_ parses end up succeeding, Tree-sitter will pick the subtree + * whose corresponding rule has the highest total _dynamic precedence_. 
+ * + * @param $ grammar rules + */ + conflicts?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[][]; + + /** + * An array of token names which can be returned by an _external scanner_. + * External scanners allow you to write custom C code which runs during the + * lexing process in order to handle lexical rules (e.g. Python's indentation + * tokens) that cannot be described by regular expressions. + * + * @param $ grammar rules + * @param previous array of externals from the base schema, if any + * + * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners + */ + externals?: ( + $: Record>, + previous: Rule[], + ) => SymbolRule[]; + + /** + * An array of tokens that may appear anywhere in the language. This + * is often used for whitespace and comments. The default value of + * extras is to accept whitespace. To control whitespace explicitly, + * specify extras: `$ => []` in your grammar. + * + * @param $ grammar rules + */ + extras?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * An array of rules that should be automatically removed from the + * grammar by replacing all of their usages with a copy of their definition. + * This is useful for rules that are used in multiple places but for which + * you don't want to create syntax tree nodes at runtime. + * + * @param $ grammar rules + */ + inline?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * A list of hidden rule names that should be considered supertypes in the + * generated node types file. + * + * @param $ grammar rules + * + * @see http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types + */ + supertypes?: ( + $: GrammarSymbols, + ) => RuleOrLiteral[]; + + /** + * The name of a token that will match keywords for the purpose of the + * keyword extraction optimization. 
+ * + * @param $ grammar rules + * + * @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction + */ + word?: ($: GrammarSymbols) => RuleOrLiteral; +} + +type GrammarSchema = { + [K in keyof Grammar]: K extends 'rules' + ? Record + : Grammar[K]; +}; + +/** + * Causes the given rule to appear with an alternative name in the syntax tree. + * For instance with `alias($.foo, 'bar')`, the aliased rule will appear as an + * anonymous node, as if the rule had been written as the simple string. + * + * @param rule rule that will be aliased + * @param name target name for the alias + */ +declare function alias(rule: RuleOrLiteral, name: string): AliasRule; + +/** + * Causes the given rule to appear as an alternative named node, for instance + * with `alias($.foo, $.bar)`, the aliased rule `foo` will appear as a named + * node called `bar`. + * + * @param rule rule that will be aliased + * @param symbol target symbol for the alias + */ +declare function alias( + rule: RuleOrLiteral, + symbol: SymbolRule, +): AliasRule; + +/** + * Creates a blank rule, matching nothing. + */ +declare function blank(): BlankRule; + +/** + * Assigns a field name to the child node(s) matched by the given rule. + * In the resulting syntax tree, you can then use that field name to + * access specific children. + * + * @param name name of the field + * @param rule rule the field should match + */ +declare function field(name: string, rule: RuleOrLiteral): FieldRule; + +/** + * Creates a rule that matches one of a set of possible rules. The order + * of the arguments does not matter. This is analogous to the `|` (pipe) + * operator in EBNF notation. + * + * @param options possible rule choices + */ +declare function choice(...options: RuleOrLiteral[]): ChoiceRule; + +/** + * Creates a rule that matches zero or one occurrence of a given rule. + * It is analogous to the `[x]` (square bracket) syntax in EBNF notation. 
+ * + * @param value rule to be made optional + */ +declare function optional(rule: RuleOrLiteral): ChoiceRule; + +/** + * Marks the given rule with a numerical precedence which will be used to + * resolve LR(1) conflicts at parser-generation time. When two rules overlap + * in a way that represents either a true ambiguity or a _local_ ambiguity + * given one token of lookahead, Tree-sitter will try to resolve the conflict by + * matching the rule with the higher precedence. The default precedence of all + * rules is zero. This works similarly to the precedence directives in Yacc grammars. + * + * @param number precedence weight + * @param rule rule being weighted + * + * @see https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ +declare const prec: { + (number: number, rule: RuleOrLiteral): PrecRule; + + /** + * Marks the given rule as left-associative (and optionally applies a + * numerical precedence). When an LR(1) conflict arises in which all of the + * rules have the same numerical precedence, Tree-sitter will consult the + * rules' associativity. If there is a left-associative rule, Tree-sitter + * will prefer matching a rule that ends _earlier_. This works similarly to + * associativity directives in Yacc grammars. + * + * @param number (optional) precedence weight + * @param rule rule to mark as left-associative + * + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ + left(rule: RuleOrLiteral): PrecLeftRule; + left(number: number, rule: RuleOrLiteral): PrecLeftRule; + + /** + * Marks the given rule as right-associative (and optionally applies a + * numerical precedence). When an LR(1) conflict arises in which all of the + * rules have the same numerical precedence, Tree-sitter will consult the + * rules' associativity. If there is a right-associative rule, Tree-sitter + * will prefer matching a rule that ends _later_. 
This works similarly to + * associativity directives in Yacc grammars. + * + * @param number (optional) precedence weight + * @param rule rule to mark as right-associative + * + * @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html + */ + right(rule: RuleOrLiteral): PrecRightRule; + right(number: number, rule: RuleOrLiteral): PrecRightRule; + + /** + * Marks the given rule with a numerical precedence which will be used to + * resolve LR(1) conflicts at _runtime_ instead of parser-generation time. + * This is only necessary when handling a conflict dynamically using the + * `conflicts` field in the grammar, and when there is a genuine _ambiguity_: + * multiple rules correctly match a given piece of code. In that event, + * Tree-sitter compares the total dynamic precedence associated with each + * rule, and selects the one with the highest total. This is similar to + * dynamic precedence directives in Bison grammars. + * + * @param number precedence weight + * @param rule rule being weighted + * + * @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html + */ + dynamic(number: number, rule: RuleOrLiteral): PrecDynamicRule; +}; + +/** + * Creates a rule that matches _zero-or-more_ occurrences of a given rule. + * It is analogous to the `{x}` (curly brace) syntax in EBNF notation. This + * rule is implemented in terms of `repeat1` but is included because it + * is very commonly used. + * + * @param rule rule to repeat, zero or more times + */ +declare function repeat(rule: RuleOrLiteral): RepeatRule; + +/** + * Creates a rule that matches one-or-more occurrences of a given rule. + * + * @param rule rule to repeat, one or more times + */ +declare function repeat1(rule: RuleOrLiteral): Repeat1Rule; + +/** + * Creates a rule that matches any number of other rules, one after another. + * It is analogous to simply writing multiple symbols next to each other + * in EBNF notation. 
+ * + * @param rules ordered rules that comprise the sequence + */ +declare function seq(...rules: RuleOrLiteral[]): SeqRule; + +/** + * Creates a symbol rule, representing another rule in the grammar by name. + * + * @param name name of the target rule + */ +declare function sym(name: Name): SymbolRule; + +/** + * Marks the given rule as producing only a single token. Tree-sitter's + * default is to treat each String or RegExp literal in the grammar as a + * separate token. Each token is matched separately by the lexer and + * returned as its own leaf node in the tree. The token function allows + * you to express a complex rule using the DSL functions (rather + * than as a single regular expression) but still have Tree-sitter treat + * it as a single token. + * + * @param rule rule to represent as a single token + */ +declare const token: { + (rule: RuleOrLiteral): TokenRule; + + /** + * Marks the given rule as producing an immediate token. This allows + * the parser to produce a different token based on whether or not + * there are `extras` preceding the token's main content. When there + * are _no_ leading `extras`, an immediate token is preferred over a + * normal token which would otherwise match. + * + * @param rule rule to represent as an immediate token + */ + immediate(rule: RuleOrLiteral): ImmediateTokenRule; +}; + +/** + * Creates a new language grammar with the provided schema. + * + * @param options grammar options + */ +declare function grammar( + options: Grammar, +): GrammarSchema; + +/** + * Extends an existing language grammar with the provided options, + * creating a new language. 
+ * + * @param baseGrammar base grammar schema to extend from + * @param options grammar options for the new extended language + */ +declare function grammar< + BaseGrammarRuleName extends string, + RuleName extends string +>( + baseGrammar: GrammarSchema, + options: Grammar, +): GrammarSchema; From d614c14c2cfc5911674f233ba7073c3dc3a90fdd Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 8 Jul 2020 12:36:59 -0700 Subject: [PATCH 27/71] tags: Make spans refer to name, not entire def/ref Co-authored-by: Tim Clem Co-authored-by: Beka Valentine --- cli/src/tests/tags_test.rs | 23 ++++++++++++++++++----- tags/src/lib.rs | 12 +++++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index fad8ebd8..f3df4b53 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -2,6 +2,7 @@ use super::helpers::allocations; use super::helpers::fixtures::{get_language, get_language_queries_path}; use std::ffi::CString; use std::{fs, ptr, slice, str}; +use tree_sitter::Point; use tree_sitter_tags::c_lib as c; use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext}; @@ -150,12 +151,24 @@ fn test_tags_javascript() { assert_eq!( tags.iter() - .map(|t| (substr(source, &t.name_range), t.kind)) + .map(|t| (substr(source, &t.name_range), t.span.clone(), t.kind)) .collect::>(), &[ - ("Customer", TagKind::Class), - ("getAge", TagKind::Method), - ("Agent", TagKind::Class) + ( + "Customer", + Point::new(5, 10)..Point::new(5, 18), + TagKind::Class + ), + ( + "getAge", + Point::new(9, 8)..Point::new(9, 14), + TagKind::Method + ), + ( + "Agent", + Point::new(15, 10)..Point::new(15, 15), + TagKind::Class + ) ] ); assert_eq!( @@ -209,7 +222,7 @@ fn test_tags_ruby() { )) .collect::>(), &[ - ("foo", TagKind::Method, (2, 0)), + ("foo", TagKind::Method, (2, 4)), ("bar", TagKind::Call, (7, 4)), ("a", TagKind::Call, (7, 8)), ("b", TagKind::Call, (7, 11)), diff --git a/tags/src/lib.rs 
b/tags/src/lib.rs index 8d1853bb..613e56ac 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -300,7 +300,7 @@ where continue; } - let mut name_range = None; + let mut name_node = None; let mut doc_nodes = Vec::new(); let mut tag_node = None; let mut kind = TagKind::Call; @@ -314,7 +314,7 @@ where } if index == self.config.name_capture_index { - name_range = Some(capture.node.byte_range()); + name_node = Some(capture.node); } else if index == self.config.doc_capture_index { doc_nodes.push(capture.node); } else if index == self.config.call_capture_index { @@ -335,7 +335,9 @@ where } } - if let (Some(tag_node), Some(name_range)) = (tag_node, name_range) { + if let (Some(tag_node), Some(name_node)) = (tag_node, name_node) { + let name_range = name_node.byte_range(); + if pattern_info.name_must_be_non_local { let mut is_local = false; for scope in self.scopes.iter().rev() { @@ -413,7 +415,7 @@ where *pattern_index = mat.pattern_index; *tag = Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.end_position(), + span: name_node.start_position()..name_node.end_position(), kind, range, name_range, @@ -426,7 +428,7 @@ where ( Tag { line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: tag_node.start_position()..tag_node.end_position(), + span: name_node.start_position()..name_node.end_position(), kind, range, name_range, From 255cf0a9cfe58654a40fd166dcbc3a0849073a22 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 8 Jul 2020 15:23:21 -0700 Subject: [PATCH 28/71] tags: Add utf16 column ranges to tags Also, ensure that line ranges contain only valid UTF8. 
Co-authored-by: Tim Clem Co-authored-by: Beka Valentine --- cli/src/tests/tags_test.rs | 42 +++++++--- tags/src/lib.rs | 159 +++++++++++++++++++++++++++---------- 2 files changed, 148 insertions(+), 53 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index f3df4b53..c81f6966 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -8,21 +8,21 @@ use tree_sitter_tags::{Error, TagKind, TagsConfiguration, TagsContext}; const PYTHON_TAG_QUERY: &'static str = r#" ( - (function_definition - name: (identifier) @name - body: (block . (expression_statement (string) @doc))) @function - (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") + (function_definition + name: (identifier) @name + body: (block . (expression_statement (string) @doc))) @function + (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (function_definition name: (identifier) @name) @function ( - (class_definition - name: (identifier) @name - body: (block - . (expression_statement (string) @doc))) @class - (#strip! @doc "(^['\"\\s]*)|(['\"\\s]*$)") + (class_definition + name: (identifier) @name + body: (block + . (expression_statement (string) @doc))) @class + (#strip! 
@doc "(^['\"\\s]*)|(['\"\\s]*$)") ) (class_definition @@ -30,6 +30,10 @@ const PYTHON_TAG_QUERY: &'static str = r#" (call function: (identifier) @name) @call + +(call + function: (attribute + attribute: (identifier) @name)) @call "#; const JS_TAG_QUERY: &'static str = r#" @@ -179,6 +183,26 @@ fn test_tags_javascript() { assert_eq!(tags[2].docs, None); } +#[test] +fn test_tags_columns_measured_in_utf16_code_units() { + let language = get_language("python"); + let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); + let mut tag_context = TagsContext::new(); + + let source = r#""❤️❤️❤️".hello_α_ω()"#.as_bytes(); + + let tag = tag_context + .generate_tags(&tags_config, source, None) + .unwrap() + .next() + .unwrap() + .unwrap(); + + assert_eq!(substr(source, &tag.name_range), "hello_α_ω"); + assert_eq!(tag.span, Point::new(0, 21)..Point::new(0, 32)); + assert_eq!(tag.utf16_column_range, 9..18); +} + #[test] fn test_tags_ruby() { let language = get_language("ruby"); diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 613e56ac..a240666f 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -1,10 +1,10 @@ pub mod c_lib; -use memchr::{memchr, memrchr}; +use memchr::memchr; use regex::Regex; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::{fmt, mem, str}; +use std::{char, fmt, mem, str}; use tree_sitter::{ Language, Parser, Point, Query, QueryCursor, QueryError, QueryPredicateArg, Tree, }; @@ -43,6 +43,7 @@ pub struct Tag { pub name_range: Range, pub line_range: Range, pub span: Range, + pub utf16_column_range: Range, pub docs: Option, } @@ -404,39 +405,32 @@ where // Only create one tag per node. The tag queue is sorted by node position // to allow for fast lookup. 
let range = tag_node.byte_range(); - match self - .tag_queue - .binary_search_by_key(&(name_range.end, name_range.start), |(tag, _)| { - (tag.name_range.end, tag.name_range.start) - }) { + let span = name_node.start_position()..name_node.end_position(); + let utf16_column_range = + get_utf16_column_range(self.source, &name_range, &span); + let line_range = + line_range(self.source, name_range.start, span.start, MAX_LINE_LEN); + let tag = Tag { + line_range, + span, + utf16_column_range, + kind, + range, + name_range, + docs, + }; + match self.tag_queue.binary_search_by_key( + &(tag.name_range.end, tag.name_range.start), + |(tag, _)| (tag.name_range.end, tag.name_range.start), + ) { Ok(i) => { - let (tag, pattern_index) = &mut self.tag_queue[i]; + let (existing_tag, pattern_index) = &mut self.tag_queue[i]; if *pattern_index > mat.pattern_index { *pattern_index = mat.pattern_index; - *tag = Tag { - line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: name_node.start_position()..name_node.end_position(), - kind, - range, - name_range, - docs, - }; + *existing_tag = tag; } } - Err(i) => self.tag_queue.insert( - i, - ( - Tag { - line_range: line_range(self.source, range.start, MAX_LINE_LEN), - span: name_node.start_position()..name_node.end_position(), - kind, - range, - name_range, - docs, - }, - mat.pattern_index, - ), - ), + Err(i) => self.tag_queue.insert(i, (tag, mat.pattern_index)), } } } @@ -475,11 +469,92 @@ impl From for Error { } } -fn line_range(text: &[u8], index: usize, max_line_len: usize) -> Range { - let start = memrchr(b'\n', &text[0..index]).map_or(0, |i| i + 1); - let max_line_len = max_line_len.min(text.len() - start); - let end = start + memchr(b'\n', &text[start..(start + max_line_len)]).unwrap_or(max_line_len); - start..end +pub struct LossyUtf8<'a> { + bytes: &'a [u8], + in_replacement: bool, +} + +impl<'a> LossyUtf8<'a> { + pub fn new(bytes: &'a [u8]) -> Self { + LossyUtf8 { + bytes, + in_replacement: false, + } + } +} + 
+impl<'a> Iterator for LossyUtf8<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.bytes.is_empty() { + return None; + } + if self.in_replacement { + self.in_replacement = false; + return Some("\u{fffd}"); + } + match str::from_utf8(self.bytes) { + Ok(valid) => { + self.bytes = &[]; + Some(valid) + } + Err(error) => { + if let Some(error_len) = error.error_len() { + let error_start = error.valid_up_to(); + if error_start > 0 { + let result = + unsafe { str::from_utf8_unchecked(&self.bytes[..error_start]) }; + self.bytes = &self.bytes[(error_start + error_len)..]; + self.in_replacement = true; + Some(result) + } else { + self.bytes = &self.bytes[error_len..]; + Some("\u{fffd}") + } + } else { + None + } + } + } + } +} + +fn line_range( + text: &[u8], + start_byte: usize, + start_point: Point, + max_line_len: usize, +) -> Range { + let line_start_byte = start_byte - start_point.column; + let max_line_len = max_line_len.min(text.len() - line_start_byte); + let text_after_line_start = &text[line_start_byte..(line_start_byte + max_line_len)]; + let len = if let Some(len) = memchr(b'\n', text_after_line_start) { + len + } else { + match str::from_utf8(text_after_line_start) { + Ok(s) => s.len(), + Err(e) => e.valid_up_to(), + } + }; + line_start_byte..(line_start_byte + len) +} + +fn get_utf16_column_range( + text: &[u8], + byte_range: &Range, + point_range: &Range, +) -> Range { + let start = byte_range.start - point_range.start.column; + let preceding_text_on_line = &text[start..byte_range.start]; + let start_col = utf16_len(preceding_text_on_line); + start_col..(start_col + utf16_len(&text[byte_range.clone()])) +} + +fn utf16_len(bytes: &[u8]) -> usize { + LossyUtf8::new(bytes) + .flat_map(|chunk| chunk.chars().map(char::len_utf16)) + .sum() } #[cfg(test)] @@ -488,14 +563,10 @@ mod tests { #[test] fn test_get_line() { - let text = b"abc\ndefg\nhijkl"; - assert_eq!(line_range(text, 0, 10), 0..3); - assert_eq!(line_range(text, 1, 10), 
0..3); - assert_eq!(line_range(text, 2, 10), 0..3); - assert_eq!(line_range(text, 3, 10), 0..3); - assert_eq!(line_range(text, 1, 2), 0..2); - assert_eq!(line_range(text, 4, 10), 4..8); - assert_eq!(line_range(text, 5, 10), 4..8); - assert_eq!(line_range(text, 11, 10), 9..14); + let text = "abc\ndefg❤hij\nklmno".as_bytes(); + assert_eq!(line_range(text, 5, Point::new(1, 1), 30), 4..14); + assert_eq!(line_range(text, 5, Point::new(1, 1), 6), 4..8); + assert_eq!(line_range(text, 17, Point::new(2, 2), 30), 15..20); + assert_eq!(line_range(text, 17, Point::new(2, 2), 4), 15..19); } } From e9ea8192a3428a9a204167c27e7d0a76cbd4efd8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 09:11:34 -0700 Subject: [PATCH 29/71] Mention node version >= 6 in docs Fixes #677 --- docs/section-3-creating-parsers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index c877ba6f..b075e488 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -13,7 +13,7 @@ Developing Tree-sitter grammars can have a difficult learning curve, but once yo In order to develop a Tree-sitter parser, there are two dependencies that you need to install: -* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have. +* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. You'll need Node.js version 6.0 or greater. * **A C Compiler** - Tree-sitter creates parsers that are written in C. 
In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform. ### Installation From b52f28d6d5d740a85e539cde221b6742106f488f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 11:28:07 -0700 Subject: [PATCH 30/71] Allow measuring time for tags subcommand --- cli/src/main.rs | 20 ++++++++++++--- cli/src/tags.rs | 67 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 757c70eb..d7a5e7b1 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -97,6 +97,8 @@ fn run() -> error::Result<()> { .value_name("json|protobuf") .help("Determine output format (default: json)"), ) + .arg(Arg::with_name("quiet").long("quiet").short("q")) + .arg(Arg::with_name("time").long("quiet").short("t")) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg( Arg::with_name("inputs") @@ -149,8 +151,14 @@ fn run() -> error::Result<()> { .arg(Arg::with_name("path").index(1).multiple(true)), ) .subcommand( - SubCommand::with_name("web-ui").about("Test a parser interactively in the browser") - .arg(Arg::with_name("quiet").long("quiet").short("q").help("open in default browser")), + SubCommand::with_name("web-ui") + .about("Test a parser interactively in the browser") + .arg( + Arg::with_name("quiet") + .long("quiet") + .short("q") + .help("open in default browser"), + ), ) .subcommand( SubCommand::with_name("dump-languages") @@ -268,7 +276,13 @@ fn run() -> error::Result<()> { } else if let Some(matches) = matches.subcommand_matches("tags") { loader.find_all_languages(&config.parser_directories)?; let paths = collect_paths(matches.values_of("inputs").unwrap())?; - tags::generate_tags(&loader, matches.value_of("scope"), &paths)?; + tags::generate_tags( + &loader, + matches.value_of("scope"), + &paths, + 
matches.is_present("quiet"), + matches.is_present("time"), + )?; } else if let Some(matches) = matches.subcommand_matches("highlight") { loader.configure_highlights(&config.theme.highlight_names); loader.find_all_languages(&config.parser_directories)?; diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 515f4c52..5ea00f39 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -3,10 +3,17 @@ use super::util; use crate::error::{Error, Result}; use std::io::{self, Write}; use std::path::Path; +use std::time::Instant; use std::{fs, str}; use tree_sitter_tags::TagsContext; -pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> Result<()> { +pub fn generate_tags( + loader: &Loader, + scope: Option<&str>, + paths: &[String], + quiet: bool, + time: bool, +) -> Result<()> { let mut lang = None; if let Some(scope) = scope { lang = loader.language_configuration_for_scope(scope)?; @@ -34,36 +41,50 @@ pub fn generate_tags(loader: &Loader, scope: Option<&str>, paths: &[String]) -> }; if let Some(tags_config) = language_config.tags_config(language)? { - let ident = if paths.len() > 1 { - let path_str = format!("{:?}", path); - writeln!(&mut stdout, "{}", &path_str[1..path_str.len() - 1])?; - "\t" + let indent; + if paths.len() > 1 { + if !quiet { + writeln!(&mut stdout, "{}", path.to_string_lossy())?; + } + indent = "\t" } else { - "" + indent = ""; }; let source = fs::read(path)?; + let t0 = Instant::now(); for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? 
{ let tag = tag?; - write!( - &mut stdout, - "{}{:<10}\t | {:<8}\t{} {} - {} `{}`", - ident, - str::from_utf8(&source[tag.name_range]).unwrap_or(""), - &tags_config.syntax_type_name(tag.syntax_type_id), - if tag.is_definition { "def" } else { "ref" }, - tag.span.start, - tag.span.end, - str::from_utf8(&source[tag.line_range]).unwrap_or(""), - )?; - if let Some(docs) = tag.docs { - if docs.len() > 120 { - write!(&mut stdout, "\t{:?}...", &docs[0..120])?; - } else { - write!(&mut stdout, "\t{:?}", &docs)?; + if !quiet { + write!( + &mut stdout, + "{}{:<10}\t | {:<8}\t{} {} - {} `{}`", + indent, + str::from_utf8(&source[tag.name_range]).unwrap_or(""), + &tags_config.syntax_type_name(tag.syntax_type_id), + if tag.is_definition { "def" } else { "ref" }, + tag.span.start, + tag.span.end, + str::from_utf8(&source[tag.line_range]).unwrap_or(""), + )?; + if let Some(docs) = tag.docs { + if docs.len() > 120 { + write!(&mut stdout, "\t{:?}...", &docs[0..120])?; + } else { + write!(&mut stdout, "\t{:?}", &docs)?; + } } + writeln!(&mut stdout, "")?; } - writeln!(&mut stdout, "")?; + } + + if time { + writeln!( + &mut stdout, + "{}time: {}ms", + indent, + t0.elapsed().as_millis(), + )?; } } else { eprintln!("No tags config found for path {:?}", path); From 1ecfc2548f1dfe0aa2ec34fb174555a27f37dde0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 11:30:30 -0700 Subject: [PATCH 31/71] tags: Move impls below type definitions --- tags/src/lib.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 7d58d99b..790b866a 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -63,15 +63,6 @@ pub enum Error { InvalidCapture(String), } -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Error::InvalidCapture(name) => write!(f, "Invalid capture @{}. 
Expected one of: @definition.*, @reference.*, @doc, @name, @local.(scope|definition|reference).", name), - _ => write!(f, "{:?}", self) - } - } -} - #[derive(Debug, Default)] struct PatternInfo { docs_adjacent_capture: Option, @@ -475,6 +466,15 @@ where } } +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::InvalidCapture(name) => write!(f, "Invalid capture @{}. Expected one of: @definition.*, @reference.*, @doc, @name, @local.(scope|definition|reference).", name), + _ => write!(f, "{:?}", self) + } + } +} + impl From for Error { fn from(error: regex::Error) -> Self { Error::Regex(error) From 52360b103d0b293c54e83a188d7f2f1b9a7dc5d8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 12:07:57 -0700 Subject: [PATCH 32/71] tags: Fix comment position --- tags/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 790b866a..41b4557a 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -423,8 +423,6 @@ where } } - // Only create one tag per node. The tag queue is sorted by node position - // to allow for fast lookup. let range = tag_node.byte_range(); let span = name_node.start_position()..name_node.end_position(); let utf16_column_range = @@ -441,6 +439,9 @@ where is_definition, syntax_type_id, }; + + // Only create one tag per node. The tag queue is sorted by node position + // to allow for fast lookup. 
match self.tag_queue.binary_search_by_key( &(tag.name_range.end, tag.name_range.start), |(tag, _)| (tag.name_range.end, tag.name_range.start), From 0f805603104cab4d59c9f02154720fd000b22305 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 12:13:12 -0700 Subject: [PATCH 33/71] tags: Reuse work when computing utf16 columns, line ranges --- tags/src/lib.rs | 64 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 41b4557a..ca5699ca 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -91,6 +91,7 @@ where matches: I, _tree: Tree, source: &'a [u8], + prev_line_info: Option, config: &'a TagsConfiguration, cancellation_flag: Option<&'a AtomicUsize>, iter_count: usize, @@ -98,6 +99,13 @@ where scopes: Vec>, } +struct LineInfo { + utf8_position: Point, + utf8_byte: usize, + utf16_column: usize, + line_range: Range, +} + impl TagsConfiguration { pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result { let query = Query::new(language, &format!("{}{}", locals_query, tags_query))?; @@ -260,6 +268,7 @@ impl TagsContext { source, config, cancellation_flag, + prev_line_info: None, tag_queue: Vec::new(), iter_count: 0, scopes: vec![LocalScope { @@ -425,10 +434,46 @@ where let range = tag_node.byte_range(); let span = name_node.start_position()..name_node.end_position(); - let utf16_column_range = - get_utf16_column_range(self.source, &name_range, &span); - let line_range = - line_range(self.source, name_range.start, span.start, MAX_LINE_LEN); + + // Compute tag properties that depend on the text of the containing line. If the + // previous tag occurred on the same line, then reuse results from the previous tag. 
+ let line_range; + let mut prev_utf16_column = 0; + let mut prev_utf8_byte = name_range.start - span.start.column; + let line_info = self.prev_line_info.as_ref().and_then(|info| { + if info.utf8_position.row == span.start.row { + Some(info) + } else { + None + } + }); + if let Some(line_info) = line_info { + line_range = line_info.line_range.clone(); + if line_info.utf8_position.column <= span.start.column { + prev_utf8_byte = line_info.utf8_byte; + prev_utf16_column = line_info.utf16_column; + } + } else { + line_range = self::line_range( + self.source, + name_range.start, + span.start, + MAX_LINE_LEN, + ); + } + + let utf16_start_column = prev_utf16_column + + utf16_len(&self.source[prev_utf8_byte..name_range.start]); + let utf16_end_column = + utf16_start_column + utf16_len(&self.source[name_range.clone()]); + let utf16_column_range = utf16_start_column..utf16_end_column; + + self.prev_line_info = Some(LineInfo { + utf8_position: span.end, + utf8_byte: name_range.end, + utf16_column: utf16_end_column, + line_range: line_range.clone(), + }); let tag = Tag { line_range, span, @@ -570,17 +615,6 @@ fn line_range( line_start_byte..line_end_byte } -fn get_utf16_column_range( - text: &[u8], - byte_range: &Range, - point_range: &Range, -) -> Range { - let line_start_byte = byte_range.start - point_range.start.column; - let preceding_text_on_line = &text[line_start_byte..byte_range.start]; - let start_col = utf16_len(preceding_text_on_line); - start_col..(start_col + utf16_len(&text[byte_range.clone()])) -} - fn utf16_len(bytes: &[u8]) -> usize { LossyUtf8::new(bytes) .flat_map(|chunk| chunk.chars().map(char::len_utf16)) From 9e38fd9f5c32b58919c1cb422f06c8021da98207 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 12:32:40 -0700 Subject: [PATCH 34/71] Add todo comment for LossyUtf8 iterator --- tags/src/lib.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index ca5699ca..dcbb9984 
100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -106,6 +106,11 @@ struct LineInfo { line_range: Range, } +struct LossyUtf8<'a> { + bytes: &'a [u8], + in_replacement: bool, +} + impl TagsConfiguration { pub fn new(language: Language, tags_query: &str, locals_query: &str) -> Result { let query = Query::new(language, &format!("{}{}", locals_query, tags_query))?; @@ -533,13 +538,11 @@ impl From for Error { } } -pub struct LossyUtf8<'a> { - bytes: &'a [u8], - in_replacement: bool, -} - +// TODO: Remove this struct at at some point. If `core::str::lossy::Utf8Lossy` +// is ever stabilized, we should use that. Otherwise, this struct could be moved +// into some module that's shared between `tree-sitter-tags` and `tree-sitter-highlight`. impl<'a> LossyUtf8<'a> { - pub fn new(bytes: &'a [u8]) -> Self { + fn new(bytes: &'a [u8]) -> Self { LossyUtf8 { bytes, in_replacement: false, From 6cee04350f909c6611258ccaee06446e08218f0c Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 9 Jul 2020 13:39:47 -0700 Subject: [PATCH 35/71] tags: Expose utf16 column range to C API --- tags/include/tree_sitter/tags.h | 2 ++ tags/src/c_lib.rs | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 58f5bbd9..f2b17075 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -28,6 +28,8 @@ typedef struct { uint32_t line_end_byte; TSPoint start_point; TSPoint end_point; + uint32_t utf16_start_column; + uint32_t utf16_end_column; uint32_t docs_start_byte; uint32_t docs_end_byte; uint32_t syntax_type_id; diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 77f8aae5..07e1e19a 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -36,6 +36,8 @@ pub struct TSTag { pub line_end_byte: u32, pub start_point: TSPoint, pub end_point: TSPoint, + pub utf16_start_colum: u32, + pub utf16_end_colum: u32, pub docs_start_byte: u32, pub docs_end_byte: u32, pub syntax_type_id: u32, @@ 
-161,6 +163,8 @@ pub extern "C" fn ts_tagger_tag( row: tag.span.end.row as u32, column: tag.span.end.column as u32, }, + utf16_start_colum: tag.utf16_column_range.start as u32, + utf16_end_colum: tag.utf16_column_range.end as u32, docs_start_byte: prev_docs_len as u32, docs_end_byte: buffer.docs.len() as u32, syntax_type_id: tag.syntax_type_id, @@ -225,7 +229,7 @@ pub extern "C" fn ts_tagger_syntax_kinds_for_scope_name( *len = 0; if let Some(config) = tagger.languages.get(scope_name) { *len = config.c_syntax_type_names.len() as u32; - return config.c_syntax_type_names.as_ptr() as *const *const i8 + return config.c_syntax_type_names.as_ptr() as *const *const i8; } std::ptr::null() } From 0bfd47e2e5631af43ddf30abdac2043051bbe8af Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 10 Jul 2020 10:12:46 -0700 Subject: [PATCH 36/71] Improve error message when failing to run graphviz stuff Fixes #682 --- cli/src/util.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cli/src/util.rs b/cli/src/util.rs index 8978ecc1..9f941f62 100644 --- a/cli/src/util.rs +++ b/cli/src/util.rs @@ -1,3 +1,4 @@ +use super::error::{Error, Result}; use std::io; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -31,12 +32,12 @@ pub struct LogSession(); pub struct LogSession(PathBuf, Option, Option); #[cfg(windows)] -pub fn log_graphs(_parser: &mut Parser, _path: &str) -> std::io::Result { +pub fn log_graphs(_parser: &mut Parser, _path: &str) -> Result { Ok(LogSession()) } #[cfg(unix)] -pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result { +pub fn log_graphs(parser: &mut Parser, path: &str) -> Result { use std::io::Write; let mut dot_file = std::fs::File::create(path)?; @@ -46,11 +47,13 @@ pub fn log_graphs(parser: &mut Parser, path: &str) -> std::io::Result Date: Fri, 10 Jul 2020 13:33:04 -0700 Subject: [PATCH 37/71] highlight: Avoid accidentally treating locals patterns as highlight patterns --- highlight/src/lib.rs | 163 
++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 88 deletions(-) diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index d2e27b46..bb110219 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -620,7 +620,7 @@ where type Item = Result; fn next(&mut self) -> Option { - loop { + 'main: loop { // If we've already determined the next highlight boundary, just return it. if let Some(e) = self.next_event.take() { return Some(Ok(e)); @@ -640,29 +640,34 @@ where // If none of the layers have any more highlight boundaries, terminate. if self.layers.is_empty() { - if self.byte_offset < self.source.len() { + return if self.byte_offset < self.source.len() { let result = Some(Ok(HighlightEvent::Source { start: self.byte_offset, end: self.source.len(), })); self.byte_offset = self.source.len(); - return result; + result } else { - return None; - } + None + }; } // Get the next capture from whichever layer has the earliest highlight boundary. - let match_; - let mut captures; - let mut capture; - let mut pattern_index; + let range; let layer = &mut self.layers[0]; - if let Some((m, capture_index)) = layer.captures.peek() { - match_ = m; - captures = match_.captures; - pattern_index = match_.pattern_index; - capture = captures[*capture_index]; + if let Some((next_match, capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*capture_index]; + range = next_capture.node.byte_range(); + + // If any previous highlight ends before this node starts, then before + // processing this capture, emit the source code up until the end of the + // previous highlight, and an end event for that highlight. + if let Some(end_byte) = layer.highlight_end_stack.last().cloned() { + if end_byte <= range.start { + layer.highlight_end_stack.pop(); + return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd)); + } + } } // If there are no more captures, then emit any remaining highlight end events. 
// And if there are none of those, then just advance to the end of the document. @@ -673,30 +678,17 @@ where return self.emit_event(self.source.len(), None); }; - // If any previous highlight ends before this node starts, then before - // processing this capture, emit the source code up until the end of the - // previous highlight, and an end event for that highlight. - let range = capture.node.byte_range(); - if let Some(end_byte) = layer.highlight_end_stack.last().cloned() { - if end_byte <= range.start { - layer.highlight_end_stack.pop(); - return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd)); - } - } - - // Remove from the local scope stack any local scopes that have already ended. - while range.start > layer.scope_stack.last().unwrap().range.end { - layer.scope_stack.pop(); - } + let (mut match_, capture_index) = layer.captures.next().unwrap(); + let mut capture = match_.captures[capture_index]; // If this capture represents an injection, then process the injection. - if pattern_index < layer.config.locals_pattern_index { + if match_.pattern_index < layer.config.locals_pattern_index { let (language_name, content_node, include_children) = - injection_for_match(&layer.config, &layer.config.query, match_, &self.source); + injection_for_match(&layer.config, &layer.config.query, &match_, &self.source); // Explicitly remove this match so that none of its other captures will remain - // in the stream of captures. The `unwrap` is ok because - layer.captures.next().unwrap().0.remove(); + // in the stream of captures. + match_.remove(); // If a language is found with the given name, then add a new language layer // to the highlighted document. @@ -729,16 +721,19 @@ where } self.sort_layers(); - continue; + continue 'main; } - layer.captures.next(); + // Remove from the local scope stack any local scopes that have already ended. 
+ while range.start > layer.scope_stack.last().unwrap().range.end { + layer.scope_stack.pop(); + } // If this capture is for tracking local variables, then process the // local variable info. let mut reference_highlight = None; let mut definition_highlight = None; - while pattern_index < layer.config.highlights_pattern_index { + while match_.pattern_index < layer.config.highlights_pattern_index { // If the node represents a local scope, push a new local scope onto // the scope stack. if Some(capture.index) == layer.config.local_scope_capture_index { @@ -748,7 +743,7 @@ where range: range.clone(), local_defs: Vec::new(), }; - for prop in layer.config.query.property_settings(pattern_index) { + for prop in layer.config.query.property_settings(match_.pattern_index) { match prop.key.as_ref() { "local.scope-inherits" => { scope.inherits = @@ -767,7 +762,7 @@ where let scope = layer.scope_stack.last_mut().unwrap(); let mut value_range = 0..0; - for capture in captures { + for capture in match_.captures { if Some(capture.index) == layer.config.local_def_value_capture_index { value_range = capture.node.byte_range(); } @@ -810,84 +805,76 @@ where } } - // Continue processing any additional local-variable-tracking patterns - // for the same node. + // Continue processing any additional matches for the same node. if let Some((next_match, next_capture_index)) = layer.captures.peek() { let next_capture = next_match.captures[*next_capture_index]; if next_capture.node == capture.node { - pattern_index = next_match.pattern_index; - captures = next_match.captures; capture = next_capture; - layer.captures.next(); + match_ = layer.captures.next().unwrap().0; continue; - } else { - break; } } - break; + self.sort_layers(); + continue 'main; } // Otherwise, this capture must represent a highlight. - let mut has_highlight = true; - // If this exact range has already been highlighted by an earlier pattern, or by // a different layer, then skip over this one. 
if let Some((last_start, last_end, last_depth)) = self.last_highlight_range { if range.start == last_start && range.end == last_end && layer.depth < last_depth { - has_highlight = false; + self.sort_layers(); + continue 'main; } } // If the current node was found to be a local variable, then skip over any // highlighting patterns that are disabled for local variables. - while has_highlight - && (definition_highlight.is_some() || reference_highlight.is_some()) - && layer.config.non_local_variable_patterns[pattern_index] - { - has_highlight = false; - if let Some((next_match, next_capture_index)) = layer.captures.peek() { - let next_capture = next_match.captures[*next_capture_index]; - if next_capture.node == capture.node { - capture = next_capture; - has_highlight = true; - pattern_index = next_match.pattern_index; - layer.captures.next(); - continue; + if definition_highlight.is_some() || reference_highlight.is_some() { + while layer.config.non_local_variable_patterns[match_.pattern_index] { + if let Some((next_match, next_capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*next_capture_index]; + if next_capture.node == capture.node { + capture = next_capture; + match_ = layer.captures.next().unwrap().0; + continue; + } } + + self.sort_layers(); + continue 'main; } - break; } - if has_highlight { - // Once a highlighting pattern is found for the current node, skip over - // any later highlighting patterns that also match this node. Captures - // for a given node are ordered by pattern index, so these subsequent - // captures are guaranteed to be for highlighting, not injections or - // local variables. - while let Some((next_match, next_capture_index)) = layer.captures.peek() { - if next_match.captures[*next_capture_index].node == capture.node { - layer.captures.next(); - } else { - break; - } + // Once a highlighting pattern is found for the current node, skip over + // any later highlighting patterns that also match this node. 
Captures + // for a given node are ordered by pattern index, so these subsequent + // captures are guaranteed to be for highlighting, not injections or + // local variables. + while let Some((next_match, next_capture_index)) = layer.captures.peek() { + let next_capture = next_match.captures[*next_capture_index]; + if next_capture.node == capture.node { + layer.captures.next(); + } else { + break; } + } - let current_highlight = layer.config.highlight_indices[capture.index as usize]; + let current_highlight = layer.config.highlight_indices[capture.index as usize]; - // If this node represents a local definition, then store the current - // highlight value on the local scope entry representing this node. - if let Some(definition_highlight) = definition_highlight { - *definition_highlight = current_highlight; - } + // If this node represents a local definition, then store the current + // highlight value on the local scope entry representing this node. + if let Some(definition_highlight) = definition_highlight { + *definition_highlight = current_highlight; + } - // Emit a scope start event and push the node's end position to the stack. - if let Some(highlight) = reference_highlight.or(current_highlight) { - self.last_highlight_range = Some((range.start, range.end, layer.depth)); - layer.highlight_end_stack.push(range.end); - return self - .emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight))); - } + // Emit a scope start event and push the node's end position to the stack. 
+ if let Some(highlight) = reference_highlight.or(current_highlight) { + self.last_highlight_range = Some((range.start, range.end, layer.depth)); + layer.highlight_end_stack.push(range.end); + return self + .emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight))); } self.sort_layers(); From e4e785b567eb975c5fa6900b08728aac856bdaad Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 10 Jul 2020 13:47:56 -0700 Subject: [PATCH 38/71] Remove unused flags from tags CLI command --- cli/src/main.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index d7a5e7b1..713bf28f 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -90,13 +90,6 @@ fn run() -> error::Result<()> { ) .subcommand( SubCommand::with_name("tags") - .arg( - Arg::with_name("format") - .short("f") - .long("format") - .value_name("json|protobuf") - .help("Determine output format (default: json)"), - ) .arg(Arg::with_name("quiet").long("quiet").short("q")) .arg(Arg::with_name("time").long("quiet").short("t")) .arg(Arg::with_name("scope").long("scope").takes_value(true)) @@ -106,12 +99,6 @@ fn run() -> error::Result<()> { .index(1) .required(true) .multiple(true), - ) - .arg( - Arg::with_name("v") - .short("v") - .multiple(true) - .help("Sets the level of verbosity"), ), ) .subcommand( From c2fb0f5229b1bb72005da5177457fafb1560954a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tu=E1=BA=A5n-Anh=20Nguy=E1=BB=85n?= Date: Sun, 12 Jul 2020 20:45:17 +0700 Subject: [PATCH 39/71] cli: Add --byte-range flag to query command --- cli/src/main.rs | 12 +++++++++++- cli/src/query.rs | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 713bf28f..a543202d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -85,6 +85,12 @@ fn run() -> error::Result<()> { .multiple(true) .required(true), ) + .arg( + Arg::with_name("beg>: error::Result<()> { matches.value_of("scope"), )?; let query_path = 
Path::new(matches.value_of("query-path").unwrap()); - query::query_files_at_paths(language, paths, query_path, ordered_captures)?; + let range = matches.value_of("beg>: = br.split(":").collect(); + (r[0].parse().unwrap(), r[1].parse().unwrap()) + }); + query::query_files_at_paths(language, paths, query_path, ordered_captures, range)?; } else if let Some(matches) = matches.subcommand_matches("tags") { loader.find_all_languages(&config.parser_directories)?; let paths = collect_paths(matches.values_of("inputs").unwrap())?; diff --git a/cli/src/query.rs b/cli/src/query.rs index 47242273..8d097911 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -9,6 +9,7 @@ pub fn query_files_at_paths( paths: Vec<&Path>, query_path: &Path, ordered_captures: bool, + range: Option<(usize, usize)>, ) -> Result<()> { let stdout = io::stdout(); let mut stdout = stdout.lock(); @@ -20,6 +21,9 @@ pub fn query_files_at_paths( .map_err(|e| Error::new(format!("Query compilation failed: {:?}", e)))?; let mut query_cursor = QueryCursor::new(); + if let Some((beg, end)) = range { + query_cursor.set_byte_range(beg, end); + } let mut parser = Parser::new(); parser.set_language(language).map_err(|e| e.to_string())?; From 91a715799e1b468c8303c7c612416c04f5a9c5fb Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 14 Jul 2020 15:04:39 -0700 Subject: [PATCH 40/71] Accept a paths file to most CLI subcommands --- cli/src/main.rs | 111 +++++++++++++++++++++++++++-------------------- cli/src/query.rs | 6 +-- 2 files changed, 68 insertions(+), 49 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index a543202d..0668d08d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -53,11 +53,12 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("parse") .about("Parse files") + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(1) .multiple(true) - .required(true), + .required(false), ) 
.arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("debug").long("debug").short("d")) @@ -79,17 +80,18 @@ fn run() -> error::Result<()> { SubCommand::with_name("query") .about("Search files using a syntax tree query") .arg(Arg::with_name("query-path").index(1).required(true)) + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(2) .multiple(true) - .required(true), + .required(false), ) .arg( - Arg::with_name("beg>: error::Result<()> { .arg(Arg::with_name("quiet").long("quiet").short("q")) .arg(Arg::with_name("time").long("quiet").short("t")) .arg(Arg::with_name("scope").long("scope").takes_value(true)) + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("inputs") + Arg::with_name("paths") .help("The source file to use") .index(1) - .required(true) .multiple(true), ), ) @@ -122,11 +124,12 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("highlight") .about("Highlight a file") + .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("path") + Arg::with_name("paths") .index(1) .multiple(true) - .required(true), + .required(false), ) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("html").long("html").short("h")) @@ -225,7 +228,9 @@ fn run() -> error::Result<()> { let timeout = matches .value_of("timeout") .map_or(0, |t| u64::from_str_radix(t, 10).unwrap()); - let paths = collect_paths(matches.values_of("path").unwrap())?; + + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; + let max_path_length = paths.iter().map(|p| p.chars().count()).max().unwrap(); let mut has_error = false; loader.find_all_languages(&config.parser_directories)?; @@ -251,28 +256,23 @@ fn run() -> error::Result<()> { } } else if let Some(matches) = matches.subcommand_matches("query") { let ordered_captures = 
matches.values_of("captures").is_some(); - let paths = matches - .values_of("path") - .unwrap() - .into_iter() - .map(Path::new) - .collect::>(); + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; loader.find_all_languages(&config.parser_directories)?; let language = select_language( &mut loader, - paths[0], + Path::new(&paths[0]), ¤t_dir, matches.value_of("scope"), )?; let query_path = Path::new(matches.value_of("query-path").unwrap()); - let range = matches.value_of("beg>: = br.split(":").collect(); (r[0].parse().unwrap(), r[1].parse().unwrap()) }); query::query_files_at_paths(language, paths, query_path, ordered_captures, range)?; } else if let Some(matches) = matches.subcommand_matches("tags") { loader.find_all_languages(&config.parser_directories)?; - let paths = collect_paths(matches.values_of("inputs").unwrap())?; + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; tags::generate_tags( &loader, matches.value_of("scope"), @@ -285,7 +285,7 @@ fn run() -> error::Result<()> { loader.find_all_languages(&config.parser_directories)?; let time = matches.is_present("time"); - let paths = collect_paths(matches.values_of("path").unwrap())?; + let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?; let html_mode = matches.is_present("html"); if html_mode { println!("{}", highlight::HTML_HEADER); @@ -358,39 +358,58 @@ fn run() -> error::Result<()> { Ok(()) } -fn collect_paths<'a>(paths: impl Iterator) -> error::Result> { - let mut result = Vec::new(); +fn collect_paths<'a>( + paths_file: Option<&str>, + paths: Option>, +) -> error::Result> { + if let Some(paths_file) = paths_file { + return Ok(fs::read_to_string(paths_file) + .map_err(Error::wrap(|| { + format!("Failed to read paths file {}", paths_file) + }))? 
+ .trim() + .split_ascii_whitespace() + .map(String::from) + .collect::>()); + } - let mut incorporate_path = |path: &str, positive| { - if positive { - result.push(path.to_string()); - } else { - if let Some(index) = result.iter().position(|p| p == path) { - result.remove(index); + if let Some(paths) = paths { + let mut result = Vec::new(); + + let mut incorporate_path = |path: &str, positive| { + if positive { + result.push(path.to_string()); + } else { + if let Some(index) = result.iter().position(|p| p == path) { + result.remove(index); + } } - } - }; + }; - for mut path in paths { - let mut positive = true; - if path.starts_with("!") { - positive = false; - path = path.trim_start_matches("!"); - } + for mut path in paths { + let mut positive = true; + if path.starts_with("!") { + positive = false; + path = path.trim_start_matches("!"); + } - if Path::new(path).exists() { - incorporate_path(path, positive); - } else { - let paths = - glob(path).map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?; - for path in paths { - if let Some(path) = path?.to_str() { - incorporate_path(path, positive); + if Path::new(path).exists() { + incorporate_path(path, positive); + } else { + let paths = glob(path) + .map_err(Error::wrap(|| format!("Invalid glob pattern {:?}", path)))?; + for path in paths { + if let Some(path) = path?.to_str() { + incorporate_path(path, positive); + } } } } + + return Ok(result); } - Ok(result) + + Err(Error::new("Must provide one or more paths".to_string())) } fn select_language( diff --git a/cli/src/query.rs b/cli/src/query.rs index 8d097911..e71e6254 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -6,7 +6,7 @@ use tree_sitter::{Language, Node, Parser, Query, QueryCursor}; pub fn query_files_at_paths( language: Language, - paths: Vec<&Path>, + paths: Vec, query_path: &Path, ordered_captures: bool, range: Option<(usize, usize)>, @@ -29,9 +29,9 @@ pub fn query_files_at_paths( parser.set_language(language).map_err(|e| 
e.to_string())?; for path in paths { - writeln!(&mut stdout, "{}", path.to_str().unwrap())?; + writeln!(&mut stdout, "{}", path)?; - let source_code = fs::read(path).map_err(Error::wrap(|| { + let source_code = fs::read(&path).map_err(Error::wrap(|| { format!("Error reading source file {:?}", path) }))?; let text_callback = |n: Node| &source_code[n.byte_range()]; From 4535efce69016d28360618f9fc13e4ad4401b545 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Jul 2020 09:39:06 -0700 Subject: [PATCH 41/71] query: Prevent dropping of matches when exceeding range maximum Fixes #685 --- cli/src/tests/query_test.rs | 39 +++++++++++++++++++++++++++++++++ lib/src/query.c | 43 ++++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index d4f18c7d..06ecc42e 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1189,6 +1189,45 @@ fn test_query_matches_within_byte_range() { }); } +#[test] +fn test_query_captures_within_byte_range() { + allocations::record(|| { + let language = get_language("c"); + let query = Query::new( + language, + " + (call_expression + function: (identifier) @function + arguments: (argument_list (string_literal) @string.arg)) + + (string_literal) @string + ", + ) + .unwrap(); + + let source = r#"DEFUN ("safe-length", Fsafe_length, Ssafe_length, 1, 1, 0)"#; + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + + let mut cursor = QueryCursor::new(); + let captures = + cursor + .set_byte_range(3, 27) + .captures(&query, tree.root_node(), to_callback(source)); + + assert_eq!( + collect_captures(captures, &query, source), + &[ + ("function", "DEFUN"), + ("string.arg", "\"safe-length\""), + ("string", "\"safe-length\""), + ] + ); + }); +} + #[test] fn test_query_matches_different_queries_same_cursor() { allocations::record(|| { diff --git 
a/lib/src/query.c b/lib/src/query.c index ff243494..b95ba057 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -172,6 +172,7 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; bool ascending; + bool halted; }; static const TSQueryError PARENT_DONE = -1; @@ -1286,6 +1287,7 @@ TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .ascending = false, + .halted = false, .states = array_new(), .finished_states = array_new(), .capture_list_pool = capture_list_pool_new(), @@ -1319,6 +1321,7 @@ void ts_query_cursor_exec( self->next_state_id = 0; self->depth = 0; self->ascending = false; + self->halted = false; self->query = query; } @@ -1522,18 +1525,30 @@ static QueryState *ts_query__cursor_copy_state( // `finished_states` array. Multiple patterns can finish on the same node. If // there are no more matches, return `false`. static inline bool ts_query_cursor__advance(TSQueryCursor *self) { - do { + bool did_match = false; + for (;;) { + if (self->halted) { + while (self->states.size > 0) { + QueryState state = array_pop(&self->states); + capture_list_pool_release( + &self->capture_list_pool, + state.capture_list_id + ); + } + } + + if (did_match || self->halted) return did_match; + if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); // Leave this node by stepping to its next sibling or to its parent. - bool did_move = true; if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { self->ascending = false; } else if (ts_tree_cursor_goto_parent(&self->cursor)) { self->depth--; } else { - did_move = false; + self->halted = true; } // After leaving a node, remove any states that cannot make further progress. 
@@ -1545,10 +1560,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If a state completed its pattern inside of this node, but was deferred from finishing // in order to search for longer matches, mark it as finished. if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || !did_move) { + if (state->start_depth > self->depth || self->halted) { LOG(" finish pattern %u\n", state->pattern_index); state->id = self->next_state_id++; array_push(&self->finished_states, *state); + did_match = true; deleted_count++; continue; } @@ -1575,10 +1591,6 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } self->states.size -= deleted_count; - - if (!did_move) { - return self->finished_states.size > 0; - } } else { // If this node is before the selected range, then avoid descending into it. TSNode node = ts_tree_cursor_current_node(&self->cursor); @@ -1596,7 +1608,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if ( self->end_byte <= ts_node_start_byte(node) || point_lte(self->end_point, ts_node_start_point(node)) - ) return false; + ) { + self->halted = true; + continue; + } // Get the properties of the current node. TSSymbol symbol = ts_node_symbol(node); @@ -1888,6 +1903,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { state->id = self->next_state_id++; array_push(&self->finished_states, *state); array_erase(&self->states, state - self->states.contents); + did_match = true; i--; } } @@ -1901,9 +1917,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { self->ascending = true; } } - } while (self->finished_states.size == 0); - - return true; + } } bool ts_query_cursor_next_match( @@ -2043,7 +2057,10 @@ bool ts_query_cursor_next_capture( // If there are no finished matches that are ready to be returned, then // continue finding more matches. 
- if (!ts_query_cursor__advance(self)) return false; + if ( + !ts_query_cursor__advance(self) && + self->finished_states.size == 0 + ) return false; } } From f4adf0269af810e410c40a663c561511fb8c0467 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Jul 2020 09:53:01 -0700 Subject: [PATCH 42/71] Propagate dynamic precedence correctly for inlined rules Fixes #683 --- cli/src/generate/prepare_grammar/process_inlines.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cli/src/generate/prepare_grammar/process_inlines.rs b/cli/src/generate/prepare_grammar/process_inlines.rs index 9ef89d75..f83658b2 100644 --- a/cli/src/generate/prepare_grammar/process_inlines.rs +++ b/cli/src/generate/prepare_grammar/process_inlines.rs @@ -127,6 +127,9 @@ impl InlinedProductionMapBuilder { last_inserted_step.associativity = removed_step.associativity; } } + if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() { + production.dynamic_precedence = p.dynamic_precedence; + } production }), ); @@ -226,7 +229,7 @@ mod tests { ], }, Production { - dynamic_precedence: 0, + dynamic_precedence: -2, steps: vec![ProductionStep::new(Symbol::terminal(14))], }, ], @@ -258,7 +261,7 @@ mod tests { ], }, Production { - dynamic_precedence: 0, + dynamic_precedence: -2, steps: vec![ ProductionStep::new(Symbol::terminal(10)), ProductionStep::new(Symbol::terminal(14)), From c4fca5f73e194988dbb2790aa37f93fffaa284f5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Jul 2020 14:19:59 -0700 Subject: [PATCH 43/71] node types: Fix handling of repetitions inside of fields Fixes #676 --- cli/src/generate/node_types.rs | 239 ++++++++++++++++++++------------- 1 file changed, 149 insertions(+), 90 deletions(-) diff --git a/cli/src/generate/node_types.rs b/cli/src/generate/node_types.rs index 9c3bea64..6df40807 100644 --- a/cli/src/generate/node_types.rs +++ b/cli/src/generate/node_types.rs @@ -19,7 +19,7 @@ pub(crate) struct FieldInfo { #[derive(Clone, Debug, 
Default, PartialEq, Eq)] pub(crate) struct VariableInfo { pub fields: HashMap, - pub child_types: Vec, + pub children: FieldInfo, pub children_without_fields: FieldInfo, pub has_multi_step_production: bool, } @@ -70,7 +70,7 @@ impl Default for FieldInfoJSON { impl Default for ChildQuantity { fn default() -> Self { - Self::zero() + Self::one() } } @@ -158,7 +158,7 @@ pub(crate) fn get_variable_info( // Each variable's summary can depend on the summaries of other hidden variables, // and variables can have mutually recursive structure. So we compute the summaries - // iteratively, in a loop that terminates only when more changes are possible. + // iteratively, in a loop that terminates only when no more changes are possible. let mut did_change = true; let mut all_initialized = false; let mut result = vec![VariableInfo::default(); syntax_grammar.variables.len()]; @@ -168,13 +168,14 @@ pub(crate) fn get_variable_info( for (i, variable) in syntax_grammar.variables.iter().enumerate() { let mut variable_info = result[i].clone(); - // Within a variable, consider each production separately. For each - // production, determine which children and fields can occur, and how many - // times they can occur. - for (production_index, production) in variable.productions.iter().enumerate() { - let mut field_quantities = HashMap::new(); - let mut children_without_fields_quantity = ChildQuantity::zero(); - let mut has_uninitialized_invisible_children = false; + // Examine each of the variable's productions. The variable's child types can be + // immediately combined across all productions, but the child quantities must be + // recorded separately for each production. 
+ for production in &variable.productions { + let mut production_field_quantities = HashMap::new(); + let mut production_children_quantity = ChildQuantity::zero(); + let mut production_children_without_fields_quantity = ChildQuantity::zero(); + let mut production_has_uninitialized_invisible_children = false; if production.steps.len() > 1 { variable_info.has_multi_step_production = true; @@ -190,106 +191,92 @@ pub(crate) fn get_variable_info( ChildType::Normal(child_symbol) }; - // Record all of the types of direct children. - did_change |= sorted_vec_insert(&mut variable_info.child_types, &child_type); + let child_is_hidden = !child_type_is_visible(&child_type) + && !syntax_grammar.supertype_symbols.contains(&child_symbol); - // Record all of the field names that occur. + // Maintain the set of all child types for this variable, and the quantity of + // visible children in this production. + did_change |= sorted_vec_insert(&mut variable_info.children.types, &child_type); + if !child_is_hidden { + production_children_quantity.append(ChildQuantity::one()); + } + + // Maintain the set of child types associated with each field, and the quantity + // of children associated with each field in this production. if let Some(field_name) = &step.field_name { - // Record how many times each field occurs in this production. - field_quantities - .entry(field_name) - .or_insert(ChildQuantity::zero()) - .append(ChildQuantity::one()); - - // Record the types of children for this field. - let field_info = - variable_info.fields.entry(field_name.clone()).or_insert({ - let mut info = FieldInfo { - types: Vec::new(), - quantity: ChildQuantity::one(), - }; - - // If this field did *not* occur in an earlier production, - // then it is not required. 
- if production_index > 0 { - info.quantity.required = false; - } - info - }); + let field_info = variable_info + .fields + .entry(field_name.clone()) + .or_insert(FieldInfo::default()); did_change |= sorted_vec_insert(&mut field_info.types, &child_type); - } - // Record named children without fields. - else if child_type_is_named(&child_type) { - // Record how many named children without fields occur in this production. - children_without_fields_quantity.append(ChildQuantity::one()); - // Record the types of all of the named children without fields. - let children_info = &mut variable_info.children_without_fields; - if children_info.types.is_empty() { - children_info.quantity = ChildQuantity::one(); + let production_field_quantity = production_field_quantities + .entry(field_name) + .or_insert(ChildQuantity::zero()); + + // Inherit the types and quantities of hidden children associated with fields. + if child_is_hidden { + let child_variable_info = &result[child_symbol.index]; + for child_type in &child_variable_info.children.types { + did_change |= sorted_vec_insert(&mut field_info.types, &child_type); + } + production_field_quantity.append(child_variable_info.children.quantity); + } else { + production_field_quantity.append(ChildQuantity::one()); } - did_change |= sorted_vec_insert(&mut children_info.types, &child_type); + } + // Maintain the set of named children without fields within this variable. + else if child_type_is_named(&child_type) { + production_children_without_fields_quantity.append(ChildQuantity::one()); + did_change |= sorted_vec_insert( + &mut variable_info.children_without_fields.types, + &child_type, + ); } - // Inherit information from any hidden children. - if child_symbol.is_non_terminal() - && !syntax_grammar.supertype_symbols.contains(&child_symbol) - && step.alias.is_none() - && !child_type_is_visible(&child_type) - { + // Inherit all child information from hidden children. 
+ if child_is_hidden && child_symbol.is_non_terminal() { let child_variable_info = &result[child_symbol.index]; - // If a hidden child can have multiple children, then this - // node can appear to have multiple children. + // If a hidden child can have multiple children, then its parent node can + // appear to have multiple children. if child_variable_info.has_multi_step_production { variable_info.has_multi_step_production = true; } - // Inherit fields from this hidden child + // If a hidden child has fields, then the parent node can appear to have + // those same fields. for (field_name, child_field_info) in &child_variable_info.fields { - field_quantities + production_field_quantities .entry(field_name) .or_insert(ChildQuantity::zero()) .append(child_field_info.quantity); let field_info = variable_info .fields .entry(field_name.clone()) - .or_insert(FieldInfo { - types: Vec::new(), - quantity: ChildQuantity::one(), - }); + .or_insert(FieldInfo::default()); for child_type in &child_field_info.types { - sorted_vec_insert(&mut field_info.types, &child_type); - } - } - - // Inherit child types from this hidden child - for child_type in &child_variable_info.child_types { - did_change |= - sorted_vec_insert(&mut variable_info.child_types, child_type); - } - - // If any field points to this hidden child, inherit child types - // for the field. - if let Some(field_name) = &step.field_name { - let field_info = variable_info.fields.get_mut(field_name).unwrap(); - for child_type in &child_variable_info.child_types { did_change |= sorted_vec_insert(&mut field_info.types, &child_type); } } - // Inherit info about children without fields from this hidden child. - else { + + // If a hidden child has children, then the parent node can appear to have + // those same children. 
+ production_children_quantity.append(child_variable_info.children.quantity); + for child_type in &child_variable_info.children.types { + did_change |= + sorted_vec_insert(&mut variable_info.children.types, child_type); + } + + // If a hidden child can have named children without fields, then the parent + // node can appear to have those same children. + if step.field_name.is_none() { let grandchildren_info = &child_variable_info.children_without_fields; if !grandchildren_info.types.is_empty() { - children_without_fields_quantity - .append(grandchildren_info.quantity); - - if variable_info.children_without_fields.types.is_empty() { - variable_info.children_without_fields.quantity = - ChildQuantity::one(); - } - - for child_type in &grandchildren_info.types { + production_children_without_fields_quantity + .append(child_variable_info.children_without_fields.quantity); + for child_type in &child_variable_info.children_without_fields.types + { did_change |= sorted_vec_insert( &mut variable_info.children_without_fields.types, &child_type, @@ -302,22 +289,27 @@ pub(crate) fn get_variable_info( // Note whether or not this production contains children whose summaries // have not yet been computed. if child_symbol.index >= i && !all_initialized { - has_uninitialized_invisible_children = true; + production_has_uninitialized_invisible_children = true; } } // If this production's children all have had their summaries initialized, // then expand the quantity information with all of the possibilities introduced // by this production. 
- if !has_uninitialized_invisible_children { + if !production_has_uninitialized_invisible_children { + did_change |= variable_info + .children + .quantity + .union(production_children_quantity); + did_change |= variable_info .children_without_fields .quantity - .union(children_without_fields_quantity); + .union(production_children_without_fields_quantity); for (field_name, info) in variable_info.fields.iter_mut() { did_change |= info.quantity.union( - field_quantities + production_field_quantities .get(field_name) .cloned() .unwrap_or(ChildQuantity::zero()), @@ -352,7 +344,8 @@ pub(crate) fn get_variable_info( // Update all of the node type lists to eliminate hidden nodes. for supertype_symbol in &syntax_grammar.supertype_symbols { result[supertype_symbol.index] - .child_types + .children + .types .retain(child_type_is_visible); } for variable_info in result.iter_mut() { @@ -467,7 +460,8 @@ pub(crate) fn generate_node_types_json( subtypes: None, }); let mut subtypes = info - .child_types + .children + .types .iter() .map(child_type_to_node_type) .collect::>(); @@ -1461,6 +1455,71 @@ mod tests { ); } + #[test] + fn test_get_variable_info_with_repetitions_inside_fields() { + let variable_info = get_variable_info( + &build_syntax_grammar( + vec![ + // Field associated with a repetition. 
+ SyntaxVariable { + name: "rule0".to_string(), + kind: VariableType::Named, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::non_terminal(1)) + .with_field_name("field1")], + }, + Production { + dynamic_precedence: 0, + steps: vec![], + }, + ], + }, + // Repetition node + SyntaxVariable { + name: "_rule0_repeat".to_string(), + kind: VariableType::Hidden, + productions: vec![ + Production { + dynamic_precedence: 0, + steps: vec![ProductionStep::new(Symbol::terminal(1))], + }, + Production { + dynamic_precedence: 0, + steps: vec![ + ProductionStep::new(Symbol::non_terminal(1)), + ProductionStep::new(Symbol::non_terminal(1)), + ], + }, + ], + }, + ], + vec![], + ), + &build_lexical_grammar(), + &AliasMap::new(), + ) + .unwrap(); + + assert_eq!( + variable_info[0].fields, + vec![( + "field1".to_string(), + FieldInfo { + quantity: ChildQuantity { + exists: true, + required: false, + multiple: true, + }, + types: vec![ChildType::Normal(Symbol::terminal(1))], + } + )] + .into_iter() + .collect::>() + ); + } + #[test] fn test_get_variable_info_with_inherited_fields() { let variable_info = get_variable_info( From 12341dbbc03075e0b3bdcbf05191efbac78731fe Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Jul 2020 14:23:54 -0700 Subject: [PATCH 44/71] 0.16.9 --- Cargo.lock | 2 +- cli/Cargo.toml | 2 +- cli/npm/package.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cdad3b61..117ac49e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,7 +740,7 @@ dependencies = [ [[package]] name = "tree-sitter-cli" -version = "0.16.8" +version = "0.16.9" dependencies = [ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0d85952f..52a2ed6b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = 
"tree-sitter-cli" description = "CLI tool for developing, testing, and using Tree-sitter parsers" -version = "0.16.8" +version = "0.16.9" authors = ["Max Brunsfeld "] edition = "2018" license = "MIT" diff --git a/cli/npm/package.json b/cli/npm/package.json index 738c5622..01afe107 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -1,6 +1,6 @@ { "name": "tree-sitter-cli", - "version": "0.16.8", + "version": "0.16.9", "author": "Max Brunsfeld", "license": "MIT", "repository": { From 82aa1462fd9f4b0d3a27dc2241318d6dbd0f6830 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 17 Jul 2020 15:12:13 -0700 Subject: [PATCH 45/71] Clean up get_variable_info function --- cli/src/generate/node_types.rs | 67 ++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/cli/src/generate/node_types.rs b/cli/src/generate/node_types.rs index 6df40807..039d7190 100644 --- a/cli/src/generate/node_types.rs +++ b/cli/src/generate/node_types.rs @@ -196,7 +196,8 @@ pub(crate) fn get_variable_info( // Maintain the set of all child types for this variable, and the quantity of // visible children in this production. - did_change |= sorted_vec_insert(&mut variable_info.children.types, &child_type); + did_change |= + extend_sorted(&mut variable_info.children.types, Some(&child_type)); if !child_is_hidden { production_children_quantity.append(ChildQuantity::one()); } @@ -208,7 +209,7 @@ pub(crate) fn get_variable_info( .fields .entry(field_name.clone()) .or_insert(FieldInfo::default()); - did_change |= sorted_vec_insert(&mut field_info.types, &child_type); + did_change |= extend_sorted(&mut field_info.types, Some(&child_type)); let production_field_quantity = production_field_quantities .entry(field_name) @@ -217,9 +218,10 @@ pub(crate) fn get_variable_info( // Inherit the types and quantities of hidden children associated with fields. 
if child_is_hidden { let child_variable_info = &result[child_symbol.index]; - for child_type in &child_variable_info.children.types { - did_change |= sorted_vec_insert(&mut field_info.types, &child_type); - } + did_change |= extend_sorted( + &mut field_info.types, + &child_variable_info.children.types, + ); production_field_quantity.append(child_variable_info.children.quantity); } else { production_field_quantity.append(ChildQuantity::one()); @@ -228,9 +230,9 @@ pub(crate) fn get_variable_info( // Maintain the set of named children without fields within this variable. else if child_type_is_named(&child_type) { production_children_without_fields_quantity.append(ChildQuantity::one()); - did_change |= sorted_vec_insert( + did_change |= extend_sorted( &mut variable_info.children_without_fields.types, - &child_type, + Some(&child_type), ); } @@ -251,22 +253,23 @@ pub(crate) fn get_variable_info( .entry(field_name) .or_insert(ChildQuantity::zero()) .append(child_field_info.quantity); - let field_info = variable_info - .fields - .entry(field_name.clone()) - .or_insert(FieldInfo::default()); - for child_type in &child_field_info.types { - did_change |= sorted_vec_insert(&mut field_info.types, &child_type); - } + did_change |= extend_sorted( + &mut variable_info + .fields + .entry(field_name.clone()) + .or_insert(FieldInfo::default()) + .types, + &child_field_info.types, + ); } // If a hidden child has children, then the parent node can appear to have // those same children. production_children_quantity.append(child_variable_info.children.quantity); - for child_type in &child_variable_info.children.types { - did_change |= - sorted_vec_insert(&mut variable_info.children.types, child_type); - } + did_change |= extend_sorted( + &mut variable_info.children.types, + &child_variable_info.children.types, + ); // If a hidden child can have named children without fields, then the parent // node can appear to have those same children. 
@@ -275,13 +278,10 @@ pub(crate) fn get_variable_info( if !grandchildren_info.types.is_empty() { production_children_without_fields_quantity .append(child_variable_info.children_without_fields.quantity); - for child_type in &child_variable_info.children_without_fields.types - { - did_change |= sorted_vec_insert( - &mut variable_info.children_without_fields.types, - &child_type, - ); - } + did_change |= extend_sorted( + &mut variable_info.children_without_fields.types, + &child_variable_info.children_without_fields.types, + ); } } } @@ -680,16 +680,19 @@ fn variable_type_for_child_type( } } -fn sorted_vec_insert(vec: &mut Vec, value: &T) -> bool +fn extend_sorted<'a, T>(vec: &mut Vec, values: impl IntoIterator) -> bool where T: Clone + Eq + Ord, + T: 'a, { - if let Err(i) = vec.binary_search(&value) { - vec.insert(i, value.clone()); - true - } else { - false - } + values.into_iter().any(|value| { + if let Err(i) = vec.binary_search(&value) { + vec.insert(i, value.clone()); + true + } else { + false + } + }) } #[cfg(test)] From 740d864e678ab0c5518780afd906e2123d8a9d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tu=E1=BA=A5n-Anh=20Nguy=E1=BB=85n?= Date: Sun, 19 Jul 2020 12:40:17 +0700 Subject: [PATCH 46/71] Add '.' as a valid start of a predicate, in addition to '#' See https://github.com/ubolonton/emacs-tree-sitter/issues/38 --- cli/src/tests/query_test.rs | 29 +++++++++++++++++++++++++++++ lib/src/query.c | 4 ++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 06ecc42e..493bea8a 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -2087,6 +2087,35 @@ fn test_query_disable_pattern() { }); } +#[test] +fn test_query_alternative_predicate_prefix() { + allocations::record(|| { + let language = get_language("c"); + let query = Query::new(language, r#" + ((call_expression + function: (identifier) @keyword + arguments: (argument_list + (string_literal) @function)) + (.eq? 
@keyword "DEFUN")) + "#).unwrap(); + let source = r#" + DEFUN ("identity", Fidentity, Sidentity, 1, 1, 0, + doc: /* Return the argument unchanged. */ + attributes: const) + (Lisp_Object arg) + { + return arg; + } + "#; + assert_query_matches( + language, + &query, + source, + &[(0, vec![("keyword", "DEFUN"), ("function", "\"identity\"")])], + ); + }); +} + fn assert_query_matches( language: Language, query: &Query, diff --git a/lib/src/query.c b/lib/src/query.c index b95ba057..acce2c72 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -805,8 +805,8 @@ static TSQueryError ts_query__parse_pattern( } } - // A pound character indicates the start of a predicate. - else if (stream->next == '#') { + // A dot/pound character indicates the start of a predicate. + else if (stream->next == '.' || stream->next == '#') { stream_advance(stream); return ts_query__parse_predicate(self, stream); } From ba70927f573b0d098046da77888d3219ee31cc9d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 20 Jul 2020 16:46:45 -0700 Subject: [PATCH 47/71] tags: Skip tags with a parse error inside the name --- tags/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index dcbb9984..7733f3e3 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -372,6 +372,10 @@ where } if let (Some(tag_node), Some(name_node)) = (tag_node, name_node) { + if name_node.has_error() { + continue; + } + let name_range = name_node.byte_range(); if pattern_info.name_must_be_non_local { From a3b440b0c89763bb0b2e49f2a94144accc13462b Mon Sep 17 00:00:00 2001 From: Riccardo Schirone Date: Thu, 23 Jul 2020 09:48:18 +0200 Subject: [PATCH 48/71] size_t variables need %zu, not %lu --- lib/src/alloc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/src/alloc.h b/lib/src/alloc.h index 9bbf7513..0e0927a9 100644 --- a/lib/src/alloc.h +++ b/lib/src/alloc.h @@ -45,7 +45,7 @@ static inline bool ts_toggle_allocation_recording(bool value) { static inline 
void *ts_malloc(size_t size) { void *result = malloc(size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); exit(1); } return result; @@ -54,7 +54,7 @@ static inline void *ts_malloc(size_t size) { static inline void *ts_calloc(size_t count, size_t size) { void *result = calloc(count, size); if (count > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %lu bytes", count * size); + fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); exit(1); } return result; @@ -63,7 +63,7 @@ static inline void *ts_calloc(size_t count, size_t size) { static inline void *ts_realloc(void *buffer, size_t size) { void *result = realloc(buffer, size); if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to reallocate %lu bytes", size); + fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); exit(1); } return result; From de2b71d465919cc361d45a4abecb867b12fdd6d4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 23 Jul 2020 16:05:50 -0700 Subject: [PATCH 49/71] Fix query bug when max permutations are exceeded --- cli/src/loader.rs | 4 +- cli/src/tests/query_test.rs | 45 +++++++++++- lib/src/query.c | 138 +++++++++++++++++++++--------------- 3 files changed, 125 insertions(+), 62 deletions(-) diff --git a/cli/src/loader.rs b/cli/src/loader.rs index cf2eb143..62cc9b62 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -160,7 +160,9 @@ impl Loader { // If multiple language configurations match, then determine which // one to use by applying the configurations' content regexes. 
else { - let file_contents = fs::read_to_string(path)?; + let file_contents = fs::read(path) + .map_err(Error::wrap(|| format!("Failed to read path {:?}", path)))?; + let file_contents = String::from_utf8_lossy(&file_contents); let mut best_score = -2isize; let mut best_configuration_id = None; for configuration_id in configuration_ids { diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 493bea8a..c304f3b4 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1135,6 +1135,43 @@ fn test_query_matches_with_too_many_permutations_to_track() { }); } +#[test] +fn test_query_matches_with_alternatives_and_too_many_permutations_to_track() { + allocations::record(|| { + let language = get_language("javascript"); + let query = Query::new( + language, + " + ( + (comment) @doc + ; not immediate + (class_declaration) @class + ) + + (call_expression + function: [ + (identifier) @function + (member_expression property: (property_identifier) @method) + ]) + ", + ) + .unwrap(); + + let source = "/* hi */ a.b(); ".repeat(50); + + let mut parser = Parser::new(); + parser.set_language(language).unwrap(); + let tree = parser.parse(&source, None).unwrap(); + let mut cursor = QueryCursor::new(); + let matches = cursor.matches(&query, tree.root_node(), to_callback(&source)); + + assert_eq!( + collect_matches(matches, &query, source.as_str()), + vec![(1, vec![("method", "b")]); 50], + ); + }); +} + #[test] fn test_query_matches_with_anonymous_tokens() { allocations::record(|| { @@ -2091,13 +2128,17 @@ fn test_query_disable_pattern() { fn test_query_alternative_predicate_prefix() { allocations::record(|| { let language = get_language("c"); - let query = Query::new(language, r#" + let query = Query::new( + language, + r#" ((call_expression function: (identifier) @keyword arguments: (argument_list (string_literal) @function)) (.eq? 
@keyword "DEFUN")) - "#).unwrap(); + "#, + ) + .unwrap(); let source = r#" DEFUN ("identity", Fidentity, Sidentity, 1, 1, 0, doc: /* Return the argument unchanged. */ diff --git a/lib/src/query.c b/lib/src/query.c index acce2c72..05c767e1 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -122,6 +122,7 @@ typedef struct { uint16_t consumed_capture_count: 14; bool seeking_immediate_match: 1; bool has_in_progress_alternatives: 1; + bool dead: 1; } QueryState; typedef Array(TSQueryCapture) CaptureList; @@ -1365,6 +1366,7 @@ static bool ts_query_cursor__first_in_progress_capture( *pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; + if (state->dead) continue; const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id @@ -1480,44 +1482,88 @@ static bool ts_query_cursor__add_state( .start_depth = self->depth - step->depth, .consumed_capture_count = 0, .seeking_immediate_match = false, + .has_in_progress_alternatives = false, + .dead = false, })); return true; } +// Acquire a capture list for this state. If there are no capture lists left in the +// pool, this will steal the capture list from another existing state, and mark that +// other state as 'dead'. +static CaptureList *ts_query_cursor__prepare_to_capture( + TSQueryCursor *self, + QueryState *state, + unsigned state_index_to_preserve +) { + if (state->capture_list_id == NONE) { + state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. 
+ if (state->capture_list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if ( + ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index + ) && + state_index != state_index_to_preserve + ) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + QueryState *other_state = &self->states.contents[state_index]; + state->capture_list_id = other_state->capture_list_id; + other_state->capture_list_id = NONE; + other_state->dead = true; + CaptureList *list = capture_list_pool_get_mut( + &self->capture_list_pool, + state->capture_list_id + ); + array_clear(list); + return list; + } else { + LOG(" ran out of capture lists"); + return NULL; + } + } + } + return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); +} + // Duplicate the given state and insert the newly-created state immediately after // the given state in the `states` array. -static QueryState *ts_query__cursor_copy_state( +static QueryState *ts_query_cursor__copy_state( TSQueryCursor *self, - const QueryState *state + unsigned state_index ) { if (self->states.size >= MAX_STATE_COUNT) { LOG(" too many states"); return NULL; } - // If the state has captures, copy its capture list. + const QueryState *state = &self->states.contents[state_index]; QueryState copy = *state; - copy.capture_list_id = state->capture_list_id; + copy.capture_list_id = NONE; + + // If the state has captures, copy its capture list. 
if (state->capture_list_id != NONE) { - copy.capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - if (copy.capture_list_id == NONE) { - LOG(" too many capture lists"); - return NULL; - } + CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index); + if (!new_captures) return NULL; const CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); - CaptureList *new_captures = capture_list_pool_get_mut( - &self->capture_list_pool, - copy.capture_list_id - ); array_push_all(new_captures, old_captures); } - uint32_t index = (state - self->states.contents) + 1; - array_insert(&self->states, index, copy); - return &self->states.contents[index]; + array_insert(&self->states, state_index + 1, copy); + return &self->states.contents[state_index + 1]; } // Walk the tree, processing patterns until at least one pattern finishes, @@ -1728,7 +1774,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { !step->is_pattern_start && step->contains_captures ) { - if (ts_query__cursor_copy_state(self, state)) { + if (ts_query_cursor__copy_state(self, i)) { LOG( " split state for capture. pattern:%u, step:%u\n", state->pattern_index, @@ -1739,45 +1785,14 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // If the current node is captured in this pattern, add it to the capture list. - // For the first capture in a pattern, lazily acquire a capture list. if (step->capture_ids[0] != NONE) { - if (state->capture_list_id == NONE) { - state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. 
- if (state->capture_list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if (ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - )) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - state->capture_list_id = self->states.contents[state_index].capture_list_id; - array_erase(&self->states, state_index); - if (state_index < i) { - i--; - state--; - } - } else { - LOG(" too many finished states.\n"); - array_erase(&self->states, i); - i--; - continue; - } - } + CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); + if (!capture_list) { + array_erase(&self->states, i); + i--; + continue; } - CaptureList *capture_list = capture_list_pool_get_mut( - &self->capture_list_pool, - state->capture_list_id - ); for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; @@ -1800,10 +1815,9 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { state->step_index ); - // If this state's next step has an 'alternative' step (the step is either optional, - // or is the end of a repetition), then copy the state in order to pursue both - // alternatives. The alternative step itself may have an alternative, so this is - // an interative process. + // If this state's next step has an alternative step, then copy the state in order + // to pursue both alternatives. The alternative step itself may have an alternative, + // so this is an interative process. 
unsigned end_index = i + 1; for (unsigned j = i; j < end_index; j++) { QueryState *state = &self->states.contents[j]; @@ -1815,7 +1829,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { continue; } - QueryState *copy = ts_query__cursor_copy_state(self, state); + QueryState *copy = ts_query_cursor__copy_state(self, j); if (next_step->is_pass_through) { state->step_index++; j--; @@ -1841,14 +1855,20 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; - bool did_remove = false; + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } // Enfore the longest-match criteria. When a query pattern contains optional or // repeated nodes, this is necesssary to avoid multiple redundant states, where // one state has a strict subset of another state's captures. + bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { QueryState *other_state = &self->states.contents[j]; if ( + !other_state->dead && state->pattern_index == other_state->pattern_index && state->start_depth == other_state->start_depth ) { From 32099050d6d41ff9538c4f7c4991b66254cad024 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 24 Jul 2020 09:26:54 -0700 Subject: [PATCH 50/71] node_types: Fix panic when field is associated with a hidden token Fixes #695 --- cli/src/generate/node_types.rs | 35 +++++++++++++++++++++++++++++++++- cli/src/main.rs | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/cli/src/generate/node_types.rs b/cli/src/generate/node_types.rs index 039d7190..7a5768a5 100644 --- a/cli/src/generate/node_types.rs +++ b/cli/src/generate/node_types.rs @@ -216,7 +216,7 @@ pub(crate) fn get_variable_info( .or_insert(ChildQuantity::zero()); // Inherit the types and quantities of hidden children associated with fields. 
- if child_is_hidden { + if child_is_hidden && child_symbol.is_non_terminal() { let child_variable_info = &result[child_symbol.index]; did_change |= extend_sorted( &mut field_info.types, @@ -352,6 +352,7 @@ pub(crate) fn get_variable_info( for (_, field_info) in variable_info.fields.iter_mut() { field_info.types.retain(child_type_is_visible); } + variable_info.fields.retain(|_, v| !v.types.is_empty()); variable_info .children_without_fields .types @@ -1174,6 +1175,38 @@ mod tests { ); } + #[test] + fn test_node_types_with_fields_on_hidden_tokens() { + let node_types = get_node_types(InputGrammar { + name: String::new(), + extra_symbols: Vec::new(), + external_tokens: Vec::new(), + expected_conflicts: Vec::new(), + variables_to_inline: Vec::new(), + word_token: None, + supertype_symbols: vec![], + variables: vec![Variable { + name: "script".to_string(), + kind: VariableType::Named, + rule: Rule::seq(vec![ + Rule::field("a".to_string(), Rule::pattern("hi")), + Rule::field("b".to_string(), Rule::pattern("bye")), + ]), + }], + }); + + assert_eq!( + node_types, + [NodeInfoJSON { + kind: "script".to_string(), + named: true, + fields: Some(BTreeMap::new()), + children: None, + subtypes: None + }] + ); + } + #[test] fn test_node_types_with_multiple_rules_same_alias_name() { let node_types = get_node_types(InputGrammar { diff --git a/cli/src/main.rs b/cli/src/main.rs index 0668d08d..2f8c6dd5 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -99,7 +99,7 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("tags") .arg(Arg::with_name("quiet").long("quiet").short("q")) - .arg(Arg::with_name("time").long("quiet").short("t")) + .arg(Arg::with_name("time").long("time").short("t")) .arg(Arg::with_name("scope").long("scope").takes_value(true)) .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( From 1ae5cbc851ca55214a59e675240cd2dfd1efb276 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 24 Jul 2020 10:49:20 -0700 Subject: [PATCH 
51/71] query: Handle #not-match? in rust, wasm bindings --- cli/src/tests/query_test.rs | 9 ++++++++- lib/binding_rust/lib.rs | 10 ++++++---- lib/binding_web/binding.js | 4 +++- lib/binding_web/test/query-test.js | 9 ++++++++- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index c304f3b4..914d41cd 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -1470,12 +1470,17 @@ fn test_query_captures_with_text_conditions() { ((identifier) @function.builtin (#eq? @function.builtin "require")) - (identifier) @variable + ((identifier) @variable + (#not-match? @variable "^(lambda|load)$")) "#, ) .unwrap(); let source = " + toad + load + panda + lambda const ab = require('./ab'); new Cd(EF); "; @@ -1489,6 +1494,8 @@ fn test_query_captures_with_text_conditions() { assert_eq!( collect_captures(captures, &query, source), &[ + ("variable", "toad"), + ("variable", "panda"), ("variable", "ab"), ("function.builtin", "require"), ("variable", "require"), diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index c0aba32f..ec7cd791 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -169,7 +169,7 @@ pub enum QueryError { enum TextPredicate { CaptureEqString(u32, String, bool), CaptureEqCapture(u32, u32, bool), - CaptureMatchString(u32, regex::bytes::Regex), + CaptureMatchString(u32, regex::bytes::Regex, bool), } impl Language { @@ -1298,7 +1298,7 @@ impl Query { }); } - "match?" => { + "match?" | "not-match?" => { if p.len() != 3 { return Err(QueryError::Predicate(format!( "Wrong number of arguments to #match? predicate. 
Expected 2, got {}.", @@ -1318,12 +1318,14 @@ impl Query { ))); } + let is_positive = operator_name == "match?"; let regex = &string_values[p[2].value_id as usize]; text_predicates.push(TextPredicate::CaptureMatchString( p[1].value_id, regex::bytes::Regex::new(regex).map_err(|_| { QueryError::Predicate(format!("Invalid regex '{}'", regex)) })?, + is_positive, )); } @@ -1607,9 +1609,9 @@ impl<'a> QueryMatch<'a> { let node = self.capture_for_index(*i).unwrap(); (text_callback(node).as_ref() == s.as_bytes()) == *is_positive } - TextPredicate::CaptureMatchString(i, r) => { + TextPredicate::CaptureMatchString(i, r, is_positive) => { let node = self.capture_for_index(*i).unwrap(); - r.is_match(text_callback(node).as_ref()) + r.is_match(text_callback(node).as_ref()) == *is_positive } }) } diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index 567b7eb3..3a193ef9 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -784,6 +784,8 @@ class Language { } break; + case 'not-match?': + isPositive = false; case 'match?': if (steps.length !== 3) throw new Error( `Wrong number of arguments to \`#match?\` predicate. 
Expected 2, got ${steps.length - 1}.` @@ -798,7 +800,7 @@ class Language { const regex = new RegExp(steps[2].value); textPredicates[i].push(function(captures) { for (const c of captures) { - if (c.name === captureName) return regex.test(c.node.text); + if (c.name === captureName) return regex.test(c.node.text) === isPositive; } return false; }); diff --git a/lib/binding_web/test/query-test.js b/lib/binding_web/test/query-test.js index 9dda9834..9d1e24e1 100644 --- a/lib/binding_web/test/query-test.js +++ b/lib/binding_web/test/query-test.js @@ -126,12 +126,17 @@ describe("Query", () => { it("handles conditions that compare the text of capture to literal strings", () => { tree = parser.parse(` + lambda + panda + load + toad const ab = require('./ab'); new Cd(EF); `); query = JavaScript.query(` - (identifier) @variable + ((identifier) @variable + (#not-match? @variable "^(lambda|load)$")) ((identifier) @function.builtin (#eq? @function.builtin "require")) @@ -145,6 +150,8 @@ describe("Query", () => { const captures = query.captures(tree.rootNode); assert.deepEqual(formatCaptures(captures), [ + { name: "variable", text: "panda" }, + { name: "variable", text: "toad" }, { name: "variable", text: "ab" }, { name: "variable", text: "require" }, { name: "function.builtin", text: "require" }, From d22240591c2accdc94de466f7352ee56c399a796 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 27 Jul 2020 17:38:32 -0500 Subject: [PATCH 52/71] Docs: document the `set!` predicate I was looking for something like this, I searched the documentation, but I found it in https://github.com/tree-sitter/tree-sitter-javascript/blob/master/queries/injections.scm#L15 --- docs/section-4-syntax-highlighting.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/section-4-syntax-highlighting.md b/docs/section-4-syntax-highlighting.md index 0182704b..cbf97b66 100644 --- a/docs/section-4-syntax-highlighting.md +++ b/docs/section-4-syntax-highlighting.md @@ -385,6 +385,14 @@ The 
following query would specify that the contents of the heredoc should be par (heredoc_end) @injection.language) @injection.content ``` +You can also force the language using the `#set!` predicate. +For example, this will force the language to be always `ruby`. + +``` +((heredoc_body) @injection.content + (#set! injection.language "ruby")) +``` + ## Unit Testing Tree-sitter has a built-in way to verify the results of syntax highlighting. The interface is based on [Sublime Text's system](https://www.sublimetext.com/docs/3/syntax.html#testing) for testing highlighting. From 253f23c3d432d75cbb2b4c53f5ca090c1e46ae72 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 28 Jul 2020 13:30:34 -0700 Subject: [PATCH 53/71] Fix error when parse error occurs after non-terminal extra --- lib/src/parser.c | 53 +++++++++++++--------- test/fixtures/error_corpus/ruby_errors.txt | 19 ++++++++ 2 files changed, 50 insertions(+), 22 deletions(-) create mode 100644 test/fixtures/error_corpus/ruby_errors.txt diff --git a/lib/src/parser.c b/lib/src/parser.c index 4d7dc1e5..035672b8 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -355,10 +355,11 @@ static Subtree ts_parser__lex( StackVersion version, TSStateId parse_state ) { - Length start_position = ts_stack_position(self->stack, version); - Subtree external_token = ts_stack_last_external_token(self->stack, version); TSLexMode lex_mode = self->language->lex_modes[parse_state]; if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE; + + Length start_position = ts_stack_position(self->stack, version); + Subtree external_token = ts_stack_last_external_token(self->stack, version); const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state @@ -1345,24 +1346,26 @@ static bool ts_parser__advance( ); } -lex: - // Otherwise, re-run the lexer. 
- if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); - if (lookahead.ptr) { - ts_parser__set_cached_token(self, position, last_external_token, lookahead); - ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); - } - - // When parsing a non-terminal extra, a null lookahead indicates the - // end of the rule. The reduction is stored in the EOF table entry. - // After the reduction, the lexer needs to be run again. - else { - ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); - } - } - + bool needs_lex = !lookahead.ptr; for (;;) { + // Otherwise, re-run the lexer. + if (needs_lex) { + needs_lex = false; + lookahead = ts_parser__lex(self, version, state); + + if (lookahead.ptr) { + ts_parser__set_cached_token(self, position, last_external_token, lookahead); + ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); + } + + // When parsing a non-terminal extra, a null lookahead indicates the + // end of the rule. The reduction is stored in the EOF table entry. + // After the reduction, the lexer needs to be run again. + else { + ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); + } + } + // If a cancellation flag or a timeout was provided, then check every // time a fixed number of parse actions has been processed. if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { @@ -1459,8 +1462,10 @@ lex: // (and completing the non-terminal extra rule) run the lexer again based // on the current parse state. if (!lookahead.ptr) { - lookahead = ts_parser__lex(self, version, state); + needs_lex = true; + continue; } + ts_language_table_entry( self->language, state, @@ -1470,6 +1475,11 @@ lex: continue; } + if (!lookahead.ptr) { + ts_stack_pause(self->stack, version, ts_builtin_sym_end); + return true; + } + // If there were no parse actions for the current lookahead token, then // it is not valid in this state. 
If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that @@ -1509,8 +1519,7 @@ lex: if (ts_parser__breakdown_top_of_stack(self, version)) { state = ts_stack_state(self->stack, version); ts_subtree_release(&self->tree_pool, lookahead); - lookahead = NULL_SUBTREE; - goto lex; + needs_lex = true; continue; } diff --git a/test/fixtures/error_corpus/ruby_errors.txt b/test/fixtures/error_corpus/ruby_errors.txt new file mode 100644 index 00000000..9c35781c --- /dev/null +++ b/test/fixtures/error_corpus/ruby_errors.txt @@ -0,0 +1,19 @@ +========================== +Heredocs with errors +========================== + +joins(<<~SQL( + b +SQL +c + +--- + +(program + (method_call + method: (identifier) + (ERROR (heredoc_beginning)) + arguments: (argument_list + (heredoc_body (heredoc_end)) + (identifier) + (MISSING ")")))) From 81bbdf19f4dc42f5f30c589b3ed449b6150de3de Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 29 Jul 2020 09:50:13 -0700 Subject: [PATCH 54/71] Fix handling of non-terminal extras that share non-extra rules Fixes #701 --- .../generate/build_tables/minimize_parse_table.rs | 3 +++ lib/src/parser.c | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 5d8f7f0f..aa4801c8 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -199,6 +199,9 @@ impl<'a> Minimizer<'a> { right_state: &ParseState, group_ids_by_state_id: &Vec, ) -> bool { + if left_state.is_non_terminal_extra != right_state.is_non_terminal_extra { + return true; + } for (token, left_entry) in &left_state.terminal_entries { if let Some(right_entry) = right_state.terminal_entries.get(token) { if self.entries_conflict( diff --git a/lib/src/parser.c b/lib/src/parser.c index 035672b8..37d1a1c2 100644 --- a/lib/src/parser.c +++ 
b/lib/src/parser.c @@ -356,7 +356,10 @@ static Subtree ts_parser__lex( TSStateId parse_state ) { TSLexMode lex_mode = self->language->lex_modes[parse_state]; - if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE; + if (lex_mode.lex_state == (uint16_t)-1) { + LOG("no_lookahead_after_non_terminal_extra"); + return NULL_SUBTREE; + } Length start_position = ts_stack_position(self->stack, version); Subtree external_token = ts_stack_last_external_token(self->stack, version); @@ -762,7 +765,7 @@ static StackVersion ts_parser__reduce( int dynamic_precedence, uint16_t production_id, bool is_fragile, - bool is_extra + bool end_of_non_terminal_extra ) { uint32_t initial_version_count = ts_stack_version_count(self->stack); @@ -833,7 +836,9 @@ static StackVersion ts_parser__reduce( TSStateId state = ts_stack_state(self->stack, slice_version); TSStateId next_state = ts_language_next_state(self->language, state, symbol); - if (is_extra) parent.ptr->extra = true; + if (end_of_non_terminal_extra && next_state == state) { + parent.ptr->extra = true; + } if (is_fragile || pop.size > 1 || initial_version_count > 1) { parent.ptr->fragile_left = true; parent.ptr->fragile_right = true; @@ -1417,12 +1422,12 @@ static bool ts_parser__advance( case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; - bool is_extra = lookahead.ptr == NULL; + bool end_of_non_terminal_extra = lookahead.ptr == NULL; LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count); StackVersion reduction_version = ts_parser__reduce( self, version, action.params.reduce.symbol, action.params.reduce.child_count, action.params.reduce.dynamic_precedence, action.params.reduce.production_id, - is_fragile, is_extra + is_fragile, end_of_non_terminal_extra ); if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; From 9a7fdd29c263a1fa7778c7ec1cbc812397d88571 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: 
Wed, 29 Jul 2020 09:53:07 -0700 Subject: [PATCH 55/71] Add test for non-terminal extras that share non-extra rules --- .../corpus.txt | 23 +++++++ .../grammar.json | 68 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt create mode 100644 test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json diff --git a/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt new file mode 100644 index 00000000..a22d8b8d --- /dev/null +++ b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/corpus.txt @@ -0,0 +1,23 @@ +===== +Extras +===== + +; +%; +%foo:; +; +bar: baz:; +; + +--- + +(program + (statement) + (macro_statement (statement)) + (macro_statement (statement + (label_declaration (identifier)))) + (statement) + (statement + (label_declaration (identifier)) + (label_declaration (identifier))) + (statement)) diff --git a/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json new file mode 100644 index 00000000..a7f51b8e --- /dev/null +++ b/test/fixtures/test_grammars/extra_non_terminals_with_shared_rules/grammar.json @@ -0,0 +1,68 @@ +{ + "name": "extra_non_terminals_with_shared_rules", + + "extras": [ + { "type": "PATTERN", "value": "\\s+" }, + { "type": "SYMBOL", "name": "macro_statement" } + ], + + "rules": { + "program": { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "statement" + } + }, + "statement": { + "type": "SEQ", + "members": [ + { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "label_declaration" + } + }, + { + "type": "STRING", + "value": ";" + } + ] + }, + "macro_statement": { + "type": "SEQ", + "members": [ + { + "type": "STRING", + "value": "%" + }, + { + "type": "SYMBOL", + "name": 
"statement" + } + ] + }, + "label_declaration": { + "type": "SEQ", + "members": [ + { + "type": "SYMBOL", + "name": "identifier" + }, + { + "type": "STRING", + "value": ":" + } + ] + }, + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + }, + "conflicts": [], + "externals": [], + "inline": [], + "supertypes": [] +} From 4ec7d8096853b1b478da3588206eb2a29559efa9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 29 Jul 2020 10:04:05 -0700 Subject: [PATCH 56/71] Mention rule order as the fallback criteria in docs Fixes #702 --- docs/section-3-creating-parsers.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index b075e488..694f8dae 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -505,6 +505,8 @@ Grammars often contain multiple tokens that can match the same characters. For e 4. **Match Specificity** - If there are two valid tokens with the same precedence and which both match the same number of characters, Tree-sitter will prefer a token that is specified in the grammar as a `String` over a token specified as a `RegExp`. +5. **Rule Order** - If none of the above criteria can be used to select one token over another, Tree-sitter will prefer the token that appears earlier in the grammar. + ### Keywords Many languages have a set of *keyword* tokens (e.g. `if`, `for`, `return`), as well as a more general token (e.g. `identifier`) that matches any word, including many of the keyword strings. 
For example, JavaScript has a keyword `instanceof`, which is used as a binary operator, like this: From e89a19a1588382c24ca807c7e43520efe60e311a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 29 Jul 2020 15:30:13 -0700 Subject: [PATCH 57/71] tags: Add @ignore capture --- cli/src/tests/tags_test.rs | 8 +- tags/src/lib.rs | 254 +++++++++++++++++++++---------------- 2 files changed, 153 insertions(+), 109 deletions(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index f00e83ac..3ff1c92b 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -68,11 +68,13 @@ const JS_TAG_QUERY: &'static str = r#" const RUBY_TAG_QUERY: &'static str = r#" (method - name: (identifier) @name) @definition.method + name: (_) @name) @definition.method (method_call method: (identifier) @name) @reference.call +(setter (identifier) @ignore) + ((identifier) @name @reference.call (#is-not? local)) "#; @@ -207,7 +209,7 @@ fn test_tags_ruby() { " b = 1 - def foo() + def foo=() c = 1 # a is a method because it is not in scope @@ -239,7 +241,7 @@ fn test_tags_ruby() { )) .collect::>(), &[ - ("foo", "method", (2, 4)), + ("foo=", "method", (2, 4)), ("bar", "call", (7, 4)), ("a", "call", (7, 8)), ("b", "call", (7, 11)), diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 7733f3e3..07209e4d 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -25,6 +25,7 @@ pub struct TagsConfiguration { capture_map: HashMap, doc_capture_index: Option, name_capture_index: Option, + ignore_capture_index: Option, local_scope_capture_index: Option, local_definition_capture_index: Option, tags_pattern_index: usize, @@ -128,12 +129,14 @@ impl TagsConfiguration { let mut syntax_type_names = Vec::new(); let mut doc_capture_index = None; let mut name_capture_index = None; + let mut ignore_capture_index = None; let mut local_scope_capture_index = None; let mut local_definition_capture_index = None; for (i, name) in query.capture_names().iter().enumerate() { match 
name.as_str() { "" => continue, "name" => name_capture_index = Some(i as u32), + "ignore" => ignore_capture_index = Some(i as u32), "doc" => doc_capture_index = Some(i as u32), "local.scope" => local_scope_capture_index = Some(i as u32), "local.definition" => local_definition_capture_index = Some(i as u32), @@ -222,6 +225,7 @@ impl TagsConfiguration { capture_map, doc_capture_index, name_capture_index, + ignore_capture_index, tags_pattern_index, local_scope_capture_index, local_definition_capture_index, @@ -311,7 +315,12 @@ where if self.tag_queue.len() > 1 && self.tag_queue[0].0.name_range.end < last_entry.0.name_range.start { - return Some(Ok(self.tag_queue.remove(0).0)); + let tag = self.tag_queue.remove(0).0; + if tag.is_ignored() { + continue; + } else { + return Some(Ok(tag)); + } } } @@ -350,10 +359,16 @@ where let mut syntax_type_id = 0; let mut is_definition = false; let mut docs_adjacent_node = None; + let mut is_ignored = false; for capture in mat.captures { let index = Some(capture.index); + if index == self.config.ignore_capture_index { + is_ignored = true; + name_node = Some(capture.node); + } + if index == self.config.pattern_info[mat.pattern_index].docs_adjacent_capture { docs_adjacent_node = Some(capture.node); } @@ -371,129 +386,137 @@ where } } - if let (Some(tag_node), Some(name_node)) = (tag_node, name_node) { - if name_node.has_error() { - continue; - } - + if let Some(name_node) = name_node { let name_range = name_node.byte_range(); - if pattern_info.name_must_be_non_local { - let mut is_local = false; - for scope in self.scopes.iter().rev() { - if scope.range.start <= name_range.start - && scope.range.end >= name_range.end - { - if scope - .local_defs - .iter() - .any(|d| d.name == &self.source[name_range.clone()]) - { - is_local = true; - break; - } - if !scope.inherits { - break; - } - } - } - if is_local { + let tag; + if let Some(tag_node) = tag_node { + if name_node.has_error() { continue; } - } - // If needed, filter the doc nodes 
based on their ranges, selecting - // only the slice that are adjacent to some specified node. - let mut docs_start_index = 0; - if let (Some(docs_adjacent_node), false) = - (docs_adjacent_node, doc_nodes.is_empty()) - { - docs_start_index = doc_nodes.len(); - let mut start_row = docs_adjacent_node.start_position().row; - while docs_start_index > 0 { - let doc_node = &doc_nodes[docs_start_index - 1]; - let prev_doc_end_row = doc_node.end_position().row; - if prev_doc_end_row + 1 >= start_row { - docs_start_index -= 1; - start_row = doc_node.start_position().row; - } else { - break; + if pattern_info.name_must_be_non_local { + let mut is_local = false; + for scope in self.scopes.iter().rev() { + if scope.range.start <= name_range.start + && scope.range.end >= name_range.end + { + if scope + .local_defs + .iter() + .any(|d| d.name == &self.source[name_range.clone()]) + { + is_local = true; + break; + } + if !scope.inherits { + break; + } + } + } + if is_local { + continue; } } - } - // Generate a doc string from all of the doc nodes, applying any strip regexes. - let mut docs = None; - for doc_node in &doc_nodes[docs_start_index..] { - if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) { - let content = if let Some(regex) = &pattern_info.doc_strip_regex { - regex.replace_all(content, "").to_string() - } else { - content.to_string() - }; - match &mut docs { - None => docs = Some(content), - Some(d) => { - d.push('\n'); - d.push_str(&content); + // If needed, filter the doc nodes based on their ranges, selecting + // only the slice that are adjacent to some specified node. 
+ let mut docs_start_index = 0; + if let (Some(docs_adjacent_node), false) = + (docs_adjacent_node, doc_nodes.is_empty()) + { + docs_start_index = doc_nodes.len(); + let mut start_row = docs_adjacent_node.start_position().row; + while docs_start_index > 0 { + let doc_node = &doc_nodes[docs_start_index - 1]; + let prev_doc_end_row = doc_node.end_position().row; + if prev_doc_end_row + 1 >= start_row { + docs_start_index -= 1; + start_row = doc_node.start_position().row; + } else { + break; } } } - } - let range = tag_node.byte_range(); - let span = name_node.start_position()..name_node.end_position(); + // Generate a doc string from all of the doc nodes, applying any strip regexes. + let mut docs = None; + for doc_node in &doc_nodes[docs_start_index..] { + if let Ok(content) = str::from_utf8(&self.source[doc_node.byte_range()]) + { + let content = if let Some(regex) = &pattern_info.doc_strip_regex { + regex.replace_all(content, "").to_string() + } else { + content.to_string() + }; + match &mut docs { + None => docs = Some(content), + Some(d) => { + d.push('\n'); + d.push_str(&content); + } + } + } + } - // Compute tag properties that depend on the text of the containing line. If the - // previous tag occurred on the same line, then reuse results from the previous tag. - let line_range; - let mut prev_utf16_column = 0; - let mut prev_utf8_byte = name_range.start - span.start.column; - let line_info = self.prev_line_info.as_ref().and_then(|info| { - if info.utf8_position.row == span.start.row { - Some(info) + let range = tag_node.byte_range(); + let span = name_node.start_position()..name_node.end_position(); + + // Compute tag properties that depend on the text of the containing line. If the + // previous tag occurred on the same line, then reuse results from the previous tag. 
+ let line_range; + let mut prev_utf16_column = 0; + let mut prev_utf8_byte = name_range.start - span.start.column; + let line_info = self.prev_line_info.as_ref().and_then(|info| { + if info.utf8_position.row == span.start.row { + Some(info) + } else { + None + } + }); + if let Some(line_info) = line_info { + line_range = line_info.line_range.clone(); + if line_info.utf8_position.column <= span.start.column { + prev_utf8_byte = line_info.utf8_byte; + prev_utf16_column = line_info.utf16_column; + } } else { - None - } - }); - if let Some(line_info) = line_info { - line_range = line_info.line_range.clone(); - if line_info.utf8_position.column <= span.start.column { - prev_utf8_byte = line_info.utf8_byte; - prev_utf16_column = line_info.utf16_column; + line_range = self::line_range( + self.source, + name_range.start, + span.start, + MAX_LINE_LEN, + ); } + + let utf16_start_column = prev_utf16_column + + utf16_len(&self.source[prev_utf8_byte..name_range.start]); + let utf16_end_column = + utf16_start_column + utf16_len(&self.source[name_range.clone()]); + let utf16_column_range = utf16_start_column..utf16_end_column; + + self.prev_line_info = Some(LineInfo { + utf8_position: span.end, + utf8_byte: name_range.end, + utf16_column: utf16_end_column, + line_range: line_range.clone(), + }); + tag = Tag { + line_range, + span, + utf16_column_range, + range, + name_range, + docs, + is_definition, + syntax_type_id, + }; + } else if is_ignored { + tag = Tag::ignored(name_range); } else { - line_range = self::line_range( - self.source, - name_range.start, - span.start, - MAX_LINE_LEN, - ); + continue; } - let utf16_start_column = prev_utf16_column - + utf16_len(&self.source[prev_utf8_byte..name_range.start]); - let utf16_end_column = - utf16_start_column + utf16_len(&self.source[name_range.clone()]); - let utf16_column_range = utf16_start_column..utf16_end_column; - - self.prev_line_info = Some(LineInfo { - utf8_position: span.end, - utf8_byte: name_range.end, - utf16_column: 
utf16_end_column, - line_range: line_range.clone(), - }); - let tag = Tag { - line_range, - span, - utf16_column_range, - range, - name_range, - docs, - is_definition, - syntax_type_id, - }; - // Only create one tag per node. The tag queue is sorted by node position // to allow for fast lookup. match self.tag_queue.binary_search_by_key( @@ -521,6 +544,25 @@ where } } +impl Tag { + fn ignored(name_range: Range) -> Self { + Tag { + name_range, + line_range: 0..0, + span: Point::new(0, 0)..Point::new(0, 0), + utf16_column_range: 0..0, + range: usize::MAX..usize::MAX, + docs: None, + is_definition: false, + syntax_type_id: 0, + } + } + + fn is_ignored(&self) -> bool { + self.range.start == usize::MAX + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { From df5510acfc0561e64fc2a89fc21ec286eda4feb4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 30 Jul 2020 12:59:34 -0700 Subject: [PATCH 58/71] query: Remove limit on number of in-progress states --- lib/src/query.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index 05c767e1..15827cd7 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -11,7 +11,6 @@ // #define LOG(...) fprintf(stderr, __VA_ARGS__) #define LOG(...) -#define MAX_STATE_COUNT 256 #define MAX_CAPTURE_LIST_COUNT 32 #define MAX_STEP_CAPTURE_COUNT 3 @@ -1297,8 +1296,8 @@ TSQueryCursor *ts_query_cursor_new(void) { .start_point = {0, 0}, .end_point = POINT_MAX, }; - array_reserve(&self->states, MAX_STATE_COUNT); - array_reserve(&self->finished_states, MAX_CAPTURE_LIST_COUNT); + array_reserve(&self->states, 8); + array_reserve(&self->finished_states, 8); return self; } @@ -1465,10 +1464,6 @@ static bool ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { - if (self->states.size >= MAX_STATE_COUNT) { - LOG(" too many states"); - return false; - } LOG( " start state. 
pattern:%u, step:%u\n", pattern->pattern_index, @@ -1537,17 +1532,14 @@ static CaptureList *ts_query_cursor__prepare_to_capture( } // Duplicate the given state and insert the newly-created state immediately after -// the given state in the `states` array. +// the given state in the `states` array. Ensures that the given state reference is +// still valid, even if the states array is reallocated. static QueryState *ts_query_cursor__copy_state( TSQueryCursor *self, - unsigned state_index + QueryState **state_ref ) { - if (self->states.size >= MAX_STATE_COUNT) { - LOG(" too many states"); - return NULL; - } - - const QueryState *state = &self->states.contents[state_index]; + const QueryState *state = *state_ref; + uint32_t state_index = state - self->states.contents; QueryState copy = *state; copy.capture_list_id = NONE; @@ -1563,6 +1555,7 @@ static QueryState *ts_query_cursor__copy_state( } array_insert(&self->states, state_index + 1, copy); + *state_ref = &self->states.contents[state_index]; return &self->states.contents[state_index + 1]; } @@ -1774,7 +1767,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { !step->is_pattern_start && step->contains_captures ) { - if (ts_query_cursor__copy_state(self, i)) { + if (ts_query_cursor__copy_state(self, &state)) { LOG( " split state for capture. pattern:%u, step:%u\n", state->pattern_index, @@ -1829,7 +1822,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { continue; } - QueryState *copy = ts_query_cursor__copy_state(self, j); + QueryState *copy = ts_query_cursor__copy_state(self, &state); if (next_step->is_pass_through) { state->step_index++; j--; @@ -1862,7 +1855,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // Enfore the longest-match criteria. 
When a query pattern contains optional or - // repeated nodes, this is necesssary to avoid multiple redundant states, where + // repeated nodes, this is necessary to avoid multiple redundant states, where // one state has a strict subset of another state's captures. bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { From 411f69d13be8954baff074f4180ae4fdb5537453 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 30 Jul 2020 13:34:34 -0700 Subject: [PATCH 59/71] query: Optimize 'longest-match' filtering --- lib/src/query.c | 85 ++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index 15827cd7..c839c299 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -118,7 +118,7 @@ typedef struct { uint16_t step_index; uint16_t pattern_index; uint16_t capture_list_id; - uint16_t consumed_capture_count: 14; + uint16_t consumed_capture_count: 12; bool seeking_immediate_match: 1; bool has_in_progress_alternatives: 1; bool dead: 1; @@ -1860,47 +1860,54 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { QueryState *other_state = &self->states.contents[j]; + if (other_state->dead) { + array_erase(&self->states, j); + j--; + continue; + } + + // When query states are copied in order if ( - !other_state->dead && - state->pattern_index == other_state->pattern_index && - state->start_depth == other_state->start_depth - ) { - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. 
pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; - } - other_state->has_in_progress_alternatives = true; + other_state->start_depth != state->start_depth || + other_state->pattern_index != state->pattern_index + ) break; + + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - did_remove = true; - break; - } - state->has_in_progress_alternatives = true; + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. 
pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + i--; + did_remove = true; + break; } + state->has_in_progress_alternatives = true; } } From f265e63d488d14e06d905b2ddabe879afdb62945 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 30 Jul 2020 13:35:04 -0700 Subject: [PATCH 60/71] tags: Allow def or ref node to be a sibling of the name node --- tags/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tags/src/lib.rs b/tags/src/lib.rs index 07209e4d..c247c13e 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -459,7 +459,8 @@ where } } - let range = tag_node.byte_range(); + let rng = tag_node.byte_range(); + let range = rng.start.min(name_range.start)..rng.end.max(name_range.end); let span = name_node.start_position()..name_node.end_position(); // Compute tag properties that depend on the text of the containing line. 
If the From af655547e5817efbdf350935555b4aaf2642c618 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 31 Jul 2020 12:47:06 -0700 Subject: [PATCH 61/71] Fix handling of queries with many patterns with leading repetitions --- cli/src/tests/query_test.rs | 53 ++++++++++++++++- lib/src/query.c | 115 ++++++++++++++++++++++++++---------- 2 files changed, 135 insertions(+), 33 deletions(-) diff --git a/cli/src/tests/query_test.rs b/cli/src/tests/query_test.rs index 914d41cd..a377ca51 100644 --- a/cli/src/tests/query_test.rs +++ b/cli/src/tests/query_test.rs @@ -382,7 +382,7 @@ fn test_query_matches_with_many_overlapping_results() { ) .unwrap(); - let count = 80; + let count = 1024; // Deeply nested chained function calls: // a @@ -547,8 +547,8 @@ fn test_query_matches_with_immediate_siblings() { &[ (0, vec![("parent", "a"), ("child", "b")]), (0, vec![("parent", "b"), ("child", "c")]), - (1, vec![("last-child", "d")]), (0, vec![("parent", "c"), ("child", "d")]), + (1, vec![("last-child", "d")]), (2, vec![("first-element", "w")]), (2, vec![("first-element", "1")]), ], @@ -732,6 +732,55 @@ fn test_query_matches_with_nested_repetitions() { }); } +#[test] +fn test_query_matches_with_multiple_repetition_patterns_that_intersect_other_pattern() { + allocations::record(|| { + let language = get_language("javascript"); + + // When this query sees a comment, it must keep track of several potential + // matches: up to two for each pattern that begins with a comment. 
+ let query = Query::new( + language, + r#" + (call_expression + function: (member_expression + property: (property_identifier) @name)) @ref.method + + ((comment)* @doc (function_declaration)) + ((comment)* @doc (generator_function_declaration)) + ((comment)* @doc (class_declaration)) + ((comment)* @doc (lexical_declaration)) + ((comment)* @doc (variable_declaration)) + ((comment)* @doc (method_definition)) + + (comment) @comment + "#, + ) + .unwrap(); + + // Here, a series of comments occurs in the middle of a match of the first + // pattern. To avoid exceeding the storage limits and discarding that outer + // match, the comment-related matches need to be managed efficiently. + let source = format!( + "theObject\n{}\n.theMethod()", + " // the comment\n".repeat(64) + ); + + assert_query_matches( + language, + &query, + &source, + &vec![(7, vec![("comment", "// the comment")]); 64] + .into_iter() + .chain(vec![( + 0, + vec![("ref.method", source.as_str()), ("name", "theMethod")], + )]) + .collect::>(), + ); + }); +} + #[test] fn test_query_matches_with_leading_zero_or_more_repeated_leaf_nodes() { allocations::record(|| { diff --git a/lib/src/query.c b/lib/src/query.c index c839c299..8c8bd4c3 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -48,7 +48,6 @@ typedef struct { uint16_t alternative_index; uint16_t depth; bool contains_captures: 1; - bool is_pattern_start: 1; bool is_immediate: 1; bool is_last_child: 1; bool is_pass_through: 1; @@ -449,7 +448,6 @@ static QueryStep query_step__new( .alternative_index = NONE, .contains_captures = false, .is_last_child = false, - .is_pattern_start = false, .is_pass_through = false, .is_dead_end = false, .is_immediate = is_immediate, @@ -547,6 +545,23 @@ static inline void ts_query__pattern_map_insert( ) { uint32_t index; ts_query__pattern_map_search(self, symbol, &index); + + // Ensure that the entries are sorted not only by symbol, but also + // by pattern_index. 
This way, states for earlier patterns will be + // initiated first, which allows the ordering of the states array + // to be maintained more efficiently. + while (index < self->pattern_map.size) { + PatternEntry *entry = &self->pattern_map.contents[index]; + if ( + self->steps.contents[entry->step_index].symbol == symbol && + entry->pattern_index < pattern_index + ) { + index++; + } else { + break; + } + } + array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, .pattern_index = pattern_index, @@ -1168,7 +1183,6 @@ TSQuery *ts_query_new( // Maintain a map that can look up patterns for a given root symbol. for (;;) { QueryStep *step = &self->steps.contents[start_step_index]; - step->is_pattern_start = true; ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); if (step->symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; @@ -1178,6 +1192,7 @@ TSQuery *ts_query_new( // then add multiple entries to the pattern map. if (step->alternative_index != NONE) { start_step_index = step->alternative_index; + step->alternative_index = NONE; } else { break; } @@ -1460,27 +1475,62 @@ void ts_query_cursor__compare_captures( } } -static bool ts_query_cursor__add_state( +static void ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + uint32_t start_depth = self->depth - step->depth; + + // Keep the states array in ascending order of start_depth and pattern_index, + // so that it can be processed more efficiently elsewhere. Usually, there is + // no work to do here because of two facts: + // * States with lower start_depth are naturally added first due to the + // order in which nodes are visited. + // * Earlier patterns are naturally added first because of the ordering of the + // pattern_map data structure that's used to initiate matches. 
+ // + // This loop is only needed in cases where two conditions hold: + // * A pattern consists of more than one sibling node, so that its states + // remain in progress after exiting the node that started the match. + // * The first node in the pattern matches against multiple nodes at the + // same depth. + // + // An example of this is the pattern '((comment)* (function))'. If multiple + // `comment` nodes appear in a row, then we may initiate a new state for this + // pattern while another state for the same pattern is already in progress. + // If there are multiple patterns like this in a query, then this loop will + // need to execute in order to keep the states ordered by pattern_index. + uint32_t index = self->states.size; + while (index > 0) { + QueryState *prev_state = &self->states.contents[index - 1]; + if (prev_state->start_depth < start_depth) break; + if (prev_state->start_depth == start_depth) { + if (prev_state->pattern_index < pattern->pattern_index) break; + if (prev_state->pattern_index == pattern->pattern_index) { + // Avoid unnecessarily inserting an unnecessary duplicate state, + // which would be immediately pruned by the longest-match criteria. + if (prev_state->step_index == pattern->step_index) return; + } + } + index--; + } + LOG( " start state. pattern:%u, step:%u\n", pattern->pattern_index, pattern->step_index ); - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - array_push(&self->states, ((QueryState) { + array_insert(&self->states, index, ((QueryState) { .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, - .start_depth = self->depth - step->depth, + .start_depth = start_depth, .consumed_capture_count = 0, - .seeking_immediate_match = false, + .seeking_immediate_match = true, .has_in_progress_alternatives = false, .dead = false, })); - return true; } // Acquire a capture list for this state. 
If there are no capture lists left in the @@ -1682,7 +1732,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query_cursor__add_state(self, pattern)) break; + ts_query_cursor__add_state(self, pattern); } // Add new states for any patterns whose root node matches this node. @@ -1694,7 +1744,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query_cursor__add_state(self, pattern)) break; + ts_query_cursor__add_state(self, pattern); // Advance to the next pattern whose root node matches this node. i++; @@ -1762,11 +1812,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // parent, then this query state cannot simply be updated in place. It must be // split into two states: one that matches this node, and one which skips over // this node, to preserve the possibility of matching later siblings. - if ( - later_sibling_can_match && - !step->is_pattern_start && - step->contains_captures - ) { + if (later_sibling_can_match && step->contains_captures) { if (ts_query_cursor__copy_state(self, &state)) { LOG( " split state for capture. pattern:%u, step:%u\n", @@ -1822,25 +1868,27 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { continue; } - QueryState *copy = ts_query_cursor__copy_state(self, &state); if (next_step->is_pass_through) { state->step_index++; j--; } + + QueryState *copy = ts_query_cursor__copy_state(self, &state); if (copy) { - copy_count++; + LOG( + " split state for branch. 
pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", + copy->pattern_index, + copy->step_index, + next_step->alternative_index, + next_step->alternative_is_immediate, + capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + ); end_index++; + copy_count++; copy->step_index = next_step->alternative_index; if (next_step->alternative_is_immediate) { copy->seeking_immediate_match = true; } - LOG( - " split state for branch. pattern:%u, step:%u, step:%u, immediate:%d\n", - copy->pattern_index, - state->step_index, - copy->step_index, - copy->seeking_immediate_match - ); } } } @@ -1860,13 +1908,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { bool did_remove = false; for (unsigned j = i + 1; j < self->states.size; j++) { QueryState *other_state = &self->states.contents[j]; - if (other_state->dead) { - array_erase(&self->states, j); - j--; - continue; - } - // When query states are copied in order + // Query states are kept in ascending order of start_depth and pattern_index. + // Since the longest-match criteria is only used for deduping matches of the same + // pattern and root node, we only need to perform pairwise comparisons within a + // small slice of the states array. if ( other_state->start_depth != state->start_depth || other_state->pattern_index != state->pattern_index @@ -1914,6 +1960,13 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If there the state is at the end of its pattern, remove it from the list // of in-progress states and add it to the list of finished states. if (!did_remove) { + LOG( + " keep state. 
pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", + state->pattern_index, + state->start_depth, + state->step_index, + capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size + ); QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (next_step->depth == PATTERN_DONE_MARKER) { if (state->has_in_progress_alternatives) { From 1a571ae20877c7bfac1fa59f0cc38027fe669685 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Tue, 4 Aug 2020 17:53:47 -0400 Subject: [PATCH 62/71] Add errors_present field to tagging context. --- tags/src/c_lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 07e1e19a..b93c69a2 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -52,6 +52,7 @@ pub struct TSTagsBuffer { context: TagsContext, tags: Vec, docs: Vec, + errors_present: bool, } #[no_mangle] @@ -184,6 +185,7 @@ pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { context: TagsContext::new(), tags: Vec::with_capacity(64), docs: Vec::with_capacity(64), + errors_present: false, })) } @@ -216,6 +218,12 @@ pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 { buffer.docs.len() as u32 } +#[no_mangle] +pub extern "C" fn ts_tagger_errors_present(this: *const TSTagsBuffer) -> bool { + let buffer = unwrap_ptr(this); + buffer.errors_present +} + #[no_mangle] pub extern "C" fn ts_tagger_syntax_kinds_for_scope_name( this: *mut TSTagger, From 5a52dc2cd700170196753481db1e8aa261e50d50 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 11:18:59 -0400 Subject: [PATCH 63/71] Return an iterator-bool tuple instead of just an iterator. 
--- cli/src/tags.rs | 3 ++- tags/src/c_lib.rs | 5 ++++- tags/src/lib.rs | 6 +++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 5ea00f39..5e999693 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -53,7 +53,8 @@ pub fn generate_tags( let source = fs::read(path)?; let t0 = Instant::now(); - for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))? { + let (tagged, _) = context.generate_tags(tags_config, &source, Some(&cancellation_flag))?; + for tag in tagged { let tag = tag?; if !quiet { write!( diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index b93c69a2..84f8c97b 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -126,7 +126,10 @@ pub extern "C" fn ts_tagger_tag( .context .generate_tags(config, source_code, cancellation_flag) { - Ok(tags) => tags, + Ok((tags, found_error)) => { + buffer.errors_present = found_error; + tags + } Err(e) => { return match e { Error::InvalidLanguage => TSTagsError::InvalidLanguage, diff --git a/tags/src/lib.rs b/tags/src/lib.rs index c247c13e..dd55d4be 100644 --- a/tags/src/lib.rs +++ b/tags/src/lib.rs @@ -255,7 +255,7 @@ impl TagsContext { config: &'a TagsConfiguration, source: &'a [u8], cancellation_flag: Option<&'a AtomicUsize>, - ) -> Result> + 'a, Error> { + ) -> Result<(impl Iterator> + 'a, bool), Error> { self.parser .set_language(config.language) .map_err(|_| Error::InvalidLanguage)?; @@ -271,7 +271,7 @@ impl TagsContext { .matches(&config.query, tree_ref.root_node(), move |node| { &source[node.byte_range()] }); - Ok(TagsIter { + Ok((TagsIter { _tree: tree, matches, source, @@ -285,7 +285,7 @@ impl TagsContext { inherits: false, local_defs: Vec::new(), }], - }) + }, tree_ref.root_node().has_error())) } } From f4108056b0b5be57441493a279cb22fc3fd95829 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 11:33:04 -0400 Subject: [PATCH 64/71] Remove otiose pattern match. 
--- cli/src/tags.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cli/src/tags.rs b/cli/src/tags.rs index 5e999693..122b58d2 100644 --- a/cli/src/tags.rs +++ b/cli/src/tags.rs @@ -53,8 +53,7 @@ pub fn generate_tags( let source = fs::read(path)?; let t0 = Instant::now(); - let (tagged, _) = context.generate_tags(tags_config, &source, Some(&cancellation_flag))?; - for tag in tagged { + for tag in context.generate_tags(tags_config, &source, Some(&cancellation_flag))?.0 { let tag = tag?; if !quiet { write!( From 5c86a9c654b7f2be39f55039ad114f277aa64a64 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 11:52:07 -0400 Subject: [PATCH 65/71] Fix the tests --- cli/src/tests/tags_test.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 3ff1c92b..88e57ec1 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -102,6 +102,7 @@ fn test_tags_python() { let tags = tag_context .generate_tags(&tags_config, source, None) .unwrap() + .0 .collect::, _>>() .unwrap(); @@ -153,6 +154,7 @@ fn test_tags_javascript() { let tags = tag_context .generate_tags(&tags_config, source, None) .unwrap() + .0 .collect::, _>>() .unwrap(); @@ -189,6 +191,7 @@ fn test_tags_columns_measured_in_utf16_code_units() { let tag = tag_context .generate_tags(&tags_config, source, None) .unwrap() + .0 .next() .unwrap() .unwrap(); @@ -229,6 +232,7 @@ fn test_tags_ruby() { let tags = tag_context .generate_tags(&tags_config, source.as_bytes(), None) .unwrap() + .0 .collect::, _>>() .unwrap(); @@ -271,7 +275,7 @@ fn test_tags_cancellation() { .generate_tags(&tags_config, source.as_bytes(), Some(&cancellation_flag)) .unwrap(); - for (i, tag) in tags.enumerate() { + for (i, tag) in tags.0.enumerate() { if i == 150 { cancellation_flag.store(1, Ordering::SeqCst); } From 32f69dbe156030de5ae589d968efc2825bd0485f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 5 Aug 2020 
09:06:00 -0700 Subject: [PATCH 66/71] tags, highlight: Limit the size of buffers that are retained in memory --- highlight/src/lib.rs | 16 ++++++++++++---- tags/src/c_lib.rs | 12 ++++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index bb110219..1cffefa2 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -10,6 +10,8 @@ use tree_sitter::{ }; const CANCELLATION_CHECK_INTERVAL: usize = 100; +const BUFFER_HTML_RESERVE_CAPACITY: usize = 10 * 1024; +const BUFFER_LINES_RESERVE_CAPACITY: usize = 1000; /// Indicates which highlight should be applied to a region of source code. #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -884,11 +886,13 @@ where impl HtmlRenderer { pub fn new() -> Self { - HtmlRenderer { - html: Vec::new(), - line_offsets: vec![0], + let mut result = HtmlRenderer { + html: Vec::with_capacity(BUFFER_HTML_RESERVE_CAPACITY), + line_offsets: Vec::with_capacity(BUFFER_LINES_RESERVE_CAPACITY), carriage_return_highlight: None, - } + }; + result.line_offsets.push(0); + result } pub fn set_carriage_return_highlight(&mut self, highlight: Option) { @@ -896,6 +900,10 @@ impl HtmlRenderer { } pub fn reset(&mut self) { + self.html.truncate(BUFFER_HTML_RESERVE_CAPACITY); + self.line_offsets.truncate(BUFFER_LINES_RESERVE_CAPACITY); + self.html.shrink_to_fit(); + self.line_offsets.shrink_to_fit(); self.html.clear(); self.line_offsets.clear(); self.line_offsets.push(0); diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 07e1e19a..c2bec6ca 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -6,6 +6,9 @@ use std::sync::atomic::AtomicUsize; use std::{fmt, slice, str}; use tree_sitter::Language; +const BUFFER_TAGS_RESERVE_CAPACITY: usize = 100; +const BUFFER_DOCS_RESERVE_CAPACITY: usize = 1024; + #[repr(C)] #[derive(Debug, PartialEq, Eq)] pub enum TSTagsError { @@ -116,8 +119,13 @@ pub extern "C" fn ts_tagger_tag( let scope_name = unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) 
}; if let Some(config) = tagger.languages.get(scope_name) { + buffer.tags.truncate(BUFFER_TAGS_RESERVE_CAPACITY); + buffer.docs.truncate(BUFFER_DOCS_RESERVE_CAPACITY); + buffer.tags.shrink_to_fit(); + buffer.docs.shrink_to_fit(); buffer.tags.clear(); buffer.docs.clear(); + let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) }; let cancellation_flag = unsafe { cancellation_flag.as_ref() }; @@ -182,8 +190,8 @@ pub extern "C" fn ts_tagger_tag( pub extern "C" fn ts_tags_buffer_new() -> *mut TSTagsBuffer { Box::into_raw(Box::new(TSTagsBuffer { context: TagsContext::new(), - tags: Vec::with_capacity(64), - docs: Vec::with_capacity(64), + tags: Vec::with_capacity(BUFFER_TAGS_RESERVE_CAPACITY), + docs: Vec::with_capacity(BUFFER_DOCS_RESERVE_CAPACITY), })) } From 94ab884ee4d0b965c8c16212979e15927976f068 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 12:16:09 -0400 Subject: [PATCH 67/71] Add a test. --- cli/src/tests/tags_test.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cli/src/tests/tags_test.rs b/cli/src/tests/tags_test.rs index 88e57ec1..2b058c0b 100644 --- a/cli/src/tests/tags_test.rs +++ b/cli/src/tests/tags_test.rs @@ -297,6 +297,39 @@ fn test_invalid_capture() { assert_eq!(e, Error::InvalidCapture("method".to_string())); } +#[test] +fn test_tags_with_parse_error() { + let language = get_language("python"); + let tags_config = TagsConfiguration::new(language, PYTHON_TAG_QUERY, "").unwrap(); + let mut tag_context = TagsContext::new(); + + let source = br#" + class Fine: pass + class Bad + "#; + + let (tags, failed) = tag_context + .generate_tags(&tags_config, source, None) + .unwrap(); + + let newtags = tags.collect::, _>>().unwrap(); + + assert!(failed, "syntax error should have been detected"); + + assert_eq!( + newtags.iter() + .map(|t| ( + substr(source, &t.name_range), + tags_config.syntax_type_name(t.syntax_type_id) + )) + .collect::>(), + &[ + ("Fine", "class"), 
+ ] + ); +} + + #[test] fn test_tags_via_c_api() { allocations::record(|| { From 7576b0b4485343902f54ab1dbe0464dd7ef4f920 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 12:21:42 -0400 Subject: [PATCH 68/71] Add accessor to the C header. --- tags/include/tree_sitter/tags.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index f2b17075..42109bee 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -88,6 +88,9 @@ uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); // Get the syntax kinds for a scope. const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len); +// Determine whether a parse error was encountered while tagging. +bool ts_tagger_errors_present(); + #ifdef __cplusplus } #endif From ec6af791af5761130238134e935ad6236aeb151c Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 12:24:39 -0400 Subject: [PATCH 69/71] Bikeshed this name a little bit. --- tags/include/tree_sitter/tags.h | 2 +- tags/src/c_lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 42109bee..773113d7 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -89,7 +89,7 @@ uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len); // Determine whether a parse error was encountered while tagging. 
-bool ts_tagger_errors_present(); +bool ts_tags_buffer_found_parse_error(); #ifdef __cplusplus } diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index 84f8c97b..8cb5abb4 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -222,7 +222,7 @@ pub extern "C" fn ts_tags_buffer_docs_len(this: *const TSTagsBuffer) -> u32 { } #[no_mangle] -pub extern "C" fn ts_tagger_errors_present(this: *const TSTagsBuffer) -> bool { +pub extern "C" fn ts_tags_buffer_found_parse_error(this: *const TSTagsBuffer) -> bool { let buffer = unwrap_ptr(this); buffer.errors_present } From f91b19c08947aad20e095a4103cf144794baf16d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 5 Aug 2020 09:57:45 -0700 Subject: [PATCH 70/71] tags, highlight: Avoid completely deallocating buffers when shrinking --- highlight/src/lib.rs | 16 ++++++++++------ tags/src/c_lib.rs | 16 ++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index 1cffefa2..e4aebbfb 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -900,12 +900,8 @@ impl HtmlRenderer { } pub fn reset(&mut self) { - self.html.truncate(BUFFER_HTML_RESERVE_CAPACITY); - self.line_offsets.truncate(BUFFER_LINES_RESERVE_CAPACITY); - self.html.shrink_to_fit(); - self.line_offsets.shrink_to_fit(); - self.html.clear(); - self.line_offsets.clear(); + shrink_and_clear(&mut self.html, BUFFER_HTML_RESERVE_CAPACITY); + shrink_and_clear(&mut self.line_offsets, BUFFER_LINES_RESERVE_CAPACITY); self.line_offsets.push(0); } @@ -1069,3 +1065,11 @@ fn injection_for_match<'a>( (language_name, content_node, include_children) } + +fn shrink_and_clear(vec: &mut Vec, capacity: usize) { + if vec.len() > capacity { + vec.truncate(capacity); + vec.shrink_to_fit(); + } + vec.clear(); +} diff --git a/tags/src/c_lib.rs b/tags/src/c_lib.rs index c2bec6ca..b0786580 100644 --- a/tags/src/c_lib.rs +++ b/tags/src/c_lib.rs @@ -119,12 +119,8 @@ pub extern "C" fn ts_tagger_tag( let scope_name = 
unsafe { unwrap(CStr::from_ptr(scope_name).to_str()) }; if let Some(config) = tagger.languages.get(scope_name) { - buffer.tags.truncate(BUFFER_TAGS_RESERVE_CAPACITY); - buffer.docs.truncate(BUFFER_DOCS_RESERVE_CAPACITY); - buffer.tags.shrink_to_fit(); - buffer.docs.shrink_to_fit(); - buffer.tags.clear(); - buffer.docs.clear(); + shrink_and_clear(&mut buffer.tags, BUFFER_TAGS_RESERVE_CAPACITY); + shrink_and_clear(&mut buffer.docs, BUFFER_DOCS_RESERVE_CAPACITY); let source_code = unsafe { slice::from_raw_parts(source_code, source_code_len as usize) }; let cancellation_flag = unsafe { cancellation_flag.as_ref() }; @@ -262,3 +258,11 @@ fn unwrap(result: Result) -> T { abort(); }) } + +fn shrink_and_clear(vec: &mut Vec, capacity: usize) { + if vec.len() > capacity { + vec.truncate(capacity); + vec.shrink_to_fit(); + } + vec.clear(); +} From 8d58a0d33a070af73dd6548d8000e0e7ddd04331 Mon Sep 17 00:00:00 2001 From: Patrick Thomson Date: Wed, 5 Aug 2020 13:10:02 -0400 Subject: [PATCH 71/71] Add parameter in the header. --- tags/include/tree_sitter/tags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tags/include/tree_sitter/tags.h b/tags/include/tree_sitter/tags.h index 773113d7..4784abbb 100644 --- a/tags/include/tree_sitter/tags.h +++ b/tags/include/tree_sitter/tags.h @@ -89,7 +89,7 @@ uint32_t ts_tags_buffer_docs_len(const TSTagsBuffer *); const char **ts_tagger_syntax_kinds_for_scope_name(const TSTagger *, const char *scope_name, uint32_t *len); // Determine whether a parse error was encountered while tagging. -bool ts_tags_buffer_found_parse_error(); +bool ts_tags_buffer_found_parse_error(const TSTagsBuffer*); #ifdef __cplusplus }