diff --git a/cli/src/loader.rs b/cli/src/loader.rs index 34e0e164..d29ba1fe 100644 --- a/cli/src/loader.rs +++ b/cli/src/loader.rs @@ -20,27 +20,23 @@ const DYLIB_EXTENSION: &'static str = "dll"; const BUILD_TARGET: &'static str = env!("BUILD_TARGET"); -struct LanguageRepo { - path: PathBuf, - language: OnceCell, - configurations: Vec, -} - #[derive(Default)] pub struct LanguageConfiguration { - scope: Option, - _content_regex: Option, - _first_line_regex: Option, - injection_regex: Option, - file_types: Vec, - highlight_property_sheet_path: Option, + pub scope: Option, + pub content_regex: Option, + pub _first_line_regex: Option, + pub injection_regex: Option, + pub file_types: Vec, + pub highlight_property_sheet_path: Option, + language_id: usize, highlight_property_sheet: OnceCell>>, } pub struct Loader { parser_lib_path: PathBuf, - language_repos: Vec, - language_configuration_ids_by_file_type: HashMap>, + languages_by_id: Vec<(PathBuf, OnceCell)>, + language_configurations: Vec, + language_configuration_ids_by_file_type: HashMap>, } unsafe impl Send for Loader {} @@ -50,7 +46,8 @@ impl Loader { pub fn new(parser_lib_path: PathBuf) -> Self { Loader { parser_lib_path, - language_repos: Vec::new(), + languages_by_id: Vec::new(), + language_configurations: Vec::new(), language_configuration_ids_by_file_type: HashMap::new(), } } @@ -62,8 +59,10 @@ impl Loader { let entry = entry?; if let Some(parser_dir_name) = entry.file_name().to_str() { if parser_dir_name.starts_with("tree-sitter-") { - self.find_language_at_path(&parser_container_dir.join(parser_dir_name)) - .ok(); + self.find_language_configurations_at_path( + &parser_container_dir.join(parser_dir_name), + ) + .ok(); } } } @@ -72,24 +71,38 @@ impl Loader { Ok(()) } - pub fn language_at_path(&mut self, path: &Path) -> Result> { - if let Ok(id) = self.find_language_at_path(path) { - Ok(Some(self.language_for_id(id)?.0)) + pub fn languages_at_path(&mut self, path: &Path) -> Result> { + if let Ok(configurations) = 
self.find_language_configurations_at_path(path) { + let mut language_ids = configurations + .iter() + .map(|c| c.language_id) + .collect::>(); + language_ids.sort(); + language_ids.dedup(); + language_ids + .into_iter() + .map(|id| self.language_for_id(id)) + .collect::>>() } else { - Ok(None) + Ok(Vec::new()) } } + pub fn get_all_language_configurations(&self) -> Vec<(&LanguageConfiguration, &Path)> { + self.language_configurations + .iter() + .map(|c| (c, self.languages_by_id[c.language_id].0.as_ref())) + .collect() + } + pub fn language_configuration_for_scope( &self, scope: &str, ) -> Result> { - for (i, repo) in self.language_repos.iter().enumerate() { - for configuration in &repo.configurations { - if configuration.scope.as_ref().map_or(false, |s| s == scope) { - let (language, _) = self.language_for_id(i)?; - return Ok(Some((language, &configuration))); - } + for configuration in &self.language_configurations { + if configuration.scope.as_ref().map_or(false, |s| s == scope) { + let language = self.language_for_id(configuration.language_id)?; + return Ok(Some((language, configuration))); } } Ok(None) @@ -99,7 +112,9 @@ impl Loader { &self, path: &Path, ) -> Result> { - let ids = path + // Find all the language configurations that match this file name + // or a suffix of the file name. 
+ let configuration_ids = path .file_name() .and_then(|n| n.to_str()) .and_then(|file_name| self.language_configuration_ids_by_file_type.get(file_name)) @@ -110,13 +125,57 @@ impl Loader { self.language_configuration_ids_by_file_type.get(extension) }) }); - if let Some(ids) = ids { - // TODO use `content-regex` to pick one - for (repo_id, configuration_id) in ids.iter().cloned() { - let (language, configurations) = self.language_for_id(repo_id)?; - return Ok(Some((language, &configurations[configuration_id]))); + + if let Some(configuration_ids) = configuration_ids { + if !configuration_ids.is_empty() { + let configuration; + + // If there is only one language configuration, then use it. + if configuration_ids.len() == 1 { + configuration = &self.language_configurations[configuration_ids[0]]; + } + + // If multiple language configurations match, then determine which + // one to use by applying the configurations' content regexes. + else { + let file_contents = fs::read_to_string(path)?; + let mut best_score = -2isize; + let mut best_configuration_id = None; + for configuration_id in configuration_ids { + let config = &self.language_configurations[*configuration_id]; + + // If the language configuration has a content regex, assign + // a score based on the length of the first match. + let score; + if let Some(content_regex) = &config.content_regex { + if let Some(mat) = content_regex.find(&file_contents) { + score = (mat.end() - mat.start()) as isize; + } + + // If the content regex does not match, then *penalize* this + // language configuration, so that language configurations + // without content regexes are preferred over those with + // non-matching content regexes. 
+ else { + score = -1; + } + } else { + score = 0; + } + if score > best_score { + best_configuration_id = Some(*configuration_id); + best_score = score; + } + } + + configuration = &self.language_configurations[best_configuration_id.unwrap()]; + } + + let language = self.language_for_id(configuration.language_id)?; + return Ok(Some((language, configuration))); } } + Ok(None) } @@ -126,34 +185,35 @@ impl Loader { ) -> Result> { let mut best_match_length = 0; let mut best_match_position = None; - for (i, repo) in self.language_repos.iter().enumerate() { - for (j, configuration) in repo.configurations.iter().enumerate() { - if let Some(injection_regex) = &configuration.injection_regex { - if let Some(mat) = injection_regex.find(string) { - let length = mat.end() - mat.start(); - if length > best_match_length { - best_match_position = Some((i, j)); - best_match_length = length; - } + for (i, configuration) in self.language_configurations.iter().enumerate() { + if let Some(injection_regex) = &configuration.injection_regex { + if let Some(mat) = injection_regex.find(string) { + let length = mat.end() - mat.start(); + if length > best_match_length { + best_match_position = Some(i); + best_match_length = length; } } } } - if let Some((i, j)) = best_match_position { - let (language, configurations) = self.language_for_id(i)?; - Ok(Some((language, &configurations[j]))) + + if let Some(i) = best_match_position { + let configuration = &self.language_configurations[i]; + let language = self.language_for_id(configuration.language_id)?; + Ok(Some((language, configuration))) } else { Ok(None) } } - fn language_for_id(&self, id: usize) -> Result<(Language, &Vec)> { - let repo = &self.language_repos[id]; - let language = repo.language.get_or_try_init(|| { - let src_path = repo.path.join("src"); - self.load_language_at_path(&src_path, &src_path) - })?; - Ok((*language, &self.language_repos[id].configurations)) + fn language_for_id(&self, id: usize) -> Result { + let (path, language) 
= &self.languages_by_id[id]; + language + .get_or_try_init(|| { + let src_path = path.join("src"); + self.load_language_at_path(&src_path, &src_path) + }) + .map(|l| *l) } pub fn load_language_at_path(&self, src_path: &Path, header_path: &Path) -> Result { @@ -278,9 +338,14 @@ impl Loader { Ok(language) } - fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> Result { + fn find_language_configurations_at_path<'a>( + &'a mut self, + parser_path: &Path, + ) -> Result<&[LanguageConfiguration]> { #[derive(Deserialize)] struct LanguageConfigurationJSON { + #[serde(default)] + path: PathBuf, scope: Option, #[serde(rename = "file-types")] file_types: Option>, @@ -295,57 +360,75 @@ impl Loader { #[derive(Deserialize)] struct PackageJSON { + #[serde(default)] #[serde(rename = "tree-sitter")] - tree_sitter: Option>, + tree_sitter: Vec, } - let mut configurations = vec![LanguageConfiguration::default()]; + let initial_language_configuration_count = self.language_configurations.len(); + if let Ok(package_json_contents) = fs::read_to_string(&parser_path.join("package.json")) { let package_json = serde_json::from_str::(&package_json_contents); if let Ok(package_json) = package_json { - configurations = package_json - .tree_sitter - .map_or(Vec::new(), |configurations| { - configurations - .into_iter() - .map(|conf| LanguageConfiguration { - scope: conf.scope, - file_types: conf.file_types.unwrap_or(Vec::new()), - _content_regex: conf.content_regex.and_then(|r| { - RegexBuilder::new(&r).multi_line(true).build().ok() - }), - _first_line_regex: conf.first_line_regex.and_then(|r| { - RegexBuilder::new(&r).multi_line(true).build().ok() - }), - injection_regex: conf.injection_regex.and_then(|r| { - RegexBuilder::new(&r).multi_line(true).build().ok() - }), - highlight_property_sheet_path: conf - .highlights - .map(|h| parser_path.join(h)), - highlight_property_sheet: OnceCell::new(), - }) - .collect() + if package_json.tree_sitter.is_empty() { + return Ok(&[]); + } + + let 
language_count = self.languages_by_id.len(); + for config_json in package_json.tree_sitter { + // Determine the path to the parser directory. This can be specified in + // the package.json, but defaults to the directory containing the package.json. + let language_path = parser_path.join(config_json.path); + + // Determine if a previous language configuration in this package.json file + // already uses the same language. + let mut language_id = None; + for (id, (path, _)) in + self.languages_by_id.iter().enumerate().skip(language_count) + { + if language_path == *path { + language_id = Some(id); + } + } + + // If not, add a new language path to the list. + let language_id = language_id.unwrap_or_else(|| { + self.languages_by_id.push((language_path, OnceCell::new())); + self.languages_by_id.len() - 1 }); - for (i, configuration) in configurations.iter().enumerate() { + let configuration = LanguageConfiguration { + scope: config_json.scope, + language_id, + file_types: config_json.file_types.unwrap_or(Vec::new()), + content_regex: config_json + .content_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + _first_line_regex: config_json + .first_line_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + injection_regex: config_json + .injection_regex + .and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()), + highlight_property_sheet_path: config_json + .highlights + .map(|h| parser_path.join(h)), + highlight_property_sheet: OnceCell::new(), + }; + for file_type in &configuration.file_types { self.language_configuration_ids_by_file_type .entry(file_type.to_string()) .or_insert(Vec::new()) - .push((self.language_repos.len(), i)); + .push(self.language_configurations.len()); } + + self.language_configurations.push(configuration); } } } - self.language_repos.push(LanguageRepo { - path: parser_path.to_owned(), - language: OnceCell::new(), - configurations, - }); - - Ok(self.language_repos.len() - 1) + 
Ok(&self.language_configurations[initial_language_configuration_count..]) } } diff --git a/cli/src/main.rs b/cli/src/main.rs index 23e7fc1a..6b187fc7 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,8 +1,8 @@ use clap::{App, AppSettings, Arg, SubCommand}; use error::Error; -use std::{env, fs, u64}; use std::path::Path; use std::process::exit; +use std::{env, fs, u64}; use tree_sitter_cli::{ config, error, generate, highlight, loader, logger, parse, test, wasm, web_ui, }; @@ -102,6 +102,10 @@ fn run() -> error::Result<()> { .subcommand( SubCommand::with_name("web-ui").about("Test a parser interactively in the browser"), ) + .subcommand( + SubCommand::with_name("dump-languages") + .about("Print info about all known language parsers"), + ) .get_matches(); let home_dir = dirs::home_dir().expect("Failed to read home directory"); @@ -124,8 +128,8 @@ fn run() -> error::Result<()> { let debug_graph = matches.is_present("debug-graph"); let filter = matches.value_of("filter"); let corpus_path = current_dir.join("corpus"); - if let Some(language) = loader.language_at_path(&current_dir)? { - test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?; + if let Some(language) = loader.languages_at_path(&current_dir)?.first() { + test::run_tests_at_path(*language, &corpus_path, debug, debug_graph, filter)?; } else { eprintln!("No language found"); } @@ -173,12 +177,13 @@ fn run() -> error::Result<()> { }))? { lang - } else if let Some(lang) = - loader - .language_at_path(&current_dir) - .map_err(Error::wrap(|| { - "Failed to load language in current directory" - }))? + } else if let Some(lang) = loader + .languages_at_path(&current_dir) + .map_err(Error::wrap(|| { + "Failed to load language in current directory" + }))?
+ .first() + .cloned() { lang } else { @@ -251,6 +256,19 @@ fn run() -> error::Result<()> { wasm::compile_language_to_wasm(&grammar_path, matches.is_present("docker"))?; } else if matches.subcommand_matches("web-ui").is_some() { web_ui::serve(&current_dir); + } else if matches.subcommand_matches("dump-languages").is_some() { + loader.find_all_languages(&config.parser_directories)?; + for (configuration, language_path) in loader.get_all_language_configurations() { + println!( + "scope: {}\nparser: {:?}\nproperties: {:?}\nfile_types: {:?}\ncontent_regex: {:?}\ninjection_regex: {:?}\n", + configuration.scope.as_ref().unwrap_or(&String::new()), + language_path, + configuration.highlight_property_sheet_path, + configuration.file_types, + configuration.content_regex, + configuration.injection_regex, + ); + } } Ok(()) diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 9db71d8e..65c38645 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -21,7 +21,7 @@ use std::{fmt, ptr, slice, str, u16}; pub const LANGUAGE_VERSION: usize = ffi::TREE_SITTER_LANGUAGE_VERSION; pub const PARSER_HEADER: &'static str = include_str!("../include/tree_sitter/parser.h"); -#[derive(Clone, Copy)] +#[derive(Clone, Copy, PartialEq, Eq)] #[repr(transparent)] pub struct Language(*const ffi::TSLanguage);