cli: Handle multi-parser repos, content-regex property

Prompted by tree-sitter/tree-sitter-typescript#68
This commit is contained in:
Max Brunsfeld 2019-08-07 17:41:45 -07:00
parent 7005d8b9d9
commit 93f7de03e2
3 changed files with 198 additions and 97 deletions

View file

@ -20,27 +20,23 @@ const DYLIB_EXTENSION: &'static str = "dll";
const BUILD_TARGET: &'static str = env!("BUILD_TARGET");
/// One parser repository: the directory it lives in, its language, and the
/// language configurations that belong to it.
///
/// `language` starts out empty; the `language_for_id` variant that reads
/// `language_repos` fills it on first use by loading from `<path>/src`.
struct LanguageRepo {
path: PathBuf,
language: OnceCell<Language>,
configurations: Vec<LanguageConfiguration>,
}
/// A single language configuration, built from one entry of the
/// `tree-sitter` array in a parser's `package.json`.
///
/// Each optional regex field is `None` when the entry omits the pattern or
/// when the pattern fails to compile.
#[derive(Default)]
pub struct LanguageConfiguration {
scope: Option<String>,
_content_regex: Option<Regex>,
_first_line_regex: Option<Regex>,
injection_regex: Option<Regex>,
file_types: Vec<String>,
highlight_property_sheet_path: Option<PathBuf>,
pub scope: Option<String>,
pub content_regex: Option<Regex>,
pub _first_line_regex: Option<Regex>,
pub injection_regex: Option<Regex>,
pub file_types: Vec<String>,
pub highlight_property_sheet_path: Option<PathBuf>,
// Index of this configuration's language in `Loader::languages_by_id`.
language_id: usize,
highlight_property_sheet: OnceCell<Option<PropertySheet<Properties>>>,
}
/// Keeps track of language parsers and their configurations, together with
/// a lookup table from file type to the configurations that declare it.
pub struct Loader {
parser_lib_path: PathBuf,
language_repos: Vec<LanguageRepo>,
// File type -> (index into `language_repos`, index into that repo's
// `configurations`).
language_configuration_ids_by_file_type: HashMap<String, Vec<(usize, usize)>>,
// (parser directory, language loaded on first use) pairs; a configuration's
// `language_id` is an index into this list.
languages_by_id: Vec<(PathBuf, OnceCell<Language>)>,
language_configurations: Vec<LanguageConfiguration>,
// File type -> indices into `language_configurations`.
language_configuration_ids_by_file_type: HashMap<String, Vec<usize>>,
}
unsafe impl Send for Loader {}
@ -50,7 +46,8 @@ impl Loader {
/// Creates a `Loader` that stores `parser_lib_path` and starts with every
/// collection empty, so nothing is known until a search method populates it.
pub fn new(parser_lib_path: PathBuf) -> Self {
Loader {
parser_lib_path,
language_repos: Vec::new(),
languages_by_id: Vec::new(),
language_configurations: Vec::new(),
language_configuration_ids_by_file_type: HashMap::new(),
}
}
@ -62,8 +59,10 @@ impl Loader {
let entry = entry?;
if let Some(parser_dir_name) = entry.file_name().to_str() {
if parser_dir_name.starts_with("tree-sitter-") {
self.find_language_at_path(&parser_container_dir.join(parser_dir_name))
.ok();
self.find_language_configurations_at_path(
&parser_container_dir.join(parser_dir_name),
)
.ok();
}
}
}
@ -72,24 +71,38 @@ impl Loader {
Ok(())
}
pub fn language_at_path(&mut self, path: &Path) -> Result<Option<Language>> {
if let Ok(id) = self.find_language_at_path(path) {
Ok(Some(self.language_for_id(id)?.0))
/// Returns the distinct languages used by the configurations found at `path`.
///
/// The language ids of the configurations are collected and deduplicated,
/// then each language is obtained through `language_for_id`; the first
/// failure there is returned as the error. If looking up the configurations
/// at `path` fails, that error is discarded and an empty list is returned.
pub fn languages_at_path(&mut self, path: &Path) -> Result<Vec<Language>> {
if let Ok(configurations) = self.find_language_configurations_at_path(path) {
let mut language_ids = configurations
.iter()
.map(|c| c.language_id)
.collect::<Vec<_>>();
// `dedup` only removes adjacent repeats, so sort first.
language_ids.sort();
language_ids.dedup();
language_ids
.into_iter()
.map(|id| self.language_for_id(id))
.collect::<Result<Vec<_>>>()
} else {
Ok(None)
Ok(Vec::new())
}
}
/// Lists every known language configuration, each paired with the path
/// registered in `languages_by_id` for that configuration's language.
///
/// The result follows the order of `language_configurations`.
pub fn get_all_language_configurations(&self) -> Vec<(&LanguageConfiguration, &Path)> {
    let mut pairs = Vec::with_capacity(self.language_configurations.len());
    for configuration in &self.language_configurations {
        // `language_id` indexes `languages_by_id`; only the path half of the
        // entry is needed here.
        let (language_path, _) = &self.languages_by_id[configuration.language_id];
        pairs.push((configuration, language_path.as_path()));
    }
    pairs
}
pub fn language_configuration_for_scope(
&self,
scope: &str,
) -> Result<Option<(Language, &LanguageConfiguration)>> {
for (i, repo) in self.language_repos.iter().enumerate() {
for configuration in &repo.configurations {
if configuration.scope.as_ref().map_or(false, |s| s == scope) {
let (language, _) = self.language_for_id(i)?;
return Ok(Some((language, &configuration)));
}
for configuration in &self.language_configurations {
if configuration.scope.as_ref().map_or(false, |s| s == scope) {
let language = self.language_for_id(configuration.language_id)?;
return Ok(Some((language, configuration)));
}
}
Ok(None)
@ -99,7 +112,9 @@ impl Loader {
&self,
path: &Path,
) -> Result<Option<(Language, &LanguageConfiguration)>> {
let ids = path
// Find all the language configurations that match this file name
// or a suffix of the file name.
let configuration_ids = path
.file_name()
.and_then(|n| n.to_str())
.and_then(|file_name| self.language_configuration_ids_by_file_type.get(file_name))
@ -110,13 +125,57 @@ impl Loader {
self.language_configuration_ids_by_file_type.get(extension)
})
});
if let Some(ids) = ids {
// TODO use `content-regex` to pick one
for (repo_id, configuration_id) in ids.iter().cloned() {
let (language, configurations) = self.language_for_id(repo_id)?;
return Ok(Some((language, &configurations[configuration_id])));
if let Some(configuration_ids) = configuration_ids {
if !configuration_ids.is_empty() {
let configuration;
// If there is only one language configuration, then use it.
if configuration_ids.len() == 1 {
configuration = &self.language_configurations[configuration_ids[0]];
}
// If multiple language configurations match, then determine which
// one to use by applying the configurations' content regexes.
else {
let file_contents = fs::read_to_string(path)?;
let mut best_score = -2isize;
let mut best_configuration_id = None;
for configuration_id in configuration_ids {
let config = &self.language_configurations[*configuration_id];
// If the language configuration has a content regex, assign
// a score based on the length of the first match.
let score;
if let Some(content_regex) = &config.content_regex {
if let Some(mat) = content_regex.find(&file_contents) {
score = (mat.end() - mat.start()) as isize;
}
// If the content regex does not match, then *penalize* this
// language configuration, so that language configurations
// without content regexes are preferred over those with
// non-matching content regexes.
else {
score = -1;
}
} else {
score = 0;
}
if score > best_score {
best_configuration_id = Some(*configuration_id);
best_score = score;
}
}
configuration = &self.language_configurations[best_configuration_id.unwrap()];
}
let language = self.language_for_id(configuration.language_id)?;
return Ok(Some((language, configuration)));
}
}
Ok(None)
}
@ -126,34 +185,35 @@ impl Loader {
) -> Result<Option<(Language, &LanguageConfiguration)>> {
let mut best_match_length = 0;
let mut best_match_position = None;
for (i, repo) in self.language_repos.iter().enumerate() {
for (j, configuration) in repo.configurations.iter().enumerate() {
if let Some(injection_regex) = &configuration.injection_regex {
if let Some(mat) = injection_regex.find(string) {
let length = mat.end() - mat.start();
if length > best_match_length {
best_match_position = Some((i, j));
best_match_length = length;
}
for (i, configuration) in self.language_configurations.iter().enumerate() {
if let Some(injection_regex) = &configuration.injection_regex {
if let Some(mat) = injection_regex.find(string) {
let length = mat.end() - mat.start();
if length > best_match_length {
best_match_position = Some(i);
best_match_length = length;
}
}
}
}
if let Some((i, j)) = best_match_position {
let (language, configurations) = self.language_for_id(i)?;
Ok(Some((language, &configurations[j])))
if let Some(i) = best_match_position {
let configuration = &self.language_configurations[i];
let language = self.language_for_id(configuration.language_id)?;
Ok(Some((language, configuration)))
} else {
Ok(None)
}
}
/// Returns the language of the repo at index `id` in `language_repos`,
/// together with that repo's configurations.
///
/// If the repo's `language` cell is still empty, the language is loaded via
/// `load_language_at_path`, passing the repo's `src` directory as both the
/// source path and the header path. A failed load is returned as the error.
fn language_for_id(&self, id: usize) -> Result<(Language, &Vec<LanguageConfiguration>)> {
let repo = &self.language_repos[id];
let language = repo.language.get_or_try_init(|| {
let src_path = repo.path.join("src");
self.load_language_at_path(&src_path, &src_path)
})?;
Ok((*language, &self.language_repos[id].configurations))
}
/// Returns the language stored at index `id` of `languages_by_id`, loading
/// it on first use.
///
/// When the entry's cell is still empty, the language is loaded via
/// `load_language_at_path`, passing the `src` directory under the entry's
/// path as both the source path and the header path. A failed load is
/// returned as the error.
///
/// Panics if `id` is out of range for `languages_by_id`.
fn language_for_id(&self, id: usize) -> Result<Language> {
    let (parser_dir, cell) = &self.languages_by_id[id];
    let load = || {
        let src_dir = parser_dir.join("src");
        self.load_language_at_path(&src_dir, &src_dir)
    };
    let language = cell.get_or_try_init(load)?;
    Ok(*language)
}
pub fn load_language_at_path(&self, src_path: &Path, header_path: &Path) -> Result<Language> {
@ -278,9 +338,14 @@ impl Loader {
Ok(language)
}
fn find_language_at_path<'a>(&'a mut self, parser_path: &Path) -> Result<usize> {
fn find_language_configurations_at_path<'a>(
&'a mut self,
parser_path: &Path,
) -> Result<&[LanguageConfiguration]> {
#[derive(Deserialize)]
struct LanguageConfigurationJSON {
#[serde(default)]
path: PathBuf,
scope: Option<String>,
#[serde(rename = "file-types")]
file_types: Option<Vec<String>>,
@ -295,57 +360,75 @@ impl Loader {
#[derive(Deserialize)]
struct PackageJSON {
#[serde(default)]
#[serde(rename = "tree-sitter")]
tree_sitter: Option<Vec<LanguageConfigurationJSON>>,
tree_sitter: Vec<LanguageConfigurationJSON>,
}
let mut configurations = vec![LanguageConfiguration::default()];
let initial_language_configuration_count = self.language_configurations.len();
if let Ok(package_json_contents) = fs::read_to_string(&parser_path.join("package.json")) {
let package_json = serde_json::from_str::<PackageJSON>(&package_json_contents);
if let Ok(package_json) = package_json {
configurations = package_json
.tree_sitter
.map_or(Vec::new(), |configurations| {
configurations
.into_iter()
.map(|conf| LanguageConfiguration {
scope: conf.scope,
file_types: conf.file_types.unwrap_or(Vec::new()),
_content_regex: conf.content_regex.and_then(|r| {
RegexBuilder::new(&r).multi_line(true).build().ok()
}),
_first_line_regex: conf.first_line_regex.and_then(|r| {
RegexBuilder::new(&r).multi_line(true).build().ok()
}),
injection_regex: conf.injection_regex.and_then(|r| {
RegexBuilder::new(&r).multi_line(true).build().ok()
}),
highlight_property_sheet_path: conf
.highlights
.map(|h| parser_path.join(h)),
highlight_property_sheet: OnceCell::new(),
})
.collect()
if package_json.tree_sitter.is_empty() {
return Ok(&[]);
}
let language_count = self.languages_by_id.len();
for config_json in package_json.tree_sitter {
// Determine the path to the parser directory. This can be specified in
// the package.json, but defaults to the directory containing the package.json.
let language_path = parser_path.join(config_json.path);
// Determine if a previous language configuration in this package.json file
// already uses the same language.
let mut language_id = None;
for (id, (path, _)) in
self.languages_by_id.iter().enumerate().skip(language_count)
{
if language_path == *path {
language_id = Some(id);
}
}
// If not, add a new language path to the list.
let language_id = language_id.unwrap_or_else(|| {
self.languages_by_id.push((language_path, OnceCell::new()));
self.languages_by_id.len() - 1
});
for (i, configuration) in configurations.iter().enumerate() {
let configuration = LanguageConfiguration {
scope: config_json.scope,
language_id,
file_types: config_json.file_types.unwrap_or(Vec::new()),
content_regex: config_json
.content_regex
.and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()),
_first_line_regex: config_json
.first_line_regex
.and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()),
injection_regex: config_json
.injection_regex
.and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok()),
highlight_property_sheet_path: config_json
.highlights
.map(|h| parser_path.join(h)),
highlight_property_sheet: OnceCell::new(),
};
for file_type in &configuration.file_types {
self.language_configuration_ids_by_file_type
.entry(file_type.to_string())
.or_insert(Vec::new())
.push((self.language_repos.len(), i));
.push(self.language_configurations.len());
}
self.language_configurations.push(configuration);
}
}
}
self.language_repos.push(LanguageRepo {
path: parser_path.to_owned(),
language: OnceCell::new(),
configurations,
});
Ok(self.language_repos.len() - 1)
Ok(&self.language_configurations[initial_language_configuration_count..])
}
}

View file

@ -1,8 +1,8 @@
use clap::{App, AppSettings, Arg, SubCommand};
use error::Error;
use std::{env, fs, u64};
use std::path::Path;
use std::process::exit;
use std::{env, fs, u64};
use tree_sitter_cli::{
config, error, generate, highlight, loader, logger, parse, test, wasm, web_ui,
};
@ -102,6 +102,10 @@ fn run() -> error::Result<()> {
.subcommand(
SubCommand::with_name("web-ui").about("Test a parser interactively in the browser"),
)
.subcommand(
SubCommand::with_name("dump-languages")
.about("Print info about all known language parsers"),
)
.get_matches();
let home_dir = dirs::home_dir().expect("Failed to read home directory");
@ -124,8 +128,8 @@ fn run() -> error::Result<()> {
let debug_graph = matches.is_present("debug-graph");
let filter = matches.value_of("filter");
let corpus_path = current_dir.join("corpus");
if let Some(language) = loader.language_at_path(&current_dir)? {
test::run_tests_at_path(language, &corpus_path, debug, debug_graph, filter)?;
if let Some(language) = loader.languages_at_path(&current_dir)?.first() {
test::run_tests_at_path(*language, &corpus_path, debug, debug_graph, filter)?;
} else {
eprintln!("No language found");
}
@ -173,12 +177,13 @@ fn run() -> error::Result<()> {
}))?
{
lang
} else if let Some(lang) =
loader
.language_at_path(&current_dir)
.map_err(Error::wrap(|| {
"Failed to load language in current directory"
}))?
} else if let Some(lang) = loader
.languages_at_path(&current_dir)
.map_err(Error::wrap(|| {
"Failed to load language in current directory"
}))?
.first()
.cloned()
{
lang
} else {
@ -251,6 +256,19 @@ fn run() -> error::Result<()> {
wasm::compile_language_to_wasm(&grammar_path, matches.is_present("docker"))?;
} else if matches.subcommand_matches("web-ui").is_some() {
web_ui::serve(&current_dir);
} else if matches.subcommand_matches("dump-languages").is_some() {
loader.find_all_languages(&config.parser_directories)?;
for (configuration, language_path) in loader.get_all_language_configurations() {
println!(
"scope: {}\nparser: {:?}\nproperties: {:?}\nfile_types: {:?}\ncontent_regex: {:?}\ninjection_regex: {:?}\n",
configuration.scope.as_ref().unwrap_or(&String::new()),
language_path,
configuration.highlight_property_sheet_path,
configuration.file_types,
configuration.content_regex,
configuration.injection_regex,
);
}
}
Ok(())

View file

@ -21,7 +21,7 @@ use std::{fmt, ptr, slice, str, u16};
pub const LANGUAGE_VERSION: usize = ffi::TREE_SITTER_LANGUAGE_VERSION;
pub const PARSER_HEADER: &'static str = include_str!("../include/tree_sitter/parser.h");
/// A language handle, stored as a raw pointer to an `ffi::TSLanguage`.
///
/// `#[repr(transparent)]` gives this wrapper the same layout as the pointer
/// it contains.
#[derive(Clone, Copy)]
#[derive(Clone, Copy, PartialEq, Eq)]
#[repr(transparent)]
pub struct Language(*const ffi::TSLanguage);