Add --check flag to tree-sitter highlight.

Recently I've been pulling a lot of grammars into GitHub's highlighting backend,
replacing legacy language support with tree-sitter highlighting queries.
Our backend systems have a standard set of highlight captures we expect, very
similar to the standard tagging captures we expect. Though end-user applications
are free to choose whatever tagging nomenclature they want, I think it's nice to
include a checking stage that will help us ensure that we know whether a capture
might be recognized or not. It will also help us figure out where we need to
expand our standard set of captures (see #1539).
This commit is contained in:
Patrick Thomson 2023-06-22 09:18:53 -04:00 committed by Amaan Qureshi
parent d30e9c9d71
commit cb58bc593f
No known key found for this signature in database
GPG key ID: E67890ADC4227273
4 changed files with 78 additions and 2 deletions

3
Cargo.lock generated
View file

@ -859,8 +859,9 @@ dependencies = [
[[package]]
name = "tree-sitter-highlight"
version = "0.20.1"
version = "0.20.2"
dependencies = [
"lazy_static",
"regex",
"thiserror",
"tree-sitter",

View file

@ -239,6 +239,11 @@ fn run() -> Result<()> {
.long("html")
.short("H"),
)
.arg(
Arg::with_name("check")
.help("Check that highlighting captures conform strictly to standards")
.long("check"),
)
.arg(&scope_arg)
.arg(&time_arg)
.arg(&quiet_arg)
@ -543,6 +548,7 @@ fn run() -> Result<()> {
let time = matches.is_present("time");
let quiet = matches.is_present("quiet");
let html_mode = quiet || matches.is_present("html");
let should_check = matches.is_present("check");
let paths = collect_paths(matches.value_of("paths-file"), matches.values_of("paths"))?;
if html_mode && !quiet {
@ -573,6 +579,25 @@ fn run() -> Result<()> {
};
if let Some(highlight_config) = language_config.highlight_config(language)? {
if should_check {
let names = highlight_config.nonconformant_capture_names();
if names.is_empty() {
eprintln!("All highlight captures conform to standards.");
} else {
eprintln!(
"Non-standard highlight {} detected:",
if names.len() > 1 {
"captures"
} else {
"capture"
}
);
for name in names {
eprintln!("* {}", name);
}
}
}
let source = fs::read(path)?;
if html_mode {
highlight::html(

View file

@ -1,7 +1,7 @@
[package]
name = "tree-sitter-highlight"
description = "Library for performing syntax highlighting with Tree-sitter"
version = "0.20.1"
version = "0.20.2"
authors = [
"Max Brunsfeld <maxbrunsfeld@gmail.com>",
"Tim Clem <timothy.clem@gmail.com>",
@ -18,6 +18,7 @@ rust-version.workspace = true
crate-type = ["lib", "staticlib"]
[dependencies]
lazy_static = "1.2.0"
regex = "1"
thiserror = "1.0"

View file

@ -2,6 +2,8 @@ pub mod c_lib;
pub mod util;
pub use c_lib as c;
use lazy_static::lazy_static;
use std::collections::HashSet;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{iter, mem, ops, str, usize};
use thiserror::Error;
@ -14,6 +16,42 @@ const CANCELLATION_CHECK_INTERVAL: usize = 100;
const BUFFER_HTML_RESERVE_CAPACITY: usize = 10 * 1024;
const BUFFER_LINES_RESERVE_CAPACITY: usize = 1000;
lazy_static! {
    /// The set of highlight capture names treated as 'standard'.
    ///
    /// `HighlightConfiguration::nonconformant_capture_names` tests capture names for
    /// membership in this set. Dotted entries (e.g. `function.builtin`) are listed
    /// explicitly alongside their base name.
    static ref STANDARD_CAPTURE_NAMES: HashSet<&'static str> = [
        "attribute",
        "carriage-return",
        "comment",
        "constant",
        "constant.builtin",
        "constructor",
        "constructor.builtin",
        "embedded",
        "escape",
        "function",
        "function.builtin",
        "keyword",
        "number",
        "module",
        "operator",
        "property",
        "property.builtin",
        "punctuation",
        "punctuation.bracket",
        "punctuation.delimiter",
        "punctuation.special",
        "string",
        "string.special",
        "tag",
        "type",
        "type.builtin",
        "variable",
        "variable.builtin",
        "variable.parameter",
    ]
    .iter()
    .copied()
    .collect();
}
/// Indicates which highlight should be applied to a region of source code.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Highlight(pub usize);
@ -321,6 +359,17 @@ impl HighlightConfiguration {
best_index.map(Highlight)
}));
}
/// Returns the capture names of this configuration that do not conform to the standard set.
///
/// A name is considered conformant, and is therefore omitted from the result, when either:
///
/// * it starts with an underscore, which denotes a 'private' capture used as part of
///   capture internals, or
/// * it is present in `STANDARD_CAPTURE_NAMES`, the list of predefined 'canonical' names.
///
/// The returned references borrow from this configuration and keep the order in which
/// `names()` yields them. An empty vector means every capture name conforms.
pub fn nonconformant_capture_names(&self) -> Vec<&String> {
    self.names()
        .iter()
        // Private (underscore-prefixed) captures are never reported, even though they
        // are absent from the standard set.
        .filter(|&n| !(n.starts_with('_') || STANDARD_CAPTURE_NAMES.contains(n.as_str())))
        .collect()
}
}
impl<'a> HighlightIterLayer<'a> {