2024-04-09 13:35:08 -04:00
|
|
|
use std::{
|
|
|
|
|
env, fs,
|
|
|
|
|
io::Write,
|
|
|
|
|
path::{Path, PathBuf},
|
|
|
|
|
process::{Command, Stdio},
|
|
|
|
|
};
|
2024-02-21 11:47:59 -05:00
|
|
|
|
|
|
|
|
use anyhow::{anyhow, Context, Result};
|
|
|
|
|
use build_tables::build_tables;
|
|
|
|
|
use grammar_files::path_in_ignore;
|
2024-04-12 10:03:46 -07:00
|
|
|
use grammars::InputGrammar;
|
2024-04-09 17:53:37 -07:00
|
|
|
use lazy_static::lazy_static;
|
2024-02-21 11:47:59 -05:00
|
|
|
use parse_grammar::parse_grammar;
|
|
|
|
|
use prepare_grammar::prepare_grammar;
|
2024-04-09 17:53:37 -07:00
|
|
|
use regex::{Regex, RegexBuilder};
|
2024-02-21 11:47:59 -05:00
|
|
|
use render::render_c_code;
|
2024-04-09 17:53:37 -07:00
|
|
|
use semver::Version;
|
2024-02-21 11:47:59 -05:00
|
|
|
|
2019-01-07 10:23:01 -08:00
|
|
|
mod build_tables;
|
2019-07-19 12:39:24 -07:00
|
|
|
mod dedup;
|
2024-02-21 11:47:59 -05:00
|
|
|
mod grammar_files;
|
2019-01-07 10:23:01 -08:00
|
|
|
mod grammars;
|
|
|
|
|
mod nfa;
|
2019-03-26 14:42:32 -07:00
|
|
|
mod node_types;
|
2019-04-23 14:29:46 -07:00
|
|
|
pub mod parse_grammar;
|
2019-01-07 10:23:01 -08:00
|
|
|
mod prepare_grammar;
|
|
|
|
|
mod render;
|
|
|
|
|
mod rules;
|
|
|
|
|
mod tables;
|
|
|
|
|
|
2024-05-05 14:03:24 -04:00
|
|
|
pub use grammar_files::lookup_package_json_for_path;
|
|
|
|
|
|
2019-01-11 13:30:45 -08:00
|
|
|
lazy_static! {
    // Matches `//` line comments (with optional leading whitespace) so they
    // can be stripped out of a grammar.json before parsing — JSON itself has
    // no comment syntax. `multi_line` makes `^` anchor at every line start.
    static ref JSON_COMMENT_REGEX: Regex = RegexBuilder::new("^\\s*//.*")
        .multi_line(true)
        .build()
        .unwrap();
}
|
|
|
|
|
|
2019-02-12 11:06:18 -08:00
|
|
|
/// The two artifacts produced for a grammar: the rendered parser source and
/// the node-types metadata that accompanies it.
struct GeneratedParser {
    /// Complete C source text for the generated `parser.c`.
    c_code: String,
    /// Pretty-printed JSON describing the grammar's node types
    /// (written out as `node-types.json`).
    node_types_json: String,
}
|
|
|
|
|
|
2024-03-10 16:38:53 -04:00
|
|
|
/// Contents of the bundled `alloc.h` template, written into each generated
/// parser's `src/tree_sitter/` header directory.
pub const ALLOC_HEADER: &str = include_str!("./templates/alloc.h");
|
|
|
|
|
|
2019-01-11 13:30:45 -08:00
|
|
|
pub fn generate_parser_in_directory(
|
2024-02-04 01:30:33 -05:00
|
|
|
repo_path: &Path,
|
2019-01-14 14:07:42 -08:00
|
|
|
grammar_path: Option<&str>,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version: usize,
|
2021-03-08 12:01:45 -08:00
|
|
|
generate_bindings: bool,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name: Option<&str>,
|
2023-07-18 13:24:52 +02:00
|
|
|
js_runtime: Option<&str>,
|
2019-01-09 14:43:49 -08:00
|
|
|
) -> Result<()> {
|
2024-02-21 11:47:59 -05:00
|
|
|
let mut repo_path = repo_path.to_owned();
|
|
|
|
|
let mut grammar_path = grammar_path;
|
|
|
|
|
|
|
|
|
|
// Populate a new empty grammar directory.
|
|
|
|
|
if let Some(path) = grammar_path {
|
|
|
|
|
let path = PathBuf::from(path);
|
|
|
|
|
if !path
|
|
|
|
|
.try_exists()
|
|
|
|
|
.with_context(|| "Some error with specified path")?
|
|
|
|
|
{
|
|
|
|
|
fs::create_dir_all(&path)?;
|
|
|
|
|
grammar_path = None;
|
|
|
|
|
repo_path = path;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if repo_path.is_dir() && !repo_path.join("grammar.js").exists() && !path_in_ignore(&repo_path) {
|
|
|
|
|
if let Some(dir_name) = repo_path
|
|
|
|
|
.file_name()
|
|
|
|
|
.map(|x| x.to_string_lossy().to_ascii_lowercase())
|
|
|
|
|
{
|
|
|
|
|
if let Some(language_name) = dir_name
|
|
|
|
|
.strip_prefix("tree-sitter-")
|
|
|
|
|
.or_else(|| Some(dir_name.as_ref()))
|
|
|
|
|
{
|
|
|
|
|
grammar_files::generate_grammar_files(&repo_path, language_name, false)?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-02-14 10:57:33 -08:00
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
// Read the grammar.json.
|
2024-02-04 01:30:33 -05:00
|
|
|
let grammar_json = if let Some(path) = grammar_path {
|
|
|
|
|
load_grammar_file(path.as_ref(), js_runtime)?
|
|
|
|
|
} else {
|
|
|
|
|
let grammar_js_path =
|
|
|
|
|
grammar_path.map_or(repo_path.join("grammar.js"), std::convert::Into::into);
|
|
|
|
|
load_grammar_file(&grammar_js_path, js_runtime)?
|
2023-07-30 20:43:52 +03:00
|
|
|
};
|
|
|
|
|
|
2024-02-21 11:47:59 -05:00
|
|
|
let src_path = repo_path.join("src");
|
|
|
|
|
let header_path = src_path.join("tree_sitter");
|
|
|
|
|
|
2023-07-30 20:43:52 +03:00
|
|
|
// Ensure that the output directories exist.
|
|
|
|
|
fs::create_dir_all(&src_path)?;
|
|
|
|
|
fs::create_dir_all(&header_path)?;
|
|
|
|
|
|
|
|
|
|
if grammar_path.is_none() {
|
2024-02-04 01:30:33 -05:00
|
|
|
fs::write(src_path.join("grammar.json"), &grammar_json)
|
|
|
|
|
.with_context(|| format!("Failed to write grammar.json to {src_path:?}"))?;
|
2019-02-08 15:15:47 -08:00
|
|
|
}
|
|
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
// Parse and preprocess the grammar.
|
|
|
|
|
let input_grammar = parse_grammar(&grammar_json)?;
|
|
|
|
|
|
|
|
|
|
// Generate the parser and related files.
|
2019-10-17 11:00:31 -07:00
|
|
|
let GeneratedParser {
|
|
|
|
|
c_code,
|
|
|
|
|
node_types_json,
|
2024-04-09 17:53:37 -07:00
|
|
|
} = generate_parser_for_grammar_with_opts(&input_grammar, abi_version, report_symbol_name)?;
|
2019-08-28 17:14:04 -07:00
|
|
|
|
2019-10-17 11:00:31 -07:00
|
|
|
write_file(&src_path.join("parser.c"), c_code)?;
|
|
|
|
|
write_file(&src_path.join("node-types.json"), node_types_json)?;
|
2024-03-10 16:38:53 -04:00
|
|
|
write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
|
2024-02-22 09:13:59 -05:00
|
|
|
write_file(&header_path.join("array.h"), tree_sitter::ARRAY_HEADER)?;
|
2022-01-23 10:29:52 -08:00
|
|
|
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
|
2019-07-19 13:11:08 -07:00
|
|
|
|
2024-02-21 11:47:59 -05:00
|
|
|
if !path_in_ignore(&repo_path) {
|
2024-04-09 17:53:37 -07:00
|
|
|
grammar_files::generate_grammar_files(&repo_path, &input_grammar.name, generate_bindings)?;
|
2021-03-08 12:01:45 -08:00
|
|
|
}
|
2019-10-17 11:00:31 -07:00
|
|
|
|
2019-01-09 14:43:49 -08:00
|
|
|
Ok(())
|
2019-01-07 10:23:01 -08:00
|
|
|
}
|
|
|
|
|
|
2019-02-04 14:44:06 -08:00
|
|
|
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
|
2019-01-11 13:30:45 -08:00
|
|
|
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
|
2019-07-19 13:11:08 -07:00
|
|
|
let input_grammar = parse_grammar(&grammar_json)?;
|
2024-04-09 17:53:37 -07:00
|
|
|
let parser =
|
|
|
|
|
generate_parser_for_grammar_with_opts(&input_grammar, tree_sitter::LANGUAGE_VERSION, None)?;
|
|
|
|
|
Ok((input_grammar.name.clone(), parser.c_code))
|
2019-01-11 13:30:45 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn generate_parser_for_grammar_with_opts(
|
2024-04-09 17:53:37 -07:00
|
|
|
input_grammar: &InputGrammar,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version: usize,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name: Option<&str>,
|
2019-02-12 11:06:18 -08:00
|
|
|
) -> Result<GeneratedParser> {
|
2024-04-09 17:53:37 -07:00
|
|
|
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
|
|
|
|
|
prepare_grammar(input_grammar)?;
|
2019-11-13 10:38:47 -08:00
|
|
|
let variable_info =
|
|
|
|
|
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
|
2019-03-27 16:17:02 -07:00
|
|
|
let node_types_json = node_types::generate_node_types_json(
|
|
|
|
|
&syntax_grammar,
|
|
|
|
|
&lexical_grammar,
|
|
|
|
|
&simple_aliases,
|
|
|
|
|
&variable_info,
|
|
|
|
|
);
|
2024-04-09 17:53:37 -07:00
|
|
|
let tables = build_tables(
|
2019-01-11 13:30:45 -08:00
|
|
|
&syntax_grammar,
|
|
|
|
|
&lexical_grammar,
|
|
|
|
|
&simple_aliases,
|
2019-03-27 16:17:02 -07:00
|
|
|
&variable_info,
|
2024-04-09 17:53:37 -07:00
|
|
|
&inlines,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name,
|
2019-01-11 13:30:45 -08:00
|
|
|
)?;
|
2019-01-16 13:53:01 -08:00
|
|
|
let c_code = render_c_code(
|
2024-04-09 17:53:37 -07:00
|
|
|
&input_grammar.name,
|
|
|
|
|
tables,
|
2019-01-11 13:30:45 -08:00
|
|
|
syntax_grammar,
|
|
|
|
|
lexical_grammar,
|
|
|
|
|
simple_aliases,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version,
|
2019-01-16 13:53:01 -08:00
|
|
|
);
|
2019-02-12 11:06:18 -08:00
|
|
|
Ok(GeneratedParser {
|
|
|
|
|
c_code,
|
2019-03-26 14:42:32 -07:00
|
|
|
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
|
2019-02-12 11:06:18 -08:00
|
|
|
})
|
2019-01-11 13:30:45 -08:00
|
|
|
}
|
|
|
|
|
|
2023-07-18 13:24:52 +02:00
|
|
|
pub fn load_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
|
2023-07-30 20:43:52 +03:00
|
|
|
if grammar_path.is_dir() {
|
|
|
|
|
return Err(anyhow!(
|
|
|
|
|
"Path to a grammar file with `.js` or `.json` extension is required"
|
|
|
|
|
));
|
|
|
|
|
}
|
2019-01-14 14:07:42 -08:00
|
|
|
match grammar_path.extension().and_then(|e| e.to_str()) {
|
2023-07-18 13:24:52 +02:00
|
|
|
Some("js") => Ok(load_js_grammar_file(grammar_path, js_runtime)
|
|
|
|
|
.with_context(|| "Failed to load grammar.js")?),
|
2023-07-30 20:43:52 +03:00
|
|
|
Some("json") => {
|
|
|
|
|
Ok(fs::read_to_string(grammar_path).with_context(|| "Failed to load grammar.json")?)
|
|
|
|
|
}
|
2024-02-04 01:30:33 -05:00
|
|
|
_ => Err(anyhow!("Unknown grammar file extension: {grammar_path:?}",)),
|
2019-01-14 14:07:42 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-07-18 13:24:52 +02:00
|
|
|
fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> Result<String> {
|
2021-08-05 03:50:10 +03:00
|
|
|
let grammar_path = fs::canonicalize(grammar_path)?;
|
2023-07-18 13:24:52 +02:00
|
|
|
|
2024-05-24 23:53:33 +03:00
|
|
|
#[cfg(windows)]
|
|
|
|
|
let grammar_path = url::Url::from_file_path(grammar_path)
|
|
|
|
|
.expect("Failed to convert path to URL")
|
|
|
|
|
.to_string();
|
|
|
|
|
|
2023-07-18 13:24:52 +02:00
|
|
|
let js_runtime = js_runtime.unwrap_or("node");
|
|
|
|
|
|
2024-05-24 23:53:33 +03:00
|
|
|
let mut js_command = Command::new(js_runtime);
|
|
|
|
|
match js_runtime {
|
|
|
|
|
"node" => {
|
|
|
|
|
js_command.args(["--input-type=module", "-"]);
|
|
|
|
|
}
|
|
|
|
|
"bun" => {
|
|
|
|
|
js_command.arg("-");
|
|
|
|
|
}
|
|
|
|
|
"deno" => {
|
|
|
|
|
js_command.args(["run", "--allow-all", "-"]);
|
|
|
|
|
}
|
|
|
|
|
_ => {}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut js_process = js_command
|
2019-01-17 10:09:03 -08:00
|
|
|
.env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
|
2019-01-07 10:23:01 -08:00
|
|
|
.stdin(Stdio::piped())
|
|
|
|
|
.stdout(Stdio::piped())
|
|
|
|
|
.spawn()
|
2023-07-18 13:24:52 +02:00
|
|
|
.with_context(|| format!("Failed to run `{js_runtime}`"))?;
|
2019-01-07 10:23:01 -08:00
|
|
|
|
2024-05-24 23:53:33 +03:00
|
|
|
let mut js_stdin = js_process
|
2019-01-07 10:23:01 -08:00
|
|
|
.stdin
|
|
|
|
|
.take()
|
2024-04-14 10:40:59 +03:00
|
|
|
.with_context(|| format!("Failed to open stdin for {js_runtime}"))?;
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
|
2023-07-30 20:43:52 +03:00
|
|
|
.with_context(|| "Could not parse this package's version as semver.")?;
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
write!(
|
2024-05-24 23:53:33 +03:00
|
|
|
js_stdin,
|
|
|
|
|
"globalThis.TREE_SITTER_CLI_VERSION_MAJOR = {};
|
|
|
|
|
globalThis.TREE_SITTER_CLI_VERSION_MINOR = {};
|
|
|
|
|
globalThis.TREE_SITTER_CLI_VERSION_PATCH = {};",
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
cli_version.major, cli_version.minor, cli_version.patch,
|
|
|
|
|
)
|
2024-04-14 10:40:59 +03:00
|
|
|
.with_context(|| format!("Failed to write tree-sitter version to {js_runtime}'s stdin"))?;
|
2024-05-24 23:53:33 +03:00
|
|
|
js_stdin
|
|
|
|
|
.write(include_bytes!("./dsl.js"))
|
2024-04-14 10:40:59 +03:00
|
|
|
.with_context(|| format!("Failed to write grammar dsl to {js_runtime}'s stdin"))?;
|
2024-05-24 23:53:33 +03:00
|
|
|
drop(js_stdin);
|
|
|
|
|
|
|
|
|
|
let output = js_process
|
2019-01-07 10:23:01 -08:00
|
|
|
.wait_with_output()
|
2024-04-14 10:40:59 +03:00
|
|
|
.with_context(|| format!("Failed to read output from {js_runtime}"))?;
|
2019-01-07 10:23:01 -08:00
|
|
|
match output.status.code() {
|
2024-04-14 10:40:59 +03:00
|
|
|
None => panic!("{js_runtime} process was killed"),
|
2024-02-05 00:42:05 -05:00
|
|
|
Some(0) => {
|
2024-04-14 10:40:59 +03:00
|
|
|
let stdout = String::from_utf8(output.stdout)
|
|
|
|
|
.with_context(|| format!("Got invalid UTF8 from {js_runtime}"))?;
|
2024-02-05 00:42:05 -05:00
|
|
|
|
|
|
|
|
let mut grammar_json = &stdout[..];
|
|
|
|
|
|
|
|
|
|
if let Some(pos) = stdout.rfind('\n') {
|
|
|
|
|
// If there's a newline, split the last line from the rest of the output
|
2024-02-07 07:13:03 -05:00
|
|
|
let node_output = &stdout[..pos];
|
2024-02-05 00:42:05 -05:00
|
|
|
grammar_json = &stdout[pos + 1..];
|
|
|
|
|
|
|
|
|
|
let mut stdout = std::io::stdout().lock();
|
|
|
|
|
stdout.write_all(node_output.as_bytes())?;
|
|
|
|
|
stdout.write_all(b"\n")?;
|
|
|
|
|
stdout.flush()?;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(serde_json::to_string_pretty(
|
|
|
|
|
&serde_json::from_str::<serde_json::Value>(grammar_json)
|
|
|
|
|
.with_context(|| "Failed to parse grammar JSON")?,
|
|
|
|
|
)
|
|
|
|
|
.with_context(|| "Failed to serialize grammar JSON")?
|
|
|
|
|
+ "\n")
|
|
|
|
|
}
|
2024-04-14 10:40:59 +03:00
|
|
|
Some(code) => Err(anyhow!("{js_runtime} process exited with status {code}")),
|
2019-01-07 10:23:01 -08:00
|
|
|
}
|
|
|
|
|
}
|
2019-01-17 12:50:30 -08:00
|
|
|
|
2019-05-30 16:52:30 -07:00
|
|
|
fn write_file(path: &Path, body: impl AsRef<[u8]>) -> Result<()> {
|
2021-06-09 12:32:22 -04:00
|
|
|
fs::write(path, body)
|
|
|
|
|
.with_context(|| format!("Failed to write {:?}", path.file_name().unwrap()))
|
2019-05-30 16:52:30 -07:00
|
|
|
}
|