2021-02-26 12:45:02 -08:00
|
|
|
mod binding_files;
|
2019-01-07 10:23:01 -08:00
|
|
|
mod build_tables;
|
2021-02-17 13:08:56 -08:00
|
|
|
mod char_tree;
|
2019-07-19 12:39:24 -07:00
|
|
|
mod dedup;
|
2019-01-07 10:23:01 -08:00
|
|
|
mod grammars;
|
|
|
|
|
mod nfa;
|
2019-03-26 14:42:32 -07:00
|
|
|
mod node_types;
|
2019-04-23 14:29:46 -07:00
|
|
|
pub mod parse_grammar;
|
2019-01-07 10:23:01 -08:00
|
|
|
mod prepare_grammar;
|
|
|
|
|
mod render;
|
|
|
|
|
mod rules;
|
|
|
|
|
mod tables;
|
|
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
use self::build_tables::build_tables;
|
2019-10-17 11:00:31 -07:00
|
|
|
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
|
2019-07-19 13:11:08 -07:00
|
|
|
use self::parse_grammar::parse_grammar;
|
|
|
|
|
use self::prepare_grammar::prepare_grammar;
|
|
|
|
|
use self::render::render_c_code;
|
|
|
|
|
use self::rules::AliasMap;
|
2021-06-09 12:32:22 -04:00
|
|
|
use anyhow::{anyhow, Context, Result};
|
2019-07-19 13:11:08 -07:00
|
|
|
use lazy_static::lazy_static;
|
|
|
|
|
use regex::{Regex, RegexBuilder};
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. Without this, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
use semver::Version;
|
2019-10-17 11:00:31 -07:00
|
|
|
use std::fs;
|
|
|
|
|
use std::io::Write;
|
2019-07-19 13:11:08 -07:00
|
|
|
use std::path::{Path, PathBuf};
|
|
|
|
|
use std::process::{Command, Stdio};
|
|
|
|
|
|
2019-01-11 13:30:45 -08:00
|
|
|
lazy_static! {
    /// Matches `//` comments that start a line (leading whitespace allowed) and
    /// run to the end of that line. Multi-line mode lets `^` anchor at the start
    /// of every line rather than only at the start of the input.
    static ref JSON_COMMENT_REGEX: Regex = {
        let mut builder = RegexBuilder::new(r"^\s*//.*");
        builder.multi_line(true);
        builder.build().unwrap()
    };
}
|
|
|
|
|
|
2019-02-12 11:06:18 -08:00
|
|
|
/// The textual artifacts produced by a single parser-generation run.
struct GeneratedParser {
    /// The rendered C source of the parser.
    c_code: String,
    /// The node-types description, already serialized as pretty-printed JSON.
    node_types_json: String,
}
|
|
|
|
|
|
2019-01-11 13:30:45 -08:00
|
|
|
pub fn generate_parser_in_directory(
|
2019-01-09 14:43:49 -08:00
|
|
|
repo_path: &PathBuf,
|
2019-01-14 14:07:42 -08:00
|
|
|
grammar_path: Option<&str>,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version: usize,
|
2021-03-08 12:01:45 -08:00
|
|
|
generate_bindings: bool,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name: Option<&str>,
|
2019-01-09 14:43:49 -08:00
|
|
|
) -> Result<()> {
|
2019-07-19 13:11:08 -07:00
|
|
|
let src_path = repo_path.join("src");
|
|
|
|
|
let header_path = src_path.join("tree_sitter");
|
2019-02-14 10:57:33 -08:00
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
// Ensure that the output directories exist.
|
|
|
|
|
fs::create_dir_all(&src_path)?;
|
|
|
|
|
fs::create_dir_all(&header_path)?;
|
2019-02-08 15:15:47 -08:00
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
// Read the grammar.json.
|
2019-02-08 15:15:47 -08:00
|
|
|
let grammar_json;
|
|
|
|
|
match grammar_path {
|
|
|
|
|
Some(path) => {
|
|
|
|
|
grammar_json = load_grammar_file(path.as_ref())?;
|
|
|
|
|
}
|
|
|
|
|
None => {
|
|
|
|
|
let grammar_js_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into());
|
|
|
|
|
grammar_json = load_grammar_file(&grammar_js_path)?;
|
2023-07-19 22:19:22 -04:00
|
|
|
fs::write(&src_path.join("grammar.json"), &grammar_json)
|
|
|
|
|
.with_context(|| format!("Failed to write grammar.json to {:?}", src_path))?;
|
2019-02-08 15:15:47 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-07-19 13:11:08 -07:00
|
|
|
// Parse and preprocess the grammar.
|
|
|
|
|
let input_grammar = parse_grammar(&grammar_json)?;
|
|
|
|
|
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
|
|
|
|
|
prepare_grammar(&input_grammar)?;
|
|
|
|
|
let language_name = input_grammar.name;
|
|
|
|
|
|
|
|
|
|
// Generate the parser and related files.
|
2019-10-17 11:00:31 -07:00
|
|
|
let GeneratedParser {
|
|
|
|
|
c_code,
|
|
|
|
|
node_types_json,
|
|
|
|
|
} = generate_parser_for_grammar_with_opts(
|
|
|
|
|
&language_name,
|
|
|
|
|
syntax_grammar,
|
|
|
|
|
lexical_grammar,
|
|
|
|
|
inlines,
|
|
|
|
|
simple_aliases,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version,
|
2019-10-17 11:00:31 -07:00
|
|
|
report_symbol_name,
|
|
|
|
|
)?;
|
2019-08-28 17:14:04 -07:00
|
|
|
|
2019-10-17 11:00:31 -07:00
|
|
|
write_file(&src_path.join("parser.c"), c_code)?;
|
|
|
|
|
write_file(&src_path.join("node-types.json"), node_types_json)?;
|
2022-01-23 10:29:52 -08:00
|
|
|
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
|
2019-07-19 13:11:08 -07:00
|
|
|
|
2021-03-08 12:01:45 -08:00
|
|
|
if generate_bindings {
|
|
|
|
|
binding_files::generate_binding_files(&repo_path, &language_name)?;
|
|
|
|
|
}
|
2019-10-17 11:00:31 -07:00
|
|
|
|
2019-01-09 14:43:49 -08:00
|
|
|
Ok(())
|
2019-01-07 10:23:01 -08:00
|
|
|
}
|
|
|
|
|
|
2019-02-04 14:44:06 -08:00
|
|
|
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
|
2019-01-11 13:30:45 -08:00
|
|
|
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
|
2019-07-19 13:11:08 -07:00
|
|
|
let input_grammar = parse_grammar(&grammar_json)?;
|
|
|
|
|
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
|
|
|
|
|
prepare_grammar(&input_grammar)?;
|
|
|
|
|
let parser = generate_parser_for_grammar_with_opts(
|
|
|
|
|
&input_grammar.name,
|
|
|
|
|
syntax_grammar,
|
|
|
|
|
lexical_grammar,
|
|
|
|
|
inlines,
|
|
|
|
|
simple_aliases,
|
2022-01-17 14:45:07 -08:00
|
|
|
tree_sitter::LANGUAGE_VERSION,
|
2019-05-20 13:25:01 -07:00
|
|
|
None,
|
2019-07-19 13:11:08 -07:00
|
|
|
)?;
|
|
|
|
|
Ok((input_grammar.name, parser.c_code))
|
2019-01-11 13:30:45 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn generate_parser_for_grammar_with_opts(
|
2019-07-19 13:11:08 -07:00
|
|
|
name: &String,
|
|
|
|
|
syntax_grammar: SyntaxGrammar,
|
|
|
|
|
lexical_grammar: LexicalGrammar,
|
|
|
|
|
inlines: InlinedProductionMap,
|
|
|
|
|
simple_aliases: AliasMap,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version: usize,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name: Option<&str>,
|
2019-02-12 11:06:18 -08:00
|
|
|
) -> Result<GeneratedParser> {
|
2019-11-13 10:38:47 -08:00
|
|
|
let variable_info =
|
|
|
|
|
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
|
2019-03-27 16:17:02 -07:00
|
|
|
let node_types_json = node_types::generate_node_types_json(
|
|
|
|
|
&syntax_grammar,
|
|
|
|
|
&lexical_grammar,
|
|
|
|
|
&simple_aliases,
|
|
|
|
|
&variable_info,
|
|
|
|
|
);
|
2019-01-11 13:30:45 -08:00
|
|
|
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
|
|
|
|
|
&syntax_grammar,
|
|
|
|
|
&lexical_grammar,
|
|
|
|
|
&simple_aliases,
|
2019-03-27 16:17:02 -07:00
|
|
|
&variable_info,
|
2019-01-11 13:30:45 -08:00
|
|
|
&inlines,
|
2019-05-20 13:25:01 -07:00
|
|
|
report_symbol_name,
|
2019-01-11 13:30:45 -08:00
|
|
|
)?;
|
2019-01-16 13:53:01 -08:00
|
|
|
let c_code = render_c_code(
|
2019-07-19 13:11:08 -07:00
|
|
|
name,
|
2019-01-11 13:30:45 -08:00
|
|
|
parse_table,
|
|
|
|
|
main_lex_table,
|
|
|
|
|
keyword_lex_table,
|
|
|
|
|
keyword_capture_token,
|
|
|
|
|
syntax_grammar,
|
|
|
|
|
lexical_grammar,
|
|
|
|
|
simple_aliases,
|
2022-01-17 14:45:07 -08:00
|
|
|
abi_version,
|
2019-01-16 13:53:01 -08:00
|
|
|
);
|
2019-02-12 11:06:18 -08:00
|
|
|
Ok(GeneratedParser {
|
|
|
|
|
c_code,
|
2019-03-26 14:42:32 -07:00
|
|
|
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
|
2019-02-12 11:06:18 -08:00
|
|
|
})
|
2019-01-11 13:30:45 -08:00
|
|
|
}
|
|
|
|
|
|
2021-10-22 18:47:13 -06:00
|
|
|
pub fn load_grammar_file(grammar_path: &Path) -> Result<String> {
|
2019-01-14 14:07:42 -08:00
|
|
|
match grammar_path.extension().and_then(|e| e.to_str()) {
|
2019-01-18 09:40:09 -08:00
|
|
|
Some("js") => Ok(load_js_grammar_file(grammar_path)?),
|
|
|
|
|
Some("json") => Ok(fs::read_to_string(grammar_path)?),
|
2021-06-09 12:32:22 -04:00
|
|
|
_ => Err(anyhow!(
|
2019-01-28 14:23:41 -08:00
|
|
|
"Unknown grammar file extension: {:?}",
|
|
|
|
|
grammar_path
|
2021-06-09 12:32:22 -04:00
|
|
|
)),
|
2019-01-14 14:07:42 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-02-08 15:15:47 -08:00
|
|
|
fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
|
2021-08-05 03:50:10 +03:00
|
|
|
let grammar_path = fs::canonicalize(grammar_path)?;
|
2019-01-07 10:23:01 -08:00
|
|
|
let mut node_process = Command::new("node")
|
2019-01-17 10:09:03 -08:00
|
|
|
.env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
|
2019-01-07 10:23:01 -08:00
|
|
|
.stdin(Stdio::piped())
|
|
|
|
|
.stdout(Stdio::piped())
|
|
|
|
|
.spawn()
|
|
|
|
|
.expect("Failed to run `node`");
|
|
|
|
|
|
|
|
|
|
let mut node_stdin = node_process
|
|
|
|
|
.stdin
|
|
|
|
|
.take()
|
|
|
|
|
.expect("Failed to open stdin for node");
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
|
|
|
|
|
.expect("Could not parse this package's version as semver.");
|
|
|
|
|
write!(
|
|
|
|
|
node_stdin,
|
|
|
|
|
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
|
|
|
|
|
global.TREE_SITTER_CLI_VERSION_MINOR = {};
|
|
|
|
|
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
|
|
|
|
|
cli_version.major, cli_version.minor, cli_version.patch,
|
|
|
|
|
)
|
|
|
|
|
.expect("Failed to write tree-sitter version to node's stdin");
|
2019-01-17 10:09:03 -08:00
|
|
|
let javascript_code = include_bytes!("./dsl.js");
|
2019-01-17 12:50:30 -08:00
|
|
|
node_stdin
|
|
|
|
|
.write(javascript_code)
|
Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.
The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. his, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.
For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.
Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-14 21:46:12 -08:00
|
|
|
.expect("Failed to write grammar dsl to node's stdin");
|
2019-01-07 10:23:01 -08:00
|
|
|
drop(node_stdin);
|
|
|
|
|
let output = node_process
|
|
|
|
|
.wait_with_output()
|
|
|
|
|
.expect("Failed to read output from node");
|
|
|
|
|
match output.status.code() {
|
|
|
|
|
None => panic!("Node process was killed"),
|
|
|
|
|
Some(0) => {}
|
2021-06-09 12:32:22 -04:00
|
|
|
Some(code) => return Err(anyhow!("Node process exited with status {}", code)),
|
2019-01-07 10:23:01 -08:00
|
|
|
}
|
|
|
|
|
|
2019-03-12 11:54:31 -07:00
|
|
|
let mut result = String::from_utf8(output.stdout).expect("Got invalid UTF8 from node");
|
|
|
|
|
result.push('\n');
|
|
|
|
|
Ok(result)
|
2019-01-07 10:23:01 -08:00
|
|
|
}
|
2019-01-17 12:50:30 -08:00
|
|
|
|
2019-05-30 16:52:30 -07:00
|
|
|
fn write_file(path: &Path, body: impl AsRef<[u8]>) -> Result<()> {
|
2021-06-09 12:32:22 -04:00
|
|
|
fs::write(path, body)
|
|
|
|
|
.with_context(|| format!("Failed to write {:?}", path.file_name().unwrap()))
|
2019-05-30 16:52:30 -07:00
|
|
|
}
|