From 29213864442e58f167a402b231c4f206bf0e0498 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sun, 20 Jun 2021 13:23:59 +0300 Subject: [PATCH 01/53] feat(cli/loader): Add TREE_SITTER_INTERNAL_BUILD C/C++ compiler definition --- cli/loader/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cli/loader/src/lib.rs b/cli/loader/src/lib.rs index 7d26ab02..89018677 100644 --- a/cli/loader/src/lib.rs +++ b/cli/loader/src/lib.rs @@ -391,6 +391,11 @@ impl Loader { .arg("-o") .arg(&library_path) .arg("-O2"); + + // For conditional compilation of external scanner code when + // used internally by `tree-siteer parse` and other sub commands. + command.arg("-DTREE_SITTER_INTERNAL_BUILD"); + if let Some(scanner_path) = scanner_path.as_ref() { if scanner_path.extension() == Some("c".as_ref()) { command.arg("-xc").arg("-std=c99").arg(scanner_path); From bc94c0cc2f4c1c22a9d7fe310037e038941c3159 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sat, 3 Jul 2021 03:36:01 +0300 Subject: [PATCH 02/53] fix(cli): fix theme key loading from config.json, closes #1232 --- cli/config/src/lib.rs | 1 + cli/src/main.rs | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/config/src/lib.rs b/cli/config/src/lib.rs index 7979b969..289de595 100644 --- a/cli/config/src/lib.rs +++ b/cli/config/src/lib.rs @@ -14,6 +14,7 @@ use std::{env, fs}; /// This type holds the generic JSON content of the configuration file. Individual tree-sitter /// components will use the [`get`][] method to parse that JSON to extract configuration fields /// that are specific to that component. +#[derive(Debug)] pub struct Config { pub location: PathBuf, pub config: Value, diff --git a/cli/src/main.rs b/cli/src/main.rs index 8d701852..5d3e63fe 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -418,11 +418,10 @@ fn run() -> Result<()> { if let Some(highlight_config) = language_config.highlight_config(language)? 
{ let source = fs::read(path)?; - let theme_config = config.get()?; if html_mode { highlight::html( &loader, - &theme_config, + &theme_config.theme, &source, highlight_config, quiet, @@ -431,7 +430,7 @@ fn run() -> Result<()> { } else { highlight::ansi( &loader, - &theme_config, + &theme_config.theme, &source, highlight_config, time, From b4208ee3072e7c4085184765e8bc0c9e24dbc614 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sat, 3 Jul 2021 05:26:13 +0300 Subject: [PATCH 03/53] feat(cli): Add a lot of help messages for CLI options --- cli/src/main.rs | 120 +++++++++++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 42 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 8d701852..e5130503 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -35,6 +35,40 @@ fn run() -> Result<()> { BUILD_VERSION.to_string() }; + let debug_arg = Arg::with_name("debug") + .help("Show parsing debug log") + .long("debug") + .short("d"); + + let debug_graph_arg = Arg::with_name("debug-graph") + .help("Produce the log.html file with debug graphs") + .long("debug-graph") + .short("D"); + + let paths_file_arg = Arg::with_name("paths-file") + .help("The path to a file with paths to source file(s)") + .long("paths") + .takes_value(true); + + let paths_arg = Arg::with_name("paths") + .help("The source file(s) to use") + .multiple(true); + + let scope_arg = Arg::with_name("scope") + .help("Select a language by the scope instead of a file extension") + .long("scope") + .takes_value(true); + + let time_arg = Arg::with_name("time") + .help("Measure execution time") + .long("time") + .short("t"); + + let quiet_arg = Arg::with_name("quiet") + .help("Suppress main output") + .long("quiet") + .short("q"); + let matches = App::new("tree-sitter") .author("Max Brunsfeld ") .about("Generates and tests parsers") @@ -65,23 +99,29 @@ fn run() -> Result<()> { SubCommand::with_name("parse") .alias("p") .about("Parse files") - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) - .arg( - Arg::with_name("paths") - .index(1) - .multiple(true) - .required(false), - ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("debug").long("debug").short("d")) - .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")) + .arg(&paths_file_arg) + .arg(&paths_arg) + .arg(&scope_arg) + .arg(&debug_arg) + .arg(&debug_graph_arg) .arg(Arg::with_name("debug-xml").long("xml").short("x")) - .arg(Arg::with_name("quiet").long("quiet").short("q")) - .arg(Arg::with_name("stat").long("stat").short("s")) - .arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("timeout").long("timeout").takes_value(true)) + .arg( + Arg::with_name("stat") + .help("Show parsing statistic") + .long("stat") + .short("s"), + ) + .arg( + Arg::with_name("timeout") + .help("Interrupt the parsing process by timeout (µs)") + .long("timeout") + .takes_value(true), + ) + .arg(&time_arg) + .arg(&quiet_arg) .arg( Arg::with_name("edits") + .help("Apply edits in the format: \"row,col del_count insert_text\"") .long("edit") .short("edit") .takes_value(true) @@ -93,36 +133,32 @@ fn run() -> Result<()> { SubCommand::with_name("query") .alias("q") .about("Search files using a syntax tree query") - .arg(Arg::with_name("query-path").index(1).required(true)) - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("paths") - .index(2) - .multiple(true) - .required(false), + Arg::with_name("query-path") + .help("Path to a file with queries") + 
.index(1) + .required(true), ) + .arg(&paths_file_arg) + .arg(&paths_arg.clone().index(2)) .arg( Arg::with_name("byte-range") .help("The range of byte offsets in which the query will be executed") .long("byte-range") .takes_value(true), ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) + .arg(&scope_arg) .arg(Arg::with_name("captures").long("captures").short("c")) .arg(Arg::with_name("test").long("test")), ) .subcommand( SubCommand::with_name("tags") - .arg(Arg::with_name("quiet").long("quiet").short("q")) - .arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) - .arg( - Arg::with_name("paths") - .help("The source file to use") - .index(1) - .multiple(true), - ), + .about("Generate a list of tags") + .arg(&scope_arg) + .arg(&time_arg) + .arg(&quiet_arg) + .arg(&paths_file_arg) + .arg(&paths_arg), ) .subcommand( SubCommand::with_name("test") @@ -141,23 +177,23 @@ fn run() -> Result<()> { .short("u") .help("Update all syntax trees in corpus files with current parser output"), ) - .arg(Arg::with_name("debug").long("debug").short("d")) - .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), + .arg(&debug_arg) + .arg(&debug_graph_arg), ) .subcommand( SubCommand::with_name("highlight") .about("Highlight a file") - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("paths") - .index(1) - .multiple(true) - .required(false), + Arg::with_name("html") + .help("Generate highlighting as an HTML document") + .long("html") + .short("H"), ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("html").long("html").short("H")) - .arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("quiet").long("quiet").short("q")), + .arg(&scope_arg) + .arg(&time_arg) + .arg(&quiet_arg) + .arg(&paths_file_arg) + .arg(&paths_arg), ) .subcommand( SubCommand::with_name("build-wasm") From 4adc2f5c882e97b3ef7206960972b4646fff2c3c Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Sun, 11 Jul 2021 10:25:42 -0500 Subject: [PATCH 04/53] Docs: document `_` wildcard node Closes https://github.com/tree-sitter/tree-sitter/issues/1253 --- docs/section-2-using-parsers.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 06aa0c00..86b5d750 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -586,8 +586,10 @@ This pattern would match a set of possible keyword tokens, capturing them as `@k #### Wildcard Node -A wildcard node is represented with an underscore (`(_)`), it matches any node. +A wildcard node is represented with an underscore (`_`), it matches any node. This is similar to `.` in regular expressions. +There are two types, `(_)` will match any named node, +and `_` will match any named or anonymous node. 
For example, this pattern would match any node inside a call: From 9cb732859ff894c246a1b7cb724b9567eb4ba161 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 12 Jul 2021 12:21:10 -0700 Subject: [PATCH 05/53] Update unit test to reflect HTML grammar change --- cli/src/tests/parser_test.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 58cd1880..417513d9 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -849,7 +849,10 @@ fn test_parsing_with_multiple_included_ranges() { hello_text_node.start_byte(), source_code.find("Hello").unwrap() ); - assert_eq!(hello_text_node.end_byte(), source_code.find("").unwrap()); + assert_eq!( + hello_text_node.end_byte(), + source_code.find(" ").unwrap() + ); assert_eq!(b_start_tag_node.kind(), "start_tag"); assert_eq!( From 7f538170bfb45ed6f647236f4bc87306a2eb21b6 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Thu, 15 Jul 2021 08:26:14 +0300 Subject: [PATCH 06/53] fix(parser): count rows in the debug log from 0 --- cli/src/tests/parser_test.rs | 7 ++++++- lib/src/parser.c | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 417513d9..a24ed4bb 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -63,9 +63,14 @@ fn test_parsing_with_logging() { ))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); + let mut row_starts_from_0 = false; for (_, m) in &messages { - assert!(!m.contains("row:0")); + if m.contains("row:0") { + row_starts_from_0 = true; + break; + } } + assert!(row_starts_from_0); } #[test] diff --git a/lib/src/parser.c b/lib/src/parser.c index 0f0b4ac4..bf9b7f3b 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -417,7 +417,7 @@ static Subtree ts_parser__lex( LOG( "lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, - current_position.extent.row + 1, + current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); @@ -456,7 +456,7 @@ static Subtree ts_parser__lex( LOG( "lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, - current_position.extent.row + 1, + current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); @@ -1884,7 +1884,7 @@ TSTree *ts_parser_parse( LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", version, ts_stack_version_count(self->stack), ts_stack_state(self->stack, version), - ts_stack_position(self->stack, version).extent.row + 1, + ts_stack_position(self->stack, version).extent.row, ts_stack_position(self->stack, version).extent.column); if (!ts_parser__advance(self, version, allow_node_reuse)) return NULL; From 13108baef7e0b8d7ced54619550e5a11d9cb465e Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sat, 3 Jul 2021 04:07:47 +0300 Subject: [PATCH 07/53] fix(cli): Improve error messages on config.json loading, closes #1227 --- cli/config/src/lib.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cli/config/src/lib.rs b/cli/config/src/lib.rs index 7979b969..cffe7aa7 100644 --- a/cli/config/src/lib.rs +++ b/cli/config/src/lib.rs @@ -1,6 +1,6 @@ //! Manages tree-sitter's configuration file. 
-use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::path::PathBuf; @@ -64,8 +64,10 @@ impl Config { Some(location) => location, None => return Config::initial(), }; - let content = fs::read_to_string(&location)?; - let config = serde_json::from_str(&content)?; + let content = fs::read_to_string(&location) + .with_context(|| format!("Failed to read {}", &location.to_string_lossy()))?; + let config = serde_json::from_str(&content) + .with_context(|| format!("Bad JSON config {}", &location.to_string_lossy()))?; Ok(Config { location, config }) } From 505bc5e1af321dabb97cb6fe538dfbf888ecdd4c Mon Sep 17 00:00:00 2001 From: Vladimir Panteleev Date: Mon, 19 Jul 2021 10:44:19 +0000 Subject: [PATCH 08/53] feat(cli): Make "test" output more readable --- cli/src/test.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cli/src/test.rs b/cli/src/test.rs index 9c6987d7..acda8ae9 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -114,7 +114,9 @@ pub fn run_tests_at_path( print_diff_key(); for (i, (name, actual, expected)) in failures.iter().enumerate() { println!("\n {}. {}:", i + 1, name); - print_diff(actual, expected); + let actual = format_sexp_indented(&actual, 2); + let expected = format_sexp_indented(&expected, 2); + print_diff(&actual, &expected); } Err(anyhow!("")) } @@ -153,8 +155,7 @@ pub fn print_diff_key() { } pub fn print_diff(actual: &String, expected: &String) { - let changeset = Changeset::new(actual, expected, " "); - print!(" "); + let changeset = Changeset::new(actual, expected, "\n"); for diff in &changeset.diffs { match diff { Difference::Same(part) => { @@ -263,9 +264,13 @@ fn run_tests( } fn format_sexp(sexp: &String) -> String { + format_sexp_indented(sexp, 0) +} + +fn format_sexp_indented(sexp: &String, initial_indent_level: u32) -> String { let mut formatted = String::new(); - let mut indent_level = 0; + let mut indent_level = initial_indent_level; let mut has_field = false; let mut s_iter = sexp.split(|c| c == ' ' || c == ')'); while let Some(s) = s_iter.next() { From 66cfc05d76651f5e7ab961db57ecb8e1eff2795a Mon Sep 17 00:00:00 2001 From: Niklas Mohrin <47574893+niklasmohrin@users.noreply.github.com> Date: Sun, 25 Jul 2021 14:08:46 +0200 Subject: [PATCH 09/53] Fix highlighting typo on "creating parsers" site --- docs/section-3-creating-parsers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 83e5a1c9..4433ea00 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -95,7 +95,7 @@ Let's go over all of the functionality of the `tree-sitter` command line tool. ### Command: `generate` -The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter` generate again. +The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter generate` again. 
The first time you run `tree-sitter generate`, it will also generate a few other files: From d8ac8f5d166be2578cfbb8b1bfbaa7d126e6533a Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Wed, 4 Aug 2021 03:38:58 +0300 Subject: [PATCH 10/53] fix(cli): allow dead code in Logger --- cli/src/logger.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/src/logger.rs b/cli/src/logger.rs index 6abe6470..ce4f74a3 100644 --- a/cli/src/logger.rs +++ b/cli/src/logger.rs @@ -1,5 +1,6 @@ use log::{LevelFilter, Log, Metadata, Record}; +#[allow(dead_code)] struct Logger { pub filter: Option, } From 99243ddaf2620c8a8f29744c17b0e7bfca97debb Mon Sep 17 00:00:00 2001 From: Kolja Lampe Date: Thu, 29 Jul 2021 22:39:37 +0200 Subject: [PATCH 11/53] Correct quiet description for playground --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cbe1f0ca..f64e4973 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -216,7 +216,7 @@ fn run() -> Result<()> { Arg::with_name("quiet") .long("quiet") .short("q") - .help("open in default browser"), + .help("Don't open in default browser"), ), ) .subcommand( From c21bec371673f5e5994544521a958c7dd8a8e3d9 Mon Sep 17 00:00:00 2001 From: Kolja Lampe Date: Thu, 29 Jul 2021 22:40:13 +0200 Subject: [PATCH 12/53] Always print where the playground is running --- cli/src/web_ui.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/src/web_ui.rs b/cli/src/web_ui.rs index bbdbd381..d3b51ade 100644 --- a/cli/src/web_ui.rs +++ b/cli/src/web_ui.rs @@ -73,8 +73,9 @@ pub fn serve(grammar_path: &Path, open_in_browser: bool) { ) }) .unwrap(); + println!("Started playground on '{}'", url); if open_in_browser { - if let Err(_) = webbrowser::open(&format!("http://127.0.0.1:{}", port)) { + if let Err(_) = webbrowser::open(&url) { eprintln!("Failed to open '{}' in a web browser", url); } } From 533073cdb5ffcd73a985ee01e7b29ac5c17d35d0 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Thu, 5 Aug 2021 03:50:10 +0300 Subject: [PATCH 13/53] fix(cli): Remove tree-sitter grammar ./... 
call limitation --- cli/src/generate/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 123e6ffa..141fdff0 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -169,6 +169,7 @@ fn load_grammar_file(grammar_path: &Path) -> Result { } fn load_js_grammar_file(grammar_path: &Path) -> Result { + let grammar_path = fs::canonicalize(grammar_path)?; let mut node_process = Command::new("node") .env("TREE_SITTER_GRAMMAR_PATH", grammar_path) .stdin(Stdio::piped()) From cf69a2c94cef61b8d07e80832e6e92cc78217801 Mon Sep 17 00:00:00 2001 From: Paul Gey Date: Wed, 4 Aug 2021 21:15:55 +0200 Subject: [PATCH 14/53] Use `IndexMap` and `FxHash` for some hot hash maps --- Cargo.lock | 8 +++++++ cli/Cargo.toml | 2 ++ .../build_tables/build_parse_table.rs | 22 +++++++++---------- cli/src/generate/tables.rs | 11 +++++++--- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6be4cdc..d71fa1bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,6 +495,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "ryu" version = "1.0.5" @@ -689,11 +695,13 @@ dependencies = [ "dirs", "glob", "html-escape", + "indexmap", "lazy_static", "log", "rand", "regex", "regex-syntax", + "rustc-hash", "serde", "serde_derive", "serde_json", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 6ec541dd..8d280e5d 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -27,9 +27,11 @@ difference = "2.0" dirs = "3.0" glob = "0.3.0" html-escape = "0.2.6" +indexmap = "1" lazy_static = "1.2.0" regex = "1" regex-syntax = "0.6.4" +rustc-hash = "1" serde = "1.0" serde_derive = "1.0" smallbitvec = "2.3.0" diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index bcce614a..59ee631d 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -11,10 +11,14 @@ use crate::generate::tables::{ ProductionInfo, ProductionInfoId, }; use anyhow::{anyhow, Result}; +use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::Write; +use std::hash::BuildHasherDefault; use std::u32; -use std::{cmp::Ordering, collections::hash_map::Entry}; + +use indexmap::{map::Entry, IndexMap}; +use rustc_hash::FxHasher; // For conflict reporting, each parse state is associated with an example // sequence of symbols that could lead to that parse state. 
@@ -49,7 +53,7 @@ struct ParseTableBuilder<'a> { lexical_grammar: &'a LexicalGrammar, variable_info: &'a Vec, core_ids_by_core: HashMap, usize>, - state_ids_by_item_set: HashMap, ParseStateId>, + state_ids_by_item_set: IndexMap, ParseStateId, BuildHasherDefault>, parse_state_info_by_id: Vec>, parse_state_queue: VecDeque, non_terminal_extra_states: Vec<(Symbol, usize)>, @@ -147,13 +151,7 @@ impl<'a> ParseTableBuilder<'a> { Entry::Vacant(v) => { let core = v.key().core(); let core_count = self.core_ids_by_core.len(); - let core_id = match self.core_ids_by_core.entry(core) { - Entry::Occupied(e) => *e.get(), - Entry::Vacant(e) => { - e.insert(core_count); - core_count - } - }; + let core_id = *self.core_ids_by_core.entry(core).or_insert(core_count); let state_id = self.parse_table.states.len(); self.parse_state_info_by_id @@ -163,8 +161,8 @@ impl<'a> ParseTableBuilder<'a> { id: state_id, lex_state_id: 0, external_lex_state_id: 0, - terminal_entries: HashMap::new(), - nonterminal_entries: HashMap::new(), + terminal_entries: IndexMap::default(), + nonterminal_entries: IndexMap::default(), core_id, }); self.parse_state_queue.push_back(ParseStateQueueEntry { @@ -981,7 +979,7 @@ pub(crate) fn build_parse_table<'a>( item_set_builder, variable_info, non_terminal_extra_states: Vec::new(), - state_ids_by_item_set: HashMap::new(), + state_ids_by_item_set: IndexMap::default(), core_ids_by_core: HashMap::new(), parse_state_info_by_id: Vec::new(), parse_state_queue: VecDeque::new(), diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index ccbf8895..799fe02d 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -1,10 +1,15 @@ use super::nfa::CharacterSet; use super::rules::{Alias, Symbol, TokenSet}; -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; pub(crate) type ProductionInfoId = usize; pub(crate) type ParseStateId = usize; pub(crate) type LexStateId = usize; +use std::hash::BuildHasherDefault; + +use indexmap::IndexMap; +use rustc_hash::FxHasher; + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub(crate) enum ParseAction { Accept, @@ -37,8 +42,8 @@ pub(crate) struct ParseTableEntry { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct ParseState { pub id: ParseStateId, - pub terminal_entries: HashMap, - pub nonterminal_entries: HashMap, + pub terminal_entries: IndexMap>, + pub nonterminal_entries: IndexMap>, pub lex_state_id: usize, pub external_lex_state_id: usize, pub core_id: usize, From 965e3c9e5e968cd54d119cfbbef6604c5f4f7e3c Mon Sep 17 00:00:00 2001 From: Paul Gey Date: Wed, 4 Aug 2021 21:31:25 +0200 Subject: [PATCH 15/53] `Generator::add_parse_table`: Store entries in hash map This avoids a quadratic behaviour due to repeatedly using `find` on a growing `Vec`. 
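An illustrative sketch of the idea, not taken from the patch itself: the hypothetical `Entry` type and the two helper functions below only contrast the old linear scan with the new hash-map lookup; the real bookkeeping in `render.rs` advances the index by `1 + entry.actions.len()` and sorts the collected entries back into index order afterwards, as the diff below shows.

```rust
use std::collections::HashMap;

// Simplified stand-in for `ParseTableEntry`; the real type needs
// `#[derive(Hash)]` for this to work, which is why the patch adds
// `Hash` to `ParseAction` and `ParseTableEntry`.
#[derive(Clone, PartialEq, Eq, Hash)]
struct Entry(&'static str);

// Before: every lookup scans all previously seen entries, so assigning
// ids to n distinct entries costs O(n^2) comparisons overall.
fn index_via_scan(entry: &Entry, seen: &mut Vec<(usize, Entry)>, next: &mut usize) -> usize {
    if let Some((index, _)) = seen.iter().find(|(_, e)| e == entry) {
        return *index;
    }
    let index = *next;
    seen.push((index, entry.clone()));
    *next += 1;
    index
}

// After: a HashMap keyed by the entry gives an O(1) average lookup,
// keeping the whole pass roughly linear.
fn index_via_map(entry: &Entry, seen: &mut HashMap<Entry, usize>, next: &mut usize) -> usize {
    if let Some(&index) = seen.get(entry) {
        return index;
    }
    let index = *next;
    seen.insert(entry.clone(), index);
    *next += 1;
    index
}

fn main() {
    let input = [Entry("shift"), Entry("reduce"), Entry("shift")];
    let (mut vec_seen, mut map_seen) = (Vec::new(), HashMap::new());
    let (mut next_a, mut next_b) = (0usize, 0usize);
    let a: Vec<_> = input
        .iter()
        .map(|e| index_via_scan(e, &mut vec_seen, &mut next_a))
        .collect();
    let b: Vec<_> = input
        .iter()
        .map(|e| index_via_map(e, &mut map_seen, &mut next_b))
        .collect();
    // Both strategies assign the same ids: [0, 1, 0].
    assert_eq!(a, b);
    println!("indices: {:?} (first entry kind: {})", a, input[0].0);
}
```

Keying the map by the entry itself is also why this patch adds `Hash` to `ParseAction` and `ParseTableEntry` in `tables.rs`.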
--- cli/src/generate/render.rs | 23 ++++++++++++++--------- cli/src/generate/tables.rs | 4 ++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 78a07a22..613776bf 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1057,7 +1057,7 @@ impl Generator { } fn add_parse_table(&mut self) { - let mut parse_table_entries = Vec::new(); + let mut parse_table_entries = HashMap::new(); let mut next_parse_action_list_index = 0; self.get_parse_action_list_id( @@ -1224,6 +1224,11 @@ impl Generator { add_line!(self, ""); } + let mut parse_table_entries: Vec<_> = parse_table_entries + .into_iter() + .map(|(entry, i)| (i, entry)) + .collect(); + parse_table_entries.sort_by_key(|(index, _)| *index); self.add_parse_action_list(parse_table_entries); } @@ -1404,17 +1409,17 @@ impl Generator { fn get_parse_action_list_id( &self, entry: &ParseTableEntry, - parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + parse_table_entries: &mut HashMap, next_parse_action_list_index: &mut usize, ) -> usize { - if let Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { - return *index; + if let Some(&index) = parse_table_entries.get(entry) { + index + } else { + let result = *next_parse_action_list_index; + parse_table_entries.insert(entry.clone(), result); + *next_parse_action_list_index += 1 + entry.actions.len(); + result } - - let result = *next_parse_action_list_index; - parse_table_entries.push((result, entry.clone())); - *next_parse_action_list_index += 1 + entry.actions.len(); - result } fn get_field_map_id( diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index 799fe02d..16bf1851 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -10,7 +10,7 @@ use std::hash::BuildHasherDefault; use indexmap::IndexMap; use rustc_hash::FxHasher; -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) enum ParseAction { Accept, Shift { @@ -33,7 +33,7 @@ pub(crate) enum GotoAction { ShiftExtra, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] pub(crate) struct ParseTableEntry { pub actions: Vec, pub reusable: bool, From f34c13d2a7bd0d5fca47387def3a5bcf3951d339 Mon Sep 17 00:00:00 2001 From: Paul Gey Date: Sun, 8 Aug 2021 16:31:10 +0200 Subject: [PATCH 16/53] Update `smallbitvec` dependency --- Cargo.lock | 4 ++-- cli/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d71fa1bc..61262a3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -547,9 +547,9 @@ dependencies = [ [[package]] name = "smallbitvec" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797a4eaffb90d896f29698d45676f9f940a71936d7574996a7df54593ba209fa" +checksum = "75ce4f9dc4a41b4c3476cc925f1efb11b66df373a8fde5d4b8915fa91b5d995e" [[package]] name = "spin" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 8d280e5d..75f7a95b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -34,7 +34,7 @@ regex-syntax = "0.6.4" rustc-hash = "1" serde = "1.0" serde_derive = "1.0" -smallbitvec = "2.3.0" +smallbitvec = "2.5.1" tiny_http = "0.8" walkdir = "2.3" webbrowser = "0.5.1" From a533e4d7bb77291650dde293d6abbc4fea886d67 Mon Sep 17 00:00:00 2001 From: Paul Gey Date: Sat, 14 Aug 2021 15:01:16 +0200 Subject: [PATCH 17/53] Remove unnecessary borrows This produces an 
`unused_must_use` warning on nightly: https://github.com/rust-lang/rust/pull/86426 --- cli/src/generate/build_tables/build_lex_table.rs | 2 +- cli/src/generate/build_tables/minimize_parse_table.rs | 2 +- highlight/src/lib.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index b365feb1..d3ebb241 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -347,7 +347,7 @@ fn lex_states_differ( fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) { // Get a mapping of old state index -> new_state_index let mut old_ids_by_new_id = (0..table.states.len()).collect::>(); - &old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]); + old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]); // Get the inverse mapping let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()]; diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 4c2224c4..d10bea56 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -479,7 +479,7 @@ impl<'a> Minimizer<'a> { fn reorder_states_by_descending_size(&mut self) { // Get a mapping of old state index -> new_state_index let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::>(); - &old_ids_by_new_id.sort_unstable_by_key(|i| { + old_ids_by_new_id.sort_unstable_by_key(|i| { // Don't changes states 0 (the error state) or 1 (the start state). if *i <= 1 { return *i as i64 - 1_000_000; diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index b7bfeba8..58d7e88c 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -586,7 +586,7 @@ where break; } if i > 0 { - &self.layers[0..(i + 1)].rotate_left(1); + self.layers[0..(i + 1)].rotate_left(1); } break; } else { From e030434ca75a8f4e5b1bbd38361a7cbe64777aab Mon Sep 17 00:00:00 2001 From: FnControlOption <70830482+FnControlOption@users.noreply.github.com> Date: Wed, 18 Aug 2021 22:22:46 -0700 Subject: [PATCH 18/53] Handle aliases in unicode property escapes in regexes --- .../generate/prepare_grammar/expand_tokens.rs | 18 ++- .../unicode-category-aliases.json | 1 + .../unicode-property-aliases.json | 1 + script/generate-unicode-categories-json | 119 +++++++++++++++++- .../test_grammars/unicode_classes/corpus.txt | 11 ++ .../unicode_classes/grammar.json | 8 +- 6 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 cli/src/generate/prepare_grammar/unicode-category-aliases.json create mode 100644 cli/src/generate/prepare_grammar/unicode-property-aliases.json diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index a5fe318b..4950348f 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -19,10 +19,16 @@ lazy_static! 
{ serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap(); static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec> = serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap(); + static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> = + serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap(); + static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> = + serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap(); } const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json"); const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json"); +const UNICODE_CATEGORY_ALIASES_JSON: &'static str = include_str!("./unicode-category-aliases.json"); +const UNICODE_PROPERTY_ALIASES_JSON: &'static str = include_str!("./unicode-property-aliases.json"); const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; struct NfaBuilder { @@ -394,12 +400,16 @@ impl NfaBuilder { category_letter = le.to_string(); } ClassUnicodeKind::Named(class_name) => { - if class_name.len() == 1 { - category_letter = class_name.clone(); + let actual_class_name = UNICODE_CATEGORY_ALIASES + .get(class_name.as_str()) + .or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str())) + .unwrap_or(class_name); + if actual_class_name.len() == 1 { + category_letter = actual_class_name.clone(); } else { let code_points = UNICODE_CATEGORIES - .get(class_name.as_str()) - .or_else(|| UNICODE_PROPERTIES.get(class_name.as_str())) + .get(actual_class_name.as_str()) + .or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str())) .ok_or_else(|| { anyhow!( "Regex error: Unsupported unicode character class {}", diff --git a/cli/src/generate/prepare_grammar/unicode-category-aliases.json b/cli/src/generate/prepare_grammar/unicode-category-aliases.json new file mode 100644 index 00000000..c7091c05 --- /dev/null +++ b/cli/src/generate/prepare_grammar/unicode-category-aliases.json @@ -0,0 +1 @@ +{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"} \ No newline at end of file diff --git a/cli/src/generate/prepare_grammar/unicode-property-aliases.json b/cli/src/generate/prepare_grammar/unicode-property-aliases.json new file mode 100644 index 00000000..2dd2e28c --- /dev/null +++ b/cli/src/generate/prepare_grammar/unicode-property-aliases.json @@ -0,0 +1 @@ 
+{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_O
n_NFKD"} \ No newline at end of file diff --git a/script/generate-unicode-categories-json b/script/generate-unicode-categories-json index 2dd36c3a..a106862e 100755 --- a/script/generate-unicode-categories-json +++ b/script/generate-unicode-categories-json @@ -4,10 +4,14 @@ const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json' const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json' +const CATEGORY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-category-aliases.json' +const PROPERTY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-property-aliases.json' const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt' const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt' const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt' +const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt' +const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyAliases.txt' const fs = require('fs'); const path = require('path'); @@ -16,7 +20,9 @@ const {spawnSync} = require('child_process'); // Download the unicode data files, caching them inside the 'target' directory. const categoryData = cachedDownload(CATEGORY_URL); const propertyData = cachedDownload(PROPERTY_URL); -const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL); +const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL); +const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL); +const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL); function cachedDownload(url) { let downloadPath = path.join('.', 'target', path.basename(url)) if (fs.existsSync(downloadPath)) { @@ -30,10 +36,12 @@ function cachedDownload(url) { const categories = {}; const properties = {}; +const categoryAliases = {}; +const propertyAliases = {} let data, row, lineStart, lineEnd; // Parse the properties -data = propertyData + derivedPopertyData; +data = propertyData + derivedPropertyData; row = 0; lineStart = 0; lineEnd = -1; @@ -106,7 +114,7 @@ while (lineStart < data.length) { if ( nameStart === 0 || categoryStart == 0 || - categoryEnd === 0 + categoryEnd === -1 ) { throw new Error(`Unexpected format on line ${row}`); } @@ -124,5 +132,110 @@ while (lineStart < data.length) { categories[category].push(codePoint); } +// Parse the category aliases +data = categoryAliasData; +row = 0; +lineStart = 0; +lineEnd = -1; +const IGNORE = /[#\s]/ +while (lineStart < data.length) { + row++; + lineStart = lineEnd + 1; + lineEnd = data.indexOf('\n', lineStart); + if (lineEnd === -1) break; + + // Skip over blank and comment lines + if (IGNORE.test(data[lineStart])) continue; + + // Parse the first three semicolon-separated fields: + // * property value type + // * short name + // * long name + // Other aliases may be listed in additional fields + const propertyValueTypeEnd = data.indexOf(';', lineStart); + const shortNameStart = propertyValueTypeEnd + 1; + const shortNameEnd = data.indexOf(';', shortNameStart); + const longNameStart = shortNameEnd + 1; + if ( + shortNameStart === 0 || + longNameStart === 0 + ) { + throw new Error(`Unexpected format on line ${row}`); + } + + const propertyValueType = data.slice(lineStart, propertyValueTypeEnd).trim(); + const shortName = data.slice(shortNameStart, shortNameEnd).trim(); + + // Filter for General_Category lines + if (propertyValueType !== 'gc') continue; + + let aliasStart = longNameStart; + let 
lineDone = false; + do { + let aliasEnd = data.indexOf(';', aliasStart); + if (aliasEnd === -1 || aliasEnd > lineEnd) { + aliasEnd = data.indexOf('#', aliasStart); + if (aliasEnd === -1 || aliasEnd > lineEnd) { + aliasEnd = lineEnd; + } + lineDone = true; + } + const alias = data.slice(aliasStart, aliasEnd).trim(); + console.log(alias, shortName); + categoryAliases[alias] = shortName; + aliasStart = aliasEnd + 1; + } while (!lineDone); +} + +// Parse the property aliases +data = propertyAliasData; +row = 0; +lineStart = 0; +lineEnd = -1; +while (lineStart < data.length) { + row++; + lineStart = lineEnd + 1; + lineEnd = data.indexOf('\n', lineStart); + if (lineEnd === -1) break; + + // Skip over blank and comment lines + if (IGNORE.test(data[lineStart])) continue; + + // Parse the first two semicolon fields: + // * short name + // * long name + const shortNameEnd = data.indexOf(';', lineStart); + const longNameStart = shortNameEnd + 1; + + if (longNameStart == 0) { + throw new Error(`Unexpected format on line ${row}`); + } + + let alias = data.slice(lineStart, shortNameEnd).trim(); + let longName = null; + let nameStart = longNameStart; + let lineDone = false; + do { + let nameEnd = data.indexOf(';', nameStart); + if (nameEnd === -1 || nameEnd > lineEnd) { + nameEnd = data.indexOf('#', nameStart); + if (nameEnd === -1 || nameEnd > lineEnd) { + nameEnd = lineEnd; + } + lineDone = true; + } + if (longName == null) { + longName = data.slice(nameStart, nameEnd).trim(); + } else { + alias = data.slice(nameStart, nameEnd).trim(); + } + console.log(alias, longName); + propertyAliases[alias] = longName; + nameStart = nameEnd + 1; + } while (!lineDone); +} + fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8'); fs.writeFileSync(PROPERTY_OUTPUT_PATH, JSON.stringify(properties), 'utf8'); +fs.writeFileSync(CATEGORY_ALIAS_OUTPUT_PATH, JSON.stringify(categoryAliases), 'utf8'); +fs.writeFileSync(PROPERTY_ALIAS_OUTPUT_PATH, JSON.stringify(propertyAliases), 'utf8'); diff --git a/test/fixtures/test_grammars/unicode_classes/corpus.txt b/test/fixtures/test_grammars/unicode_classes/corpus.txt index d28d1acb..9c35be27 100644 --- a/test/fixtures/test_grammars/unicode_classes/corpus.txt +++ b/test/fixtures/test_grammars/unicode_classes/corpus.txt @@ -30,3 +30,14 @@ Math symbols (program (math_sym) (math_sym) (math_sym) (math_sym) (math_sym)) + +================================ +Letterlike numeric characters +================================ + +ᛯ Ⅵ 〩 + +--- + +(program + (letter_number) (letter_number) (letter_number)) diff --git a/test/fixtures/test_grammars/unicode_classes/grammar.json b/test/fixtures/test_grammars/unicode_classes/grammar.json index 9b040867..7a36d0c1 100644 --- a/test/fixtures/test_grammars/unicode_classes/grammar.json +++ b/test/fixtures/test_grammars/unicode_classes/grammar.json @@ -13,7 +13,8 @@ "members": [ {"type": "SYMBOL", "name": "lower"}, {"type": "SYMBOL", "name": "upper"}, - {"type": "SYMBOL", "name": "math_sym"} + {"type": "SYMBOL", "name": "math_sym"}, + {"type": "SYMBOL", "name": "letter_number"} ] } }, @@ -31,6 +32,11 @@ "math_sym": { "type": "PATTERN", "value": "\\p{Sm}+" + }, + + "letter_number": { + "type": "PATTERN", + "value": "\\p{Letter_Number}" } } } From 03b6a00bb8a02d54ffa2c80242f2af673e46329c Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Sat, 10 Apr 2021 14:38:37 -0400 Subject: [PATCH 19/53] Support for suffixes in test file separators Some languages use the non-suffixed separators in their syntax Fixes #982 --- cli/src/test.rs | 156 
++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 143 insertions(+), 13 deletions(-) diff --git a/cli/src/test.rs b/cli/src/test.rs index acda8ae9..ef1080ef 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -5,7 +5,6 @@ use difference::{Changeset, Difference}; use lazy_static::lazy_static; use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; use regex::Regex; -use std::char; use std::ffi::OsStr; use std::fmt::Write as FmtWrite; use std::fs; @@ -16,10 +15,16 @@ use tree_sitter::{Language, LogType, Parser, Query}; use walkdir::WalkDir; lazy_static! { - static ref HEADER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^===+\r?\n([^=]*)\r?\n===+\r?\n") - .multi_line(true) - .build() - .unwrap(); + static ref FIRST_HEADER_REGEX: ByteRegex = + ByteRegexBuilder::new(r"^===+(?P[^=\r\n]*)\r?\n") + .multi_line(true) + .build() + .unwrap(); + static ref HEADER_REGEX: ByteRegex = + ByteRegexBuilder::new(r"^===+\r?\n(?P[^=\r\n]*)\r?\n===+\r?\n") + .multi_line(true) + .build() + .unwrap(); static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n") .multi_line(true) .build() @@ -380,16 +385,56 @@ fn parse_test_content(name: String, content: String, file_path: Option) let mut prev_name = String::new(); let mut prev_header_end = 0; + let suffix = FIRST_HEADER_REGEX + .captures(bytes) + .and_then(|c| c.name("suffix")) + .map(|m| &bytes[m.range()]) + .map(|b| String::from_utf8_lossy(b).to_string()) + .map(|s| regex::escape(&s[..])); + + let suffix_header_pattern: Option = suffix.as_ref().map(|s| { + String::from(r"^===+") + s + r"\r?\n(?P[^\r\n]*)\r?\n===+" + s + r"\r?\n" + }); + + let header_regex_from_suffix_header_pattern = suffix_header_pattern + .as_ref() + .and_then(|s| ByteRegexBuilder::new(&s[..]).multi_line(true).build().ok()); + + let header_regex = header_regex_from_suffix_header_pattern + .as_ref() + .unwrap_or(&HEADER_REGEX); + + let suffix_divider_pattern: Option = suffix + .as_ref() + .map(|s| String::from(r"^---+") + s + r"\r?\n"); + + let divider_regex_from_suffix_divider_pattern = suffix_divider_pattern + .as_ref() + .and_then(|s| ByteRegexBuilder::new(&s[..]).multi_line(true).build().ok()); + + let divider_regex = divider_regex_from_suffix_divider_pattern + .as_ref() + .unwrap_or(&DIVIDER_REGEX); + // Identify all of the test descriptions using the `======` headers. - for (header_start, header_end) in HEADER_REGEX - .find_iter(&bytes) - .map(|m| (m.start(), m.end())) - .chain(Some((bytes.len(), bytes.len()))) + // Must be followed by custom suffix if defined on first header. + // Capture index 0 corresponds to entire match and is guaranteed to exist. + for (header_start, header_end, test_name_capture) in header_regex + .captures_iter(&bytes) + .map(|c| { + ( + c.get(0).unwrap().start(), + c.get(0).unwrap().end(), + c.name("test_name"), + ) + }) + .chain(Some((bytes.len(), bytes.len(), None))) { // Find the longest line of dashes following each test description. // That is the divider between input and expected output. + // Must be followed by custom suffix if defined on first header. 
if prev_header_end > 0 { - let divider_match = DIVIDER_REGEX + let divider_match = divider_regex .find_iter(&bytes[prev_header_end..header_start]) .map(|m| (prev_header_end + m.start(), prev_header_end + m.end())) .max_by_key(|(start, end)| end - start); @@ -422,9 +467,10 @@ fn parse_test_content(name: String, content: String, file_path: Option) } } } - prev_name = String::from_utf8_lossy(&bytes[header_start..header_end]) - .trim_matches(|c| char::is_whitespace(c) || c == '=') - .to_string(); + prev_name = test_name_capture + .map(|m| &bytes[m.range()]) + .map(|b| String::from_utf8_lossy(b).to_string()) + .unwrap_or(String::new()); prev_header_end = header_end; } TestEntry::Group { @@ -669,4 +715,88 @@ code } ); } + + #[test] + fn test_parse_test_content_with_suffixes() { + let entry = parse_test_content( + "the-filename".to_string(), + r#" +==================asdf\()[]|{}*+?^$.- +First test +==================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + +==================asdf\()[]|{}*+?^$.- +Second test +==================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + +=========================asdf\()[]|{}*+?^$.- +Test name with = symbol +=========================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + "# + .trim() + .to_string(), + None, + ); + + let expected_input = "\n=========================\n\ + NOT A TEST HEADER\n\ + =========================\n\ + -------------------------\n" + .as_bytes() + .to_vec(); + assert_eq!( + entry, + TestEntry::Group { + name: "the-filename".to_string(), + children: vec![ + TestEntry::Example { + name: "First test".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + }, + TestEntry::Example { + name: "Second test".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + }, + TestEntry::Example { + name: "Test name with = symbol".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + } + ], + file_path: None, + } + ); + } } From 88601000a29f2e33e109ef930ed9d9ad94fc894f Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sun, 22 Aug 2021 03:09:50 +0300 Subject: [PATCH 20/53] chore(cli): Add the LICENSE file to the tree-sitter-cli npm package --- LICENSE | 2 +- cli/npm/.gitignore | 1 + cli/npm/package.json | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 971b81f9..4c220022 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2018 Max Brunsfeld +Copyright (c) 2018-2021 Max Brunsfeld Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/cli/npm/.gitignore b/cli/npm/.gitignore index 2d3aa23a..942b33a1 100644 --- a/cli/npm/.gitignore +++ b/cli/npm/.gitignore @@ -2,3 +2,4 @@ tree-sitter tree-sitter.exe *.gz *.tgz +LICENSE diff --git a/cli/npm/package.json b/cli/npm/package.json index 66c7ccb2..cb0f30f7 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -14,7 +14,8 @@ ], "main": "lib/api/index.js", "scripts": { - "install": "node install.js" + "install": "node install.js", + "prepack": "cp 
../../LICENSE ." }, "bin": { "tree-sitter": "cli.js" From f15700c6bd245f986d5ec156a63e070c6081a937 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Sun, 22 Aug 2021 03:12:28 +0300 Subject: [PATCH 21/53] chore(web): Add the LICENSE file to the web-tree-sitter npm package --- lib/binding_web/.gitignore | 1 + lib/binding_web/package.json | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/binding_web/.gitignore b/lib/binding_web/.gitignore index 1a4530c9..eec0cfe6 100644 --- a/lib/binding_web/.gitignore +++ b/lib/binding_web/.gitignore @@ -3,3 +3,4 @@ package-lock.json node_modules *.tgz +LICENSE diff --git a/lib/binding_web/package.json b/lib/binding_web/package.json index d13552e0..f140d46a 100644 --- a/lib/binding_web/package.json +++ b/lib/binding_web/package.json @@ -9,6 +9,7 @@ }, "scripts": { "test": "mocha", + "prepack": "cp ../../LICENSE .", "prepublishOnly": "node check-artifacts-fresh.js" }, "repository": { From 3c59284df5d553b0e2e4663c4510c8c113e0ceb0 Mon Sep 17 00:00:00 2001 From: beyonddream <43626691+beyonddream@users.noreply.github.com> Date: Sun, 22 Aug 2021 04:07:49 -0700 Subject: [PATCH 22/53] Fix broken links of research papers. --- docs/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index 62a24a87..f86e1435 100644 --- a/docs/index.md +++ b/docs/index.md @@ -70,8 +70,8 @@ Parsers for these languages are in development: The design of Tree-sitter was greatly influenced by the following research papers: - [Practical Algorithms for Incremental Software Development Environments](https://www2.eecs.berkeley.edu/Pubs/TechRpts/1997/CSD-97-946.pdf) -- [Context Aware Scanning for Parsing Extensible Languages](http://www.umsec.umn.edu/publications/Context-Aware-Scanning-Parsing-Extensible) -- [Efficient and Flexible Incremental Parsing](http://ftp.cs.berkeley.edu/sggs/toplas-parsing.ps) -- [Incremental Analysis of Real Programming Languages](https://pdfs.semanticscholar.org/ca69/018c29cc415820ed207d7e1d391e2da1656f.pdf) +- [Context Aware Scanning for Parsing Extensible Languages](https://www-users.cse.umn.edu/~evw/pubs/vanwyk07gpce/vanwyk07gpce.pdf) +- [Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.pdf) +- [Incremental Analysis of Real Programming Languages](http://harmonia.cs.berkeley.edu/papers/twagner-glr.pdf) - [Error Detection and Recovery in LR Parsers](http://what-when-how.com/compiler-writing/bottom-up-parsing-compiler-writing-part-13) -- [Error Recovery for LR Parsers](http://www.dtic.mil/dtic/tr/fulltext/u2/a043470.pdf) +- [Error Recovery for LR Parsers](https://apps.dtic.mil/sti/pdfs/ADA043470.pdf) From c1849098f5f7d5fb3abdc48f075825b83f171518 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Fri, 27 Aug 2021 11:06:28 +0300 Subject: [PATCH 23/53] chore(docs): Fix getting started example, closes #891 --- docs/section-3-creating-parsers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 4433ea00..777f7fcb 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -84,7 +84,7 @@ tree-sitter parse example-file This should print the following: ``` -(source_file [1, 0] - [1, 5]) +(source_file [0, 0] - [1, 0]) ``` You now have a working parser. 
From c7f118ce552b5df84aefd79ade2db53017f97b27 Mon Sep 17 00:00:00 2001 From: rydesun Date: Sun, 29 Aug 2021 20:20:56 +0800 Subject: [PATCH 24/53] Add Graphviz DOT parser --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index 36ca6777..72b16c96 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,6 +31,7 @@ Parsers for these languages are fairly complete: * [C#](https://github.com/tree-sitter/tree-sitter-c-sharp) * [C++](https://github.com/tree-sitter/tree-sitter-cpp) * [CSS](https://github.com/tree-sitter/tree-sitter-css) +* [DOT](https://github.com/rydesun/tree-sitter-dot) * [Elm](https://github.com/elm-tooling/tree-sitter-elm) * [Eno](https://github.com/eno-lang/tree-sitter-eno) * [ERB / EJS](https://github.com/tree-sitter/tree-sitter-embedded-template) From 8fa875b1a430b9d256983d1aae8069137bd4f2e1 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Sun, 29 Aug 2021 15:03:53 -0700 Subject: [PATCH 25/53] Fix possible infinite loop when running syntax highlighting tests Fixes #1347 --- cli/src/query_testing.rs | 62 ++++++++++++++-------------- cli/src/tests/test_highlight_test.rs | 19 +++++---- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/cli/src/query_testing.rs b/cli/src/query_testing.rs index 6dc35c8d..9950f12f 100644 --- a/cli/src/query_testing.rs +++ b/cli/src/query_testing.rs @@ -48,40 +48,38 @@ pub fn parse_position_comments( if node.kind().contains("comment") { if let Ok(text) = node.utf8_text(source) { let mut position = node.start_position(); - if position.row == 0 { - continue; - } - - // Find the arrow character ("^" or '<-") in the comment. A left arrow - // refers to the column where the comment node starts. An up arrow refers - // to its own column. - let mut has_left_caret = false; - let mut has_arrow = false; - let mut arrow_end = 0; - for (i, c) in text.char_indices() { - arrow_end = i + 1; - if c == '-' && has_left_caret { - has_arrow = true; - break; + if position.row > 0 { + // Find the arrow character ("^" or '<-") in the comment. A left arrow + // refers to the column where the comment node starts. An up arrow refers + // to its own column. + let mut has_left_caret = false; + let mut has_arrow = false; + let mut arrow_end = 0; + for (i, c) in text.char_indices() { + arrow_end = i + 1; + if c == '-' && has_left_caret { + has_arrow = true; + break; + } + if c == '^' { + has_arrow = true; + position.column += i; + break; + } + has_left_caret = c == '<'; } - if c == '^' { - has_arrow = true; - position.column += i; - break; - } - has_left_caret = c == '<'; - } - // If the comment node contains an arrow and a highlight name, record the - // highlight name and the position. - if let (true, Some(mat)) = - (has_arrow, CAPTURE_NAME_REGEX.find(&text[arrow_end..])) - { - assertion_ranges.push((node.start_position(), node.end_position())); - result.push(Assertion { - position: position, - expected_capture_name: mat.as_str().to_string(), - }); + // If the comment node contains an arrow and a highlight name, record the + // highlight name and the position. 
+ if let (true, Some(mat)) = + (has_arrow, CAPTURE_NAME_REGEX.find(&text[arrow_end..])) + { + assertion_ranges.push((node.start_position(), node.end_position())); + result.push(Assertion { + position: position, + expected_capture_name: mat.as_str().to_string(), + }); + } } } } diff --git a/cli/src/tests/test_highlight_test.rs b/cli/src/tests/test_highlight_test.rs index 1a658281..af2c15c5 100644 --- a/cli/src/tests/test_highlight_test.rs +++ b/cli/src/tests/test_highlight_test.rs @@ -17,6 +17,7 @@ fn test_highlight_test_with_basic_test() { ], ); let source = [ + "// hi", "var abc = function(d) {", " // ^ function", " // ^ keyword", @@ -32,15 +33,15 @@ fn test_highlight_test_with_basic_test() { assertions, &[ Assertion { - position: Point::new(0, 5), + position: Point::new(1, 5), expected_capture_name: "function".to_string() }, Assertion { - position: Point::new(0, 11), + position: Point::new(1, 11), expected_capture_name: "keyword".to_string() }, Assertion { - position: Point::new(3, 9), + position: Point::new(4, 9), expected_capture_name: "variable.parameter".to_string() }, ] @@ -53,12 +54,12 @@ fn test_highlight_test_with_basic_test() { assert_eq!( highlight_positions, &[ - (Point::new(0, 0), Point::new(0, 3), Highlight(2)), // "var" - (Point::new(0, 4), Point::new(0, 7), Highlight(0)), // "abc" - (Point::new(0, 10), Point::new(0, 18), Highlight(2)), // "function" - (Point::new(0, 19), Point::new(0, 20), Highlight(1)), // "d" - (Point::new(3, 2), Point::new(3, 8), Highlight(2)), // "return" - (Point::new(3, 9), Point::new(3, 10), Highlight(1)), // "d" + (Point::new(1, 0), Point::new(1, 3), Highlight(2)), // "var" + (Point::new(1, 4), Point::new(1, 7), Highlight(0)), // "abc" + (Point::new(1, 10), Point::new(1, 18), Highlight(2)), // "function" + (Point::new(1, 19), Point::new(1, 20), Highlight(1)), // "d" + (Point::new(4, 2), Point::new(4, 8), Highlight(2)), // "return" + (Point::new(4, 9), Point::new(4, 10), Highlight(1)), // "d" ] ); } From 12d727fd49a2fd353e8a061e9c2cb5c29a3e21bf Mon Sep 17 00:00:00 2001 From: Johannes Rieken Date: Mon, 30 Aug 2021 14:16:41 +0200 Subject: [PATCH 26/53] mix init options in the Module-global --- lib/binding_web/binding.js | 22 +++++----------------- lib/binding_web/prefix.js | 24 +++++++++++++++--------- lib/binding_web/suffix.js | 25 +++++++++++++++++++++++-- lib/binding_web/tree-sitter-web.d.ts | 6 +++++- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index bf0a91ce..6296ed35 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -17,24 +17,15 @@ var MIN_COMPATIBLE_VERSION; var TRANSFER_BUFFER; var currentParseCallback; var currentLogCallback; -var initPromise = new Promise(resolve => { - Module.onRuntimeInitialized = resolve -}).then(() => { - TRANSFER_BUFFER = C._ts_init(); - VERSION = getValue(TRANSFER_BUFFER, 'i32'); - MIN_COMPATIBLE_VERSION = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); -}); -class Parser { +class ParserImpl { static init() { - return initPromise; + TRANSFER_BUFFER = C._ts_init(); + VERSION = getValue(TRANSFER_BUFFER, 'i32'); + MIN_COMPATIBLE_VERSION = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); } - constructor() { - if (TRANSFER_BUFFER == null) { - throw new Error('You must first call Parser.init() and wait for it to resolve.'); - } - + initialize() { C._ts_parser_new_wasm(); this[0] = getValue(TRANSFER_BUFFER, 'i32'); this[1] = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); @@ -1203,6 +1194,3 @@ function marshalEdit(edit) { 
setValue(address, edit.oldEndIndex, 'i32'); address += SIZE_OF_INT; setValue(address, edit.newEndIndex, 'i32'); address += SIZE_OF_INT; } - -Parser.Language = Language; -Parser.Parser = Parser; diff --git a/lib/binding_web/prefix.js b/lib/binding_web/prefix.js index 3653e99d..de01b78a 100644 --- a/lib/binding_web/prefix.js +++ b/lib/binding_web/prefix.js @@ -1,9 +1,15 @@ -(function (root, factory) { - if (typeof define === 'function' && define.amd) { - define([], factory); - } else if (typeof exports === 'object') { - module.exports = factory(); - } else { - window.TreeSitter = factory(); - } -}(this, function () { +var TreeSitter = function() { + var initPromise; + class Parser { + constructor() { + this.initialize(); + } + + initialize() { + throw new Error("cannot construct a Parser before calling `init()`"); + } + + static init(moduleOptions) { + if (initPromise) return initPromise; + Module = { ...Module, ...moduleOptions }; + return initPromise = new Promise((resolveInitPromise) => { diff --git a/lib/binding_web/suffix.js b/lib/binding_web/suffix.js index 0e9fe021..cd91f919 100644 --- a/lib/binding_web/suffix.js +++ b/lib/binding_web/suffix.js @@ -1,2 +1,23 @@ -return Parser; -})); + for (const name of Object.getOwnPropertyNames(ParserImpl.prototype)) { + Object.defineProperty(Parser.prototype, name, { + value: ParserImpl.prototype[name], + enumerable: false, + writable: false, + }) + } + + Parser.Language = Language; + Module.onRuntimeInitialized = () => { + ParserImpl.init(); + resolveInitPromise(); + }; + }); + } + } + + return Parser; +}(); + +if (typeof exports === 'object') { + module.exports = TreeSitter; +} \ No newline at end of file diff --git a/lib/binding_web/tree-sitter-web.d.ts b/lib/binding_web/tree-sitter-web.d.ts index 2127fa41..fd847c1c 100644 --- a/lib/binding_web/tree-sitter-web.d.ts +++ b/lib/binding_web/tree-sitter-web.d.ts @@ -1,6 +1,10 @@ declare module 'web-tree-sitter' { class Parser { - static init(): Promise; + /** + * + * @param moduleOptions Optional emscripten module-object, see https://emscripten.org/docs/api_reference/module.html + */ + static init(moduleOptions?: object): Promise; delete(): void; parse(input: string | Parser.Input, previousTree?: Parser.Tree, options?: Parser.Options): Parser.Tree; getLanguage(): any; From e7a8e73bbf9d2c5e10a7c240f547f654df294cca Mon Sep 17 00:00:00 2001 From: Johannes Rieken Date: Mon, 30 Aug 2021 14:47:47 +0200 Subject: [PATCH 27/53] fixes and additions to tree-sitter-web.d.ts --- lib/binding_web/tree-sitter-web.d.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/binding_web/tree-sitter-web.d.ts b/lib/binding_web/tree-sitter-web.d.ts index 2127fa41..e7338d56 100644 --- a/lib/binding_web/tree-sitter-web.d.ts +++ b/lib/binding_web/tree-sitter-web.d.ts @@ -3,10 +3,13 @@ declare module 'web-tree-sitter' { static init(): Promise; delete(): void; parse(input: string | Parser.Input, previousTree?: Parser.Tree, options?: Parser.Options): Parser.Tree; - getLanguage(): any; - setLanguage(language: any): void; + reset(): void; + getLanguage(): Parser.Language; + setLanguage(language?: Parser.Language | undefined | null): void; getLogger(): Parser.Logger; - setLogger(logFunc: Parser.Logger): void; + setLogger(logFunc?: Parser.Logger | undefined | null): void; + setTimeoutMicros(value: number): void; + getTimeoutMicros(): number; } namespace Parser { @@ -96,8 +99,11 @@ declare module 'web-tree-sitter' { export interface TreeCursor { nodeType: string; + nodeTypeId: number; nodeText: string; 
+ nodeId: number; nodeIsNamed: boolean; + nodeIsMissing: boolean; startPosition: Point; endPosition: Point; startIndex: number; @@ -123,7 +129,7 @@ declare module 'web-tree-sitter' { walk(): TreeCursor; getChangedRanges(other: Tree): Range[]; getEditedRange(other: Tree): Range; - getLanguage(): any; + getLanguage(): Language; } class Language { From 3e9874df7f56070a9dd1a55ae5bac87a9eeea870 Mon Sep 17 00:00:00 2001 From: Johannes Rieken Date: Mon, 30 Aug 2021 14:56:28 +0200 Subject: [PATCH 28/53] Don't confuse terser with object spread --- lib/binding_web/prefix.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/binding_web/prefix.js b/lib/binding_web/prefix.js index de01b78a..382035e1 100644 --- a/lib/binding_web/prefix.js +++ b/lib/binding_web/prefix.js @@ -11,5 +11,5 @@ var TreeSitter = function() { static init(moduleOptions) { if (initPromise) return initPromise; - Module = { ...Module, ...moduleOptions }; + Module = Object.assign({ }, Module, moduleOptions); return initPromise = new Promise((resolveInitPromise) => { From 52e6c900c3fd044e7ec8cf6f25af7a588dfc5776 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Fri, 3 Sep 2021 12:49:42 +0300 Subject: [PATCH 29/53] fix(lib): fix segfault on ts_query_new with incompatible grammar version, close #1318 --- cli/loader/src/lib.rs | 49 ++++++++++++++++++---------------- lib/binding_rust/bindings.rs | 1 + lib/binding_rust/lib.rs | 50 ++++++++++++++++++++++++----------- lib/include/tree_sitter/api.h | 1 + lib/src/query.c | 9 +++++++ 5 files changed, 72 insertions(+), 38 deletions(-) diff --git a/cli/loader/src/lib.rs b/cli/loader/src/lib.rs index 89018677..d1a37df0 100644 --- a/cli/loader/src/lib.rs +++ b/cli/loader/src/lib.rs @@ -12,7 +12,7 @@ use std::process::Command; use std::sync::Mutex; use std::time::SystemTime; use std::{fs, mem}; -use tree_sitter::{Language, QueryError}; +use tree_sitter::{Language, QueryError, QueryErrorKind}; use tree_sitter_highlight::HighlightConfiguration; use tree_sitter_tags::{Error as TagsError, TagsConfiguration}; @@ -667,28 +667,31 @@ impl<'a> LanguageConfiguration<'a> { &injections_query, &locals_query, ) - .map_err(|error| { - if error.offset < injections_query.len() { - Self::include_path_in_query_error( - error, - &injection_ranges, - &injections_query, - 0, - ) - } else if error.offset < injections_query.len() + locals_query.len() { - Self::include_path_in_query_error( - error, - &locals_ranges, - &locals_query, - injections_query.len(), - ) - } else { - Self::include_path_in_query_error( - error, - &highlight_ranges, - &highlights_query, - injections_query.len() + locals_query.len(), - ) + .map_err(|error| match error.kind { + QueryErrorKind::Language => Error::from(error), + _ => { + if error.offset < injections_query.len() { + Self::include_path_in_query_error( + error, + &injection_ranges, + &injections_query, + 0, + ) + } else if error.offset < injections_query.len() + locals_query.len() { + Self::include_path_in_query_error( + error, + &locals_ranges, + &locals_query, + injections_query.len(), + ) + } else { + Self::include_path_in_query_error( + error, + &highlight_ranges, + &highlights_query, + injections_query.len() + locals_query.len(), + ) + } } })?; let mut all_highlight_names = self.highlight_names.lock().unwrap(); diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index 5c032a36..881780e4 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -133,6 +133,7 @@ pub const TSQueryError_TSQueryErrorNodeType: TSQueryError = 2; 
pub const TSQueryError_TSQueryErrorField: TSQueryError = 3; pub const TSQueryError_TSQueryErrorCapture: TSQueryError = 4; pub const TSQueryError_TSQueryErrorStructure: TSQueryError = 5; +pub const TSQueryError_TSQueryErrorLanguage: TSQueryError = 6; pub type TSQueryError = u32; extern "C" { #[doc = " Create a new parser."] diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 08dd7b11..57f678d8 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -202,6 +202,7 @@ pub enum QueryErrorKind { Capture, Predicate, Structure, + Language, } #[derive(Debug)] @@ -1231,6 +1232,19 @@ impl Query { // On failure, build an error based on the error code and offset. if ptr.is_null() { + if error_type == ffi::TSQueryError_TSQueryErrorLanguage { + return Err(QueryError { + row: 0, + column: 0, + offset: 0, + message: LanguageError { + version: language.version(), + } + .to_string(), + kind: QueryErrorKind::Language, + }); + } + let offset = error_offset as usize; let mut line_start = 0; let mut row = 0; @@ -2105,21 +2119,27 @@ impl fmt::Display for LanguageError { impl fmt::Display for QueryError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "Query error at {}:{}. {}{}", - self.row + 1, - self.column + 1, - match self.kind { - QueryErrorKind::Field => "Invalid field name ", - QueryErrorKind::NodeType => "Invalid node type ", - QueryErrorKind::Capture => "Invalid capture name ", - QueryErrorKind::Predicate => "Invalid predicate: ", - QueryErrorKind::Structure => "Impossible pattern:\n", - QueryErrorKind::Syntax => "Invalid syntax:\n", - }, - self.message - ) + let msg = match self.kind { + QueryErrorKind::Field => "Invalid field name ", + QueryErrorKind::NodeType => "Invalid node type ", + QueryErrorKind::Capture => "Invalid capture name ", + QueryErrorKind::Predicate => "Invalid predicate: ", + QueryErrorKind::Structure => "Impossible pattern:\n", + QueryErrorKind::Syntax => "Invalid syntax:\n", + QueryErrorKind::Language => "", + }; + if msg.len() > 0 { + write!( + f, + "Query error at {}:{}. 
{}{}", + self.row + 1, + self.column + 1, + msg, + self.message + ) + } else { + write!(f, "{}", self.message) + } } } diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index f02789ee..e4d49a58 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -131,6 +131,7 @@ typedef enum { TSQueryErrorField, TSQueryErrorCapture, TSQueryErrorStructure, + TSQueryErrorLanguage, } TSQueryError; /********************/ diff --git a/lib/src/query.c b/lib/src/query.c index 2e8e4b79..1e6ba848 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -2069,6 +2069,15 @@ TSQuery *ts_query_new( uint32_t *error_offset, TSQueryError *error_type ) { + if ( + !language || + language->version > TREE_SITTER_LANGUAGE_VERSION || + language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION + ) { + *error_type = TSQueryErrorLanguage; + return NULL; + } + TSQuery *self = ts_malloc(sizeof(TSQuery)); *self = (TSQuery) { .steps = array_new(), From e22b42c9e65d07f592a577beb0b3cd528b2d6f72 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 3 Sep 2021 13:28:58 -0700 Subject: [PATCH 30/53] Bump lib tree-sitter dependency versions in loader crate --- .gitignore | 1 + cli/loader/Cargo.toml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 572c2ac5..d73c0e40 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ log*.html .idea *.xcodeproj +.vscode fuzz-results diff --git a/cli/loader/Cargo.toml b/cli/loader/Cargo.toml index 4d2c8a5f..da27cae4 100644 --- a/cli/loader/Cargo.toml +++ b/cli/loader/Cargo.toml @@ -25,13 +25,13 @@ version = "1.0" features = ["preserve_order"] [dependencies.tree-sitter] -version = ">= 0.19" +version = "0.20" path = "../../lib" [dependencies.tree-sitter-highlight] -version = ">= 0.19" +version = "0.20" path = "../../highlight" [dependencies.tree-sitter-tags] -version = ">= 0.19" +version = "0.20" path = "../../tags" From 1eb4d8efe6627c41cd2094f586ef69043abec1af Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 3 Sep 2021 13:31:26 -0700 Subject: [PATCH 31/53] Bump tree-sitter dep version in tags and highlight crate --- highlight/Cargo.toml | 4 ++-- tags/Cargo.toml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index 78281df7..7acc10b9 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -4,7 +4,7 @@ description = "Library for performing syntax highlighting with Tree-sitter" version = "0.20.0" authors = [ "Max Brunsfeld ", - "Tim Clem " + "Tim Clem ", ] license = "MIT" readme = "README.md" @@ -21,5 +21,5 @@ regex = "1" thiserror = "1.0" [dependencies.tree-sitter] -version = ">= 0.3.7" +version = "0.20" path = "../lib" diff --git a/tags/Cargo.toml b/tags/Cargo.toml index e59c53e8..f458c00b 100644 --- a/tags/Cargo.toml +++ b/tags/Cargo.toml @@ -4,7 +4,7 @@ description = "Library for extracting tag information" version = "0.20.0" authors = [ "Max Brunsfeld ", - "Patrick Thomson " + "Patrick Thomson ", ] license = "MIT" readme = "README.md" @@ -22,5 +22,5 @@ memchr = "2.3" thiserror = "1.0" [dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" From 23b28f6f36ae0b9056fbb23802c3300e50802069 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 3 Sep 2021 13:45:13 -0700 Subject: [PATCH 32/53] Fix 'include!' 
error when building the CLI outside of the repo --- cli/build.rs | 5 +++-- cli/src/web_ui.rs | 38 +++++++++++--------------------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/cli/build.rs b/cli/build.rs index 83be39a5..f62f83aa 100644 --- a/cli/build.rs +++ b/cli/build.rs @@ -6,7 +6,7 @@ fn main() { println!("cargo:rustc-env={}={}", "BUILD_SHA", git_sha); } - if wasm_files_present() { + if web_playground_files_present() { println!("cargo:rustc-cfg={}", "TREE_SITTER_EMBED_WASM_BINDING"); } @@ -23,8 +23,9 @@ fn main() { ); } -fn wasm_files_present() -> bool { +fn web_playground_files_present() -> bool { let paths = [ + "../docs/assets/js/playground.js", "../lib/binding_web/tree-sitter.js", "../lib/binding_web/tree-sitter.wasm", ]; diff --git a/cli/src/web_ui.rs b/cli/src/web_ui.rs index d3b51ade..807f1516 100644 --- a/cli/src/web_ui.rs +++ b/cli/src/web_ui.rs @@ -9,28 +9,6 @@ use tiny_http::{Header, Response, Server}; use webbrowser; macro_rules! resource { - ($name: tt, $path: tt) => { - #[cfg(TREE_SITTER_EMBED_WASM_BINDING)] - fn $name(tree_sitter_dir: &Option) -> Vec { - if let Some(tree_sitter_dir) = tree_sitter_dir { - fs::read(tree_sitter_dir.join($path)).unwrap() - } else { - include_bytes!(concat!("../../", $path)).to_vec() - } - } - - #[cfg(not(TREE_SITTER_EMBED_WASM_BINDING))] - fn $name(tree_sitter_dir: &Option) -> Vec { - if let Some(tree_sitter_dir) = tree_sitter_dir { - fs::read(tree_sitter_dir.join($path)).unwrap() - } else { - include_bytes!(concat!("../../", $path)).to_vec() - } - } - }; -} - -macro_rules! optional_resource { ($name: tt, $path: tt) => { #[cfg(TREE_SITTER_EMBED_WASM_BINDING)] fn $name(tree_sitter_dir: &Option) -> Vec { @@ -54,8 +32,8 @@ macro_rules! optional_resource { resource!(get_main_html, "cli/src/web_ui.html"); resource!(get_playground_js, "docs/assets/js/playground.js"); -optional_resource!(get_lib_js, "lib/binding_web/tree-sitter.js"); -optional_resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); +resource!(get_lib_js, "lib/binding_web/tree-sitter.js"); +resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); pub fn serve(grammar_path: &Path, open_in_browser: bool) { let port = get_available_port().expect("Couldn't find an available port"); @@ -96,17 +74,23 @@ pub fn serve(grammar_path: &Path, open_in_browser: bool) { for request in server.incoming_requests() { let res = match request.url() { "/" => response(&main_html, &html_header), - "/playground.js" => response(&playground_js, &js_header), "/tree-sitter-parser.wasm" => response(&language_wasm, &wasm_header), + "/playground.js" => { + if playground_js.is_empty() { + redirect("https://tree-sitter.github.io/tree-sitter/assets/js/playground.js") + } else { + response(&playground_js, &js_header) + } + } "/tree-sitter.js" => { - if cfg!(windows) { + if lib_js.is_empty() { redirect("https://tree-sitter.github.io/tree-sitter.js") } else { response(&lib_js, &js_header) } } "/tree-sitter.wasm" => { - if cfg!(windows) { + if lib_wasm.is_empty() { redirect("https://tree-sitter.github.io/tree-sitter.wasm") } else { response(&lib_wasm, &wasm_header) From b938486ebe822cc4b7a344e8966eba9fb0a2dc73 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 3 Sep 2021 13:54:47 -0700 Subject: [PATCH 33/53] In CLI, get tree-sitter binding version from CLI's Cargo.toml --- cli/Cargo.toml | 12 ++++++------ cli/build.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 75f7a95b..e559842f 100644 --- a/cli/Cargo.toml +++ 
b/cli/Cargo.toml @@ -41,28 +41,28 @@ webbrowser = "0.5.1" which = "4.1.0" [dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" [dev-dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" features = ["allocation-tracking"] [dependencies.tree-sitter-config] -version = ">= 0.19.0" +version = "0.19.0" path = "config" [dependencies.tree-sitter-highlight] -version = ">= 0.3.0" +version = "0.20" path = "../highlight" [dependencies.tree-sitter-loader] -version = ">= 0.19.0" +version = "0.19.0" path = "loader" [dependencies.tree-sitter-tags] -version = ">= 0.1.0" +version = "0.20" path = "../tags" [dependencies.serde_json] diff --git a/cli/build.rs b/cli/build.rs index f62f83aa..e8a1320b 100644 --- a/cli/build.rs +++ b/cli/build.rs @@ -82,10 +82,10 @@ fn read_git_sha() -> Option { } fn read_rust_binding_version() -> String { - let path = "../lib/Cargo.toml"; + let path = "Cargo.toml"; let text = fs::read_to_string(path).unwrap(); let cargo_toml = toml::from_str::(text.as_ref()).unwrap(); - cargo_toml["package"]["version"] + cargo_toml["dependencies"]["tree-sitter"]["version"] .as_str() .unwrap() .trim_matches('"') From 4d64c2b939d4bb1074b5ae5631cf2616368f78d8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 3 Sep 2021 13:57:45 -0700 Subject: [PATCH 34/53] Put emscripten-version file in cli directory This lets the CLI crate build without relying on sibling directories. --- .github/workflows/ci.yml | 2 +- cli/build.rs | 2 +- emscripten-version => cli/emscripten-version | 0 script/build-wasm | 2 +- script/fetch-emscripten | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename emscripten-version => cli/emscripten-version (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5435945..bfcd9f8c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: - name: Read Emscripten version run: | - printf 'EMSCRIPTEN_VERSION=%s\n' "$(cat emscripten-version)" >> $GITHUB_ENV + printf 'EMSCRIPTEN_VERSION=%s\n' "$(cat cli/emscripten-version)" >> $GITHUB_ENV - name: Cache artifacts id: cache diff --git a/cli/build.rs b/cli/build.rs index e8a1320b..1986e023 100644 --- a/cli/build.rs +++ b/cli/build.rs @@ -16,7 +16,7 @@ fn main() { "RUST_BINDING_VERSION", rust_binding_version, ); - let emscripten_version = fs::read_to_string("../emscripten-version").unwrap(); + let emscripten_version = fs::read_to_string("emscripten-version").unwrap(); println!( "cargo:rustc-env={}={}", "EMSCRIPTEN_VERSION", emscripten_version, diff --git a/emscripten-version b/cli/emscripten-version similarity index 100% rename from emscripten-version rename to cli/emscripten-version diff --git a/script/build-wasm b/script/build-wasm index 201d0b50..19c7aa13 100755 --- a/script/build-wasm +++ b/script/build-wasm @@ -33,7 +33,7 @@ web_dir=lib/binding_web emscripten_flags="-O3" minify_js=1 force_docker=0 -emscripen_version=$(cat "$(dirname "$0")"/../emscripten-version) +emscripen_version=$(cat "$(dirname "$0")"/../cli/emscripten-version) while [[ $# > 0 ]]; do case "$1" in diff --git a/script/fetch-emscripten b/script/fetch-emscripten index 157d0cae..4b579df0 100755 --- a/script/fetch-emscripten +++ b/script/fetch-emscripten @@ -2,7 +2,7 @@ set -e -EMSCRIPTEN_VERSION=$(cat "$(dirname "$0")/../emscripten-version") +EMSCRIPTEN_VERSION=$(cat "$(dirname "$0")/../cli/emscripten-version") mkdir -p target EMSDK_DIR="./target/emsdk" From b239583510aaafdc2c37e97521bd5ae4a8175aff Mon Sep 17 00:00:00 2001 From: Alon Hershenhorn 
Date: Fri, 10 Sep 2021 12:28:23 -0700 Subject: [PATCH 35/53] Fix typo in API documentation Fix small typo in ts_tree_cursor_current_field_id documentation. --- lib/include/tree_sitter/api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index e4d49a58..ede1bc99 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -619,7 +619,7 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *); const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); /** - * Get the field name of the tree cursor's current node. + * Get the field id of the tree cursor's current node. * * This returns zero if the current node doesn't have a field. * See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`. From 5c2ebf093b161b896b232bd8b22b16deb7872207 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 10 Sep 2021 15:21:55 -0700 Subject: [PATCH 36/53] Include memmove symbol in wasm build --- lib/binding_web/exports.json | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/binding_web/exports.json b/lib/binding_web/exports.json index 0313f799..e0b3f718 100644 --- a/lib/binding_web/exports.json +++ b/lib/binding_web/exports.json @@ -23,6 +23,7 @@ "_memchr", "_memcmp", "_memcpy", + "_memmove", "_strlen", "_towupper", From 62d79b80e1255323a64d024223a67536e7e0adea Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Wed, 8 Sep 2021 00:08:13 +0300 Subject: [PATCH 37/53] feat(cli): add a flag to compile a parser in debug mode with -O0 C/C++ compiler flag --- cli/loader/src/lib.rs | 33 +++++++++++++++++++++++++-------- cli/src/main.rs | 14 ++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/cli/loader/src/lib.rs b/cli/loader/src/lib.rs index d1a37df0..0002bf08 100644 --- a/cli/loader/src/lib.rs +++ b/cli/loader/src/lib.rs @@ -101,6 +101,7 @@ pub struct Loader { language_configuration_ids_by_file_type: HashMap>, highlight_names: Box>>, use_all_highlight_names: bool, + debug_build: bool, } unsafe impl Send for Loader {} @@ -122,6 +123,7 @@ impl Loader { language_configuration_ids_by_file_type: HashMap::new(), highlight_names: Box::new(Mutex::new(Vec::new())), use_all_highlight_names: true, + debug_build: false, } } @@ -347,7 +349,11 @@ impl Loader { parser_path: &Path, scanner_path: &Option, ) -> Result { - let mut library_path = self.parser_lib_path.join(name); + let mut lib_name = name.to_string(); + if self.debug_build { + lib_name.push_str(".debug._"); + } + let mut library_path = self.parser_lib_path.join(lib_name); library_path.set_extension(DYLIB_EXTENSION); let recompile = needs_recompile(&library_path, &parser_path, &scanner_path) @@ -369,11 +375,13 @@ impl Loader { } if cfg!(windows) { - command - .args(&["/nologo", "/LD", "/I"]) - .arg(header_path) - .arg("/Od") - .arg(parser_path); + command.args(&["/nologo", "/LD", "/I"]).arg(header_path); + if self.debug_build { + command.arg("/Od"); + } else { + command.arg("/O2"); + } + command.arg(parser_path); if let Some(scanner_path) = scanner_path.as_ref() { command.arg(scanner_path); } @@ -389,8 +397,13 @@ impl Loader { .arg("-I") .arg(header_path) .arg("-o") - .arg(&library_path) - .arg("-O2"); + .arg(&library_path); + + if self.debug_build { + command.arg("-O0"); + } else { + command.arg("-O2"); + } // For conditional compilation of external scanner code when // used internally by `tree-siteer parse` and other sub commands. 
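A small usage sketch may help orient readers of PATCH 37/53: the new `Loader::use_debug_build` setter (added in the next hunk) is the only public switch, and the real CLI wiring is in the `cli/src/main.rs` hunks further below. The helper below is an assumption for illustration, not part of the patch:

```rust
use std::path::Path;

use tree_sitter::Language;
use tree_sitter_loader::Loader;

// Sketch only (assumed helper): mirrors the wiring this patch adds to
// cli/src/main.rs, where the new `--debug-build` flag flips the same switch
// before any grammar is compiled.
fn load_grammars_for_debugging(
    loader: &mut Loader,
    grammar_dir: &Path,
) -> anyhow::Result<Vec<Language>> {
    // With the flag set, (re)compilation passes -O0 (or /Od with MSVC) and
    // writes the library under a distinct "<name>.debug.*" file name, so the
    // optimized artifact in the loader's cache is left untouched.
    loader.use_debug_build(true);
    Ok(loader.languages_at_path(grammar_dir)?)
}
```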
@@ -644,6 +657,10 @@ impl Loader { Err(anyhow!("No language found")) } } + + pub fn use_debug_build(&mut self, flag: bool) { + self.debug_build = flag; + } } impl<'a> LanguageConfiguration<'a> { diff --git a/cli/src/main.rs b/cli/src/main.rs index f64e4973..6687373d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -45,6 +45,11 @@ fn run() -> Result<()> { .long("debug-graph") .short("D"); + let debug_build_arg = Arg::with_name("debug-build") + .help("Compile a parser in debug mode") + .long("debug-build") + .short("0"); + let paths_file_arg = Arg::with_name("paths-file") .help("The path to a file with paths to source file(s)") .long("paths") @@ -103,6 +108,7 @@ fn run() -> Result<()> { .arg(&paths_arg) .arg(&scope_arg) .arg(&debug_arg) + .arg(&debug_build_arg) .arg(&debug_graph_arg) .arg(Arg::with_name("debug-xml").long("xml").short("x")) .arg( @@ -178,6 +184,7 @@ fn run() -> Result<()> { .help("Update all syntax trees in corpus files with current parser output"), ) .arg(&debug_arg) + .arg(&debug_build_arg) .arg(&debug_graph_arg), ) .subcommand( @@ -273,8 +280,12 @@ fn run() -> Result<()> { ("test", Some(matches)) => { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); + let debug_build = matches.is_present("debug-build"); let update = matches.is_present("update"); let filter = matches.value_of("filter"); + + loader.use_debug_build(debug_build); + let languages = loader.languages_at_path(¤t_dir)?; let language = languages .first() @@ -310,6 +321,7 @@ fn run() -> Result<()> { ("parse", Some(matches)) => { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); + let debug_build = matches.is_present("debug-build"); let debug_xml = matches.is_present("debug-xml"); let quiet = matches.is_present("quiet"); let time = matches.is_present("time"); @@ -323,6 +335,8 @@ fn run() -> Result<()> { env::set_var("TREE_SITTER_DEBUG", "1"); } + loader.use_debug_build(debug_build); + let timeout = matches .value_of("timeout") .map_or(0, |t| u64::from_str_radix(t, 10).unwrap()); From 22a5cfbe102056ff3d51b2b44a8666fd8f85fdc0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 13 Sep 2021 12:39:48 -0700 Subject: [PATCH 38/53] Assign ids to query matches only when the matches are returned Refs #1372 --- lib/src/query.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/src/query.c b/lib/src/query.c index 1e6ba848..86a9dfea 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -2561,6 +2561,7 @@ static void ts_query_cursor__add_state( pattern->step_index ); array_insert(&self->states, index, ((QueryState) { + .id = UINT32_MAX, .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, @@ -2725,7 +2726,6 @@ static inline bool ts_query_cursor__advance( if (step->depth == PATTERN_DONE_MARKER) { if (state->start_depth > self->depth || self->halted) { LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; array_push(&self->finished_states, *state); did_match = true; deleted_count++; @@ -3114,7 +3114,6 @@ static inline bool ts_query_cursor__advance( LOG(" defer finishing pattern %u\n", state->pattern_index); } else { LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; array_push(&self->finished_states, *state); array_erase(&self->states, state - self->states.contents); did_match = true; @@ -3169,6 +3168,7 @@ bool ts_query_cursor_next_match( } QueryState *state = &self->finished_states.contents[0]; + 
if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( @@ -3278,6 +3278,7 @@ bool ts_query_cursor_next_capture( } if (state) { + if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( From ae12ff81b5534869e414ecf4294059b97eccd9fc Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Tue, 14 Sep 2021 19:00:50 +0300 Subject: [PATCH 39/53] feat(rust): Add an id() method for QueryMatch Refs #1372 --- lib/binding_rust/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 57f678d8..04dd78bf 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -1753,6 +1753,10 @@ impl QueryCursor { } impl<'a, 'tree> QueryMatch<'a, 'tree> { + pub fn id(&self) -> u32 { + self.id + } + pub fn remove(self) { unsafe { ffi::ts_query_cursor_remove_match(self.cursor, self.id) } } From 7e5781637c392c06b0553eafe66a5801f8e71137 Mon Sep 17 00:00:00 2001 From: Jiyee Sheng Date: Thu, 16 Sep 2021 12:30:40 +0800 Subject: [PATCH 40/53] Add Objective-C language parser --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index 72b16c96..97c131e2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -76,6 +76,7 @@ Parsers for these languages are in development: * [Scala](https://github.com/tree-sitter/tree-sitter-scala) * [Sourcepawn](https://github.com/nilshelmig/tree-sitter-sourcepawn) * [Swift](https://github.com/tree-sitter/tree-sitter-swift) +* [Objective-C](https://github.com/jiyee/tree-sitter-objc) ### Talks on Tree-sitter From b675587132ad730dac8c51d159421807f18082dd Mon Sep 17 00:00:00 2001 From: Martin Jambon Date: Thu, 16 Sep 2021 13:19:05 -0700 Subject: [PATCH 41/53] Add link to OCaml bindings to list and sort list alphabetically. --- docs/index.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/index.md b/docs/index.md index 72b16c96..95505121 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,12 +15,13 @@ Tree-sitter is a parser generator tool and an incremental parsing library. 
It ca There are currently bindings that allow Tree-sitter to be used from the following languages: -* [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) -* [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) +* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) * [JavaScript (Node.js)](https://github.com/tree-sitter/node-tree-sitter) +* [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) +* [OCaml](https://github.com/returntocorp/ocaml-tree-sitter-core) * [Python](https://github.com/tree-sitter/py-tree-sitter) * [Ruby](https://github.com/tree-sitter/ruby-tree-sitter) -* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) +* [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) ### Available Parsers From 8ec38a9b6434e3fa3bb6e78a75803c8facf79cbc Mon Sep 17 00:00:00 2001 From: Martin Jambon Date: Thu, 16 Sep 2021 13:34:59 -0700 Subject: [PATCH 42/53] Add HCL (Terraform) and Hack to the list of supported languages --- docs/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/index.md b/docs/index.md index 95505121..2f782220 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,6 +38,7 @@ Parsers for these languages are fairly complete: * [ERB / EJS](https://github.com/tree-sitter/tree-sitter-embedded-template) * [Fennel](https://github.com/travonted/tree-sitter-fennel) * [Go](https://github.com/tree-sitter/tree-sitter-go) +* [HCL](https://github.com/MichaHoffmann/tree-sitter-hcl) * [HTML](https://github.com/tree-sitter/tree-sitter-html) * [Java](https://github.com/tree-sitter/tree-sitter-java) * [JavaScript](https://github.com/tree-sitter/tree-sitter-javascript) @@ -69,6 +70,7 @@ Parsers for these languages are in development: * [Erlang](https://github.com/AbstractMachinesLab/tree-sitter-erlang/) * [Dockerfile](https://github.com/camdencheek/tree-sitter-dockerfile) * [Go mod](https://github.com/camdencheek/tree-sitter-go-mod) +* [Hack](https://github.com/slackhq/tree-sitter-hack) * [Haskell](https://github.com/tree-sitter/tree-sitter-haskell) * [Julia](https://github.com/tree-sitter/tree-sitter-julia) * [Kotlin](https://github.com/fwcd/tree-sitter-kotlin) From eca777f501a16ba3e2d16d723cdc830d0255261e Mon Sep 17 00:00:00 2001 From: Mehmet Oguz Derin Date: Fri, 17 Sep 2021 18:54:14 +0300 Subject: [PATCH 43/53] Add WGSL WebGPU Shading Language This PR adds WebGPU Shading Language to the list of available grammars, the linked repository's grammar is periodically automatically extracted from the WGSL specification itself, which is actually extracted every single time where the specification gets a modification in itself to check the validity of both the syntax and the examples. Thank you very much for developing and maintaining tree-sitter, fantastic project! 
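Returning briefly to PATCH 38/53 and PATCH 39/53 above: query match ids are now assigned only when a match is actually yielded, and the Rust binding exposes them through the new `QueryMatch::id()` accessor (the same value that `QueryMatch::remove()` already passes to `ts_query_cursor_remove_match`). A hedged usage sketch follows; `query`, `tree`, and `source` are assumed to come from the usual `Parser` / `Query::new` setup:

```rust
use tree_sitter::{Query, QueryCursor, Tree};

// Sketch only: print the lazily assigned id of every match that the cursor
// actually returns, e.g. for logging or for a later `remove()` call.
fn print_match_ids(query: &Query, tree: &Tree, source: &[u8]) {
    let mut cursor = QueryCursor::new();
    for m in cursor.matches(query, tree.root_node(), source) {
        println!(
            "match {} (pattern {}) has {} captures",
            m.id(),
            m.pattern_index,
            m.captures.len()
        );
    }
}
```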
--- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index 1b6287ba..50a37dde 100644 --- a/docs/index.md +++ b/docs/index.md @@ -63,6 +63,7 @@ Parsers for these languages are fairly complete: * [Vue](https://github.com/ikatyang/tree-sitter-vue) * [YAML](https://github.com/ikatyang/tree-sitter-yaml) * [WASM](https://github.com/wasm-lsp/tree-sitter-wasm) +* [WGSL WebGPU Shading Language](https://github.com/mehmetoguzderin/tree-sitter-wgsl) Parsers for these languages are in development: From b324d0802ad977adc180ec10315be7675637d5cd Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Tue, 21 Sep 2021 19:23:24 +0300 Subject: [PATCH 44/53] fix(cli): Panic on queries containing alternation with predicates `QuearyMatch::satisfies_text_predicates()` was changed to pass captures that don't relate to a checked predicate. This allows predicates in inner alternations for queries. Refs #1392 --- lib/binding_rust/lib.rs | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 04dd78bf..f5a85311 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -1821,21 +1821,36 @@ impl<'a, 'tree> QueryMatch<'a, 'tree> { .iter() .all(|predicate| match predicate { TextPredicate::CaptureEqCapture(i, j, is_positive) => { - let node1 = self.nodes_for_capture_index(*i).next().unwrap(); - let node2 = self.nodes_for_capture_index(*j).next().unwrap(); - let text1 = get_text(buffer1, text_provider.text(node1)); - let text2 = get_text(buffer2, text_provider.text(node2)); - (text1 == text2) == *is_positive + let node1 = self.nodes_for_capture_index(*i).next(); + let node2 = self.nodes_for_capture_index(*j).next(); + match (node1, node2) { + (Some(node1), Some(node2)) => { + let text1 = get_text(buffer1, text_provider.text(node1)); + let text2 = get_text(buffer2, text_provider.text(node2)); + (text1 == text2) == *is_positive + } + _ => true, + } } TextPredicate::CaptureEqString(i, s, is_positive) => { - let node = self.nodes_for_capture_index(*i).next().unwrap(); - let text = get_text(buffer1, text_provider.text(node)); - (text == s.as_bytes()) == *is_positive + let node = self.nodes_for_capture_index(*i).next(); + match node { + Some(node) => { + let text = get_text(buffer1, text_provider.text(node)); + (text == s.as_bytes()) == *is_positive + } + None => true, + } } TextPredicate::CaptureMatchString(i, r, is_positive) => { - let node = self.nodes_for_capture_index(*i).next().unwrap(); - let text = get_text(buffer1, text_provider.text(node)); - r.is_match(text) == *is_positive + let node = self.nodes_for_capture_index(*i).next(); + match node { + Some(node) => { + let text = get_text(buffer1, text_provider.text(node)); + r.is_match(text) == *is_positive + } + None => true, + } } }) } From 79b2bf1c30c65e8450f3712fcaffa39c6c3301b1 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Tue, 21 Sep 2021 21:13:25 +0300 Subject: [PATCH 45/53] fix(wasm): Fix predicates in alternations, resolves #1392 --- lib/binding_web/binding.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index 6296ed35..5352cb18 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -785,6 +785,7 @@ class Language { if (c.name === captureName1) node1 = c.node; if (c.name === captureName2) node2 = c.node; } + if(node1 === undefined || node2 === undefined) return true; return (node1.text === node2.text) 
=== isPositive; }); } else { @@ -796,7 +797,7 @@ class Language { return (c.node.text === stringValue) === isPositive; }; } - return false; + return true; }); } break; @@ -819,7 +820,7 @@ class Language { for (const c of captures) { if (c.name === captureName) return regex.test(c.node.text) === isPositive; } - return false; + return true; }); break; From d973527964446b9b8187974a73d5cd418d09a880 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Mon, 20 Sep 2021 17:58:43 +0300 Subject: [PATCH 46/53] binding(rust): Mark set_cancellation_flag self as mutable --- lib/binding_rust/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 04dd78bf..068a6028 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -630,7 +630,7 @@ impl Parser { /// If a pointer is assigned, then the parser will periodically read from /// this pointer during parsing. If it reads a non-zero value, it will halt early, /// returning `None`. See [parse](Parser::parse) for more information. - pub unsafe fn set_cancellation_flag(&self, flag: Option<&AtomicUsize>) { + pub unsafe fn set_cancellation_flag(&mut self, flag: Option<&AtomicUsize>) { if let Some(flag) = flag { ffi::ts_parser_set_cancellation_flag( self.0.as_ptr(), From e2de738026c481d8e7fa3981dd8e8174e44b929f Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Thu, 19 Aug 2021 08:21:16 +0300 Subject: [PATCH 47/53] cli(query): Improve and unify query subcommand output --- cli/src/query.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cli/src/query.rs b/cli/src/query.rs index 9039f751..73d6dd28 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -48,10 +48,12 @@ pub fn query_files_at_paths( let capture_name = &query.capture_names()[capture.index as usize]; writeln!( &mut stdout, - " pattern: {}, capture: {}, row: {}, text: {:?}", + " pattern: {:>2}, capture: {} - {}, start: {}, end: {}, text: `{}`", mat.pattern_index, + capture.index, capture_name, - capture.node.start_position().row, + capture.node.start_position(), + capture.node.end_position(), capture.node.utf8_text(&source_code).unwrap_or("") )?; results.push(query_testing::CaptureInfo { @@ -70,9 +72,11 @@ pub fn query_files_at_paths( if end.row == start.row { writeln!( &mut stdout, - " capture: {}, start: {}, text: {:?}", + " capture: {} - {}, start: {}, end: {}, text: `{}`", + capture.index, capture_name, start, + end, capture.node.utf8_text(&source_code).unwrap_or("") )?; } else { From 28a121b532d855276155ac164531a771379b36b2 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Wed, 22 Sep 2021 00:19:03 +0300 Subject: [PATCH 48/53] chore(docs): Fix misprints --- cli/README.md | 2 +- docs/section-2-using-parsers.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/README.md b/cli/README.md index b6f526e9..fe45b17b 100644 --- a/cli/README.md +++ b/cli/README.md @@ -36,4 +36,4 @@ The `tree-sitter` binary itself has no dependencies, but specific commands have * `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. See [the documentation](http://tree-sitter.github.io/tree-sitter/creating-parsers) for more information. -* `parse` - The `tree-sitter parse` command will parse a file (or list of file) using Tree-sitter parsers. +* `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers. 
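As a companion to PATCH 44/53 and PATCH 45/53 above, here is a hedged Rust sketch of the query shape that used to trigger the panic in the Rust binding (and dropped matches in the wasm binding): a single pattern whose alternation carries a predicate on only one branch. The grammar and the node names (`identifier`, `string`) are assumptions; only the shape of the pattern matters:

```rust
use tree_sitter::{Language, Parser, Query, QueryCursor};

// Sketch only. When the `(string)` branch matches, the `@id` capture is
// absent; the Rust binding previously unwrapped the missing node and
// panicked, and the wasm binding rejected the match. With these patches an
// absent capture makes its predicate count as satisfied, so the match is
// still returned.
fn count_main_or_strings(language: Language, source: &str) -> anyhow::Result<usize> {
    let query = Query::new(
        language,
        r#"[
            ((identifier) @id (#eq? @id "main"))
            (string) @str
        ]"#,
    )?;
    let mut parser = Parser::new();
    parser.set_language(language)?;
    let tree = parser
        .parse(source, None)
        .ok_or_else(|| anyhow::anyhow!("parsing returned no tree"))?;
    let mut cursor = QueryCursor::new();
    Ok(cursor
        .matches(&query, tree.root_node(), source.as_bytes())
        .count())
}
```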
diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 86b5d750..d3734018 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -464,7 +464,7 @@ In general, it's a good idea to make patterns more specific by specifying [field #### Negated Fields -You can also constrain a pattern so that it only mathces nodes that *lack* a certain field. To do this, add a field name prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters: +You can also constrain a pattern so that it only matches nodes that *lack* a certain field. To do this, add a field name prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters: ``` (class_declaration From 2abd6bc318cc8d8d49963b3d27ca226ea864a895 Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Wed, 22 Sep 2021 01:35:43 +0300 Subject: [PATCH 49/53] fix(cli): Fix playground opening in a browser, regression in #1304 --- cli/src/web_ui.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cli/src/web_ui.rs b/cli/src/web_ui.rs index 807f1516..3fcf56f1 100644 --- a/cli/src/web_ui.rs +++ b/cli/src/web_ui.rs @@ -37,8 +37,8 @@ resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); pub fn serve(grammar_path: &Path, open_in_browser: bool) { let port = get_available_port().expect("Couldn't find an available port"); - let url = format!("127.0.0.1:{}", port); - let server = Server::http(&url).expect("Failed to start web server"); + let addr = format!("127.0.0.1:{}", port); + let server = Server::http(&addr).expect("Failed to start web server"); let grammar_name = wasm::get_grammar_name(&grammar_path.join("src")) .with_context(|| "Failed to get wasm filename") .unwrap(); @@ -51,7 +51,8 @@ pub fn serve(grammar_path: &Path, open_in_browser: bool) { ) }) .unwrap(); - println!("Started playground on '{}'", url); + let url = format!("http://{}", addr); + println!("Started playground on: {}", url); if open_in_browser { if let Err(_) = webbrowser::open(&url) { eprintln!("Failed to open '{}' in a web browser", url); From 0801cd38708c2dcbbd2956360f364729c15b5ac6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Jul 2021 21:55:42 +0000 Subject: [PATCH 50/53] chore(deps): bump addressable from 2.5.2 to 2.8.0 in /docs Bumps [addressable](https://github.com/sporkmonger/addressable) from 2.5.2 to 2.8.0. - [Release notes](https://github.com/sporkmonger/addressable/releases) - [Changelog](https://github.com/sporkmonger/addressable/blob/main/CHANGELOG.md) - [Commits](https://github.com/sporkmonger/addressable/compare/addressable-2.5.2...addressable-2.8.0) --- updated-dependencies: - dependency-name: addressable dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 53 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 6a18f64c..d22264e7 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -6,8 +6,8 @@ GEM minitest (~> 5.1) thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.5.2) - public_suffix (>= 2.0.2, < 4.0) + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs @@ -16,12 +16,27 @@ GEM commonmarker (0.17.8) ruby-enum (~> 0.5) concurrent-ruby (1.0.5) - ethon (0.11.0) - ffi (>= 1.3.0) + ethon (0.14.0) + ffi (>= 1.15.0) execjs (2.7.0) - faraday (0.14.0) + faraday (1.5.1) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0.1) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.1) + faraday-patron (~> 1.0) multipart-post (>= 1.2, < 3) - ffi (1.9.23) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + ffi (1.15.3) forwardable-extended (2.6.0) gemoji (3.0.0) github-pages (177) @@ -195,33 +210,35 @@ GEM minima (2.1.1) jekyll (~> 3.3) minitest (5.11.3) - multipart-post (2.0.0) - net-dns (0.8.0) + multipart-post (2.1.1) + net-dns (0.9.0) nokogiri (1.11.4) mini_portile2 (~> 2.5.0) racc (~> 1.4) - octokit (4.8.0) + octokit (4.21.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.1) + pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (2.0.5) racc (1.5.2) - rb-fsevent (0.10.2) - rb-inotify (0.9.10) - ffi (>= 0.5.0, < 2) + rb-fsevent (0.11.0) + rb-inotify (0.10.1) + ffi (~> 1.0) rouge (2.2.1) ruby-enum (0.7.2) i18n + ruby2_keywords (0.0.4) rubyzip (2.0.0) - safe_yaml (1.0.4) - sass (3.5.5) + safe_yaml (1.0.5) + sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.1) - addressable (>= 2.3.5, < 2.6) - faraday (~> 0.8, < 1.0) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) From 16470bc0b1a24037cb8a393bc43731a8c39c14cd Mon Sep 17 00:00:00 2001 From: Andrew Hlynskyi Date: Wed, 22 Sep 2021 01:43:32 +0300 Subject: [PATCH 51/53] chore(cli): Rename all internal web_ui stuff to playground --- cli/src/lib.rs | 2 +- cli/src/main.rs | 4 ++-- cli/src/{web_ui.html => playground.html} | 0 cli/src/{web_ui.rs => playground.rs} | 2 +- docs/section-6-contributing.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename cli/src/{web_ui.html => playground.html} (100%) rename cli/src/{web_ui.rs => playground.rs} (98%) diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 734b3e6a..7de4afc5 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -2,6 +2,7 @@ pub mod generate; pub mod highlight; pub mod logger; pub mod parse; +pub mod playground; pub mod query; pub mod query_testing; pub mod tags; @@ -9,7 +10,6 @@ pub mod test; pub mod test_highlight; pub mod util; pub mod wasm; -pub mod web_ui; #[cfg(test)] mod tests; diff --git a/cli/src/main.rs b/cli/src/main.rs index 6687373d..2c18f03f 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -4,7 +4,7 @@ use glob::glob; use std::path::Path; use std::{env, fs, u64}; use tree_sitter_cli::{ - generate, highlight, logger, parse, query, tags, test, test_highlight, 
util, wasm, web_ui, + generate, highlight, logger, parse, playground, query, tags, test, test_highlight, util, wasm, }; use tree_sitter_config::Config; use tree_sitter_loader as loader; @@ -504,7 +504,7 @@ fn run() -> Result<()> { ("playground", Some(matches)) => { let open_in_browser = !matches.is_present("quiet"); - web_ui::serve(¤t_dir, open_in_browser); + playground::serve(¤t_dir, open_in_browser); } ("dump-languages", Some(_)) => { diff --git a/cli/src/web_ui.html b/cli/src/playground.html similarity index 100% rename from cli/src/web_ui.html rename to cli/src/playground.html diff --git a/cli/src/web_ui.rs b/cli/src/playground.rs similarity index 98% rename from cli/src/web_ui.rs rename to cli/src/playground.rs index 3fcf56f1..f674ce11 100644 --- a/cli/src/web_ui.rs +++ b/cli/src/playground.rs @@ -30,7 +30,7 @@ macro_rules! resource { }; } -resource!(get_main_html, "cli/src/web_ui.html"); +resource!(get_main_html, "cli/src/playground.html"); resource!(get_playground_js, "docs/assets/js/playground.js"); resource!(get_lib_js, "lib/binding_web/tree-sitter.js"); resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); diff --git a/docs/section-6-contributing.md b/docs/section-6-contributing.md index 685fe5e7..36f5f499 100644 --- a/docs/section-6-contributing.md +++ b/docs/section-6-contributing.md @@ -29,7 +29,7 @@ git clone https://github.com/tree-sitter/tree-sitter cd tree-sitter ``` -Optionally, build the WASM library. If you skip this step, then the `tree-sitter web-ui` command will require an internet connection. If you have emscripten installed, this will use your `emcc` compiler. Otherwise, it will use Docker: +Optionally, build the WASM library. If you skip this step, then the `tree-sitter playground` command will require an internet connection. If you have emscripten installed, this will use your `emcc` compiler. Otherwise, it will use Docker: ```sh ./script/build-wasm From 0e3a2780cdeee1fdc8d2935e8d40a21dc2ad4958 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 21 Sep 2021 21:35:14 -0700 Subject: [PATCH 52/53] Avoid dynamic regex construction when parsing test files --- cli/src/test.rs | 117 ++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 64 deletions(-) diff --git a/cli/src/test.rs b/cli/src/test.rs index ef1080ef..4374f527 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -15,17 +15,12 @@ use tree_sitter::{Language, LogType, Parser, Query}; use walkdir::WalkDir; lazy_static! { - static ref FIRST_HEADER_REGEX: ByteRegex = - ByteRegexBuilder::new(r"^===+(?P[^=\r\n]*)\r?\n") - .multi_line(true) - .build() - .unwrap(); static ref HEADER_REGEX: ByteRegex = - ByteRegexBuilder::new(r"^===+\r?\n(?P[^=\r\n]*)\r?\n===+\r?\n") + ByteRegexBuilder::new(r"^===+(?P[^=\r\n][^\r\n]*)?\r?\n(?P[^=\r\n][^\r\n]*)\r?\n===+(?P[^=\r\n][^\r\n]*)?\r?\n") .multi_line(true) .build() .unwrap(); - static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n") + static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+(?P[^-\r\n][^\r\n]*)?\r?\n") .multi_line(true) .build() .unwrap(); @@ -385,62 +380,58 @@ fn parse_test_content(name: String, content: String, file_path: Option) let mut prev_name = String::new(); let mut prev_header_end = 0; - let suffix = FIRST_HEADER_REGEX + // Find the first test header in the file, and determine if it has a + // custom suffix. If so, then this suffix will be used to identify + // all subsequent headers and divider lines in the file. 
+ let first_suffix = HEADER_REGEX .captures(bytes) - .and_then(|c| c.name("suffix")) - .map(|m| &bytes[m.range()]) - .map(|b| String::from_utf8_lossy(b).to_string()) - .map(|s| regex::escape(&s[..])); + .and_then(|c| c.name("suffix1")) + .map(|m| String::from_utf8_lossy(m.as_bytes())); - let suffix_header_pattern: Option = suffix.as_ref().map(|s| { - String::from(r"^===+") + s + r"\r?\n(?P[^\r\n]*)\r?\n===+" + s + r"\r?\n" + // Find all of the `===` test headers, which contain the test names. + // Ignore any matches whose suffix does not match the first header + // suffix in the file. + let header_matches = HEADER_REGEX.captures_iter(&bytes).filter_map(|c| { + let suffix1 = c + .name("suffix1") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + let suffix2 = c + .name("suffix2") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + if suffix1 == first_suffix && suffix2 == first_suffix { + let header_range = c.get(0).unwrap().range(); + let test_name = c + .name("test_name") + .map(|c| String::from_utf8_lossy(c.as_bytes()).to_string()); + Some((header_range, test_name)) + } else { + None + } }); - let header_regex_from_suffix_header_pattern = suffix_header_pattern - .as_ref() - .and_then(|s| ByteRegexBuilder::new(&s[..]).multi_line(true).build().ok()); - - let header_regex = header_regex_from_suffix_header_pattern - .as_ref() - .unwrap_or(&HEADER_REGEX); - - let suffix_divider_pattern: Option = suffix - .as_ref() - .map(|s| String::from(r"^---+") + s + r"\r?\n"); - - let divider_regex_from_suffix_divider_pattern = suffix_divider_pattern - .as_ref() - .and_then(|s| ByteRegexBuilder::new(&s[..]).multi_line(true).build().ok()); - - let divider_regex = divider_regex_from_suffix_divider_pattern - .as_ref() - .unwrap_or(&DIVIDER_REGEX); - - // Identify all of the test descriptions using the `======` headers. - // Must be followed by custom suffix if defined on first header. - // Capture index 0 corresponds to entire match and is guaranteed to exist. - for (header_start, header_end, test_name_capture) in header_regex - .captures_iter(&bytes) - .map(|c| { - ( - c.get(0).unwrap().start(), - c.get(0).unwrap().end(), - c.name("test_name"), - ) - }) - .chain(Some((bytes.len(), bytes.len(), None))) - { - // Find the longest line of dashes following each test description. - // That is the divider between input and expected output. - // Must be followed by custom suffix if defined on first header. + for (header_range, test_name) in header_matches.chain(Some((bytes.len()..bytes.len(), None))) { + // Find the longest line of dashes following each test description. That line + // separates the input from the expected output. Ignore any matches whose suffix + // does not match the first suffix in the file. 
if prev_header_end > 0 { - let divider_match = divider_regex - .find_iter(&bytes[prev_header_end..header_start]) - .map(|m| (prev_header_end + m.start(), prev_header_end + m.end())) - .max_by_key(|(start, end)| end - start); - if let Some((divider_start, divider_end)) = divider_match { - if let Ok(output) = str::from_utf8(&bytes[divider_end..header_start]) { - let mut input = bytes[prev_header_end..divider_start].to_vec(); + let divider_range = DIVIDER_REGEX + .captures_iter(&bytes[prev_header_end..header_range.start]) + .filter_map(|m| { + let suffix = m + .name("suffix") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + if suffix == first_suffix { + let range = m.get(0).unwrap().range(); + Some((prev_header_end + range.start)..(prev_header_end + range.end)) + } else { + None + } + }) + .max_by_key(|range| range.len()); + + if let Some(divider_range) = divider_range { + if let Ok(output) = str::from_utf8(&bytes[divider_range.end..header_range.start]) { + let mut input = bytes[prev_header_end..divider_range.start].to_vec(); // Remove trailing newline from the input. input.pop(); @@ -450,6 +441,7 @@ fn parse_test_content(name: String, content: String, file_path: Option) // Remove all comments let output = COMMENT_REGEX.replace_all(output, "").to_string(); + // Normalize the whitespace in the expected output. let output = WHITESPACE_REGEX.replace_all(output.trim(), " "); let output = output.replace(" )", ")"); @@ -467,11 +459,8 @@ fn parse_test_content(name: String, content: String, file_path: Option) } } } - prev_name = test_name_capture - .map(|m| &bytes[m.range()]) - .map(|b| String::from_utf8_lossy(b).to_string()) - .unwrap_or(String::new()); - prev_header_end = header_end; + prev_name = test_name.unwrap_or(String::new()); + prev_header_end = header_range.end; } TestEntry::Group { name, @@ -485,7 +474,7 @@ mod tests { use super::*; #[test] - fn test_parse_test_content() { + fn test_parse_test_content_simple() { let entry = parse_test_content( "the-filename".to_string(), r#" From 2bee7c9b75e3e0163b321502f1f73e2e38943a7e Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 22 Sep 2021 13:53:42 -0700 Subject: [PATCH 53/53] Update get_column docs Fixes #1405 --- docs/section-3-creating-parsers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 777f7fcb..f5f7c933 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -674,7 +674,7 @@ This function is responsible for recognizing external tokens. It should return ` * **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. * **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. * **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. 
-* **`uint32_t (*get_column)(TSLexer *)`** - **(Experimental)** A function for querying the current column position of the lexer. It returns the number of unicode code points (not bytes) since the start of the current line. +* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of bytes (not characters) since the start of the current line. * **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic.
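Finally, to make PATCH 52/53 above more concrete: the reworked `HEADER_REGEX` and `DIVIDER_REGEX` treat any trailing text after a test's `===` / `---` rules as a suffix, and only headers and dividers carrying the same suffix as the file's first header are honored. Below is a hedged example of a corpus test that relies on this behavior, written as a Rust string constant to keep the examples in one language; the node names in the expected S-expression are invented:

```rust
// Sketch only: a corpus test whose body itself contains '=' and '-' rules.
// Because the first header ends with the suffix "|||", the bare "=====" and
// "-----" lines in the body are ignored, and only "---|||" is taken as the
// divider between the input and the expected output.
const EXAMPLE_CORPUS_TEST: &str = "\
====================|||
Horizontal rules
====================|||

=====
-----

---|||

(document
  (heading)
  (thematic_break))
";
```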