diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5435945..bfcd9f8c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: - name: Read Emscripten version run: | - printf 'EMSCRIPTEN_VERSION=%s\n' "$(cat emscripten-version)" >> $GITHUB_ENV + printf 'EMSCRIPTEN_VERSION=%s\n' "$(cat cli/emscripten-version)" >> $GITHUB_ENV - name: Cache artifacts id: cache diff --git a/.gitignore b/.gitignore index 572c2ac5..d73c0e40 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ log*.html .idea *.xcodeproj +.vscode fuzz-results diff --git a/Cargo.lock b/Cargo.lock index a6be4cdc..61262a3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,6 +495,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "ryu" version = "1.0.5" @@ -541,9 +547,9 @@ dependencies = [ [[package]] name = "smallbitvec" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797a4eaffb90d896f29698d45676f9f940a71936d7574996a7df54593ba209fa" +checksum = "75ce4f9dc4a41b4c3476cc925f1efb11b66df373a8fde5d4b8915fa91b5d995e" [[package]] name = "spin" @@ -689,11 +695,13 @@ dependencies = [ "dirs", "glob", "html-escape", + "indexmap", "lazy_static", "log", "rand", "regex", "regex-syntax", + "rustc-hash", "serde", "serde_derive", "serde_json", diff --git a/LICENSE b/LICENSE index 971b81f9..4c220022 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2018 Max Brunsfeld +Copyright (c) 2018-2021 Max Brunsfeld Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 6ec541dd..e559842f 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -27,40 +27,42 @@ difference = "2.0" dirs = "3.0" glob = "0.3.0" html-escape = "0.2.6" +indexmap = "1" lazy_static = "1.2.0" regex = "1" regex-syntax = "0.6.4" +rustc-hash = "1" serde = "1.0" serde_derive = "1.0" -smallbitvec = "2.3.0" +smallbitvec = "2.5.1" tiny_http = "0.8" walkdir = "2.3" webbrowser = "0.5.1" which = "4.1.0" [dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" [dev-dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" features = ["allocation-tracking"] [dependencies.tree-sitter-config] -version = ">= 0.19.0" +version = "0.19.0" path = "config" [dependencies.tree-sitter-highlight] -version = ">= 0.3.0" +version = "0.20" path = "../highlight" [dependencies.tree-sitter-loader] -version = ">= 0.19.0" +version = "0.19.0" path = "loader" [dependencies.tree-sitter-tags] -version = ">= 0.1.0" +version = "0.20" path = "../tags" [dependencies.serde_json] diff --git a/cli/README.md b/cli/README.md index b6f526e9..fe45b17b 100644 --- a/cli/README.md +++ b/cli/README.md @@ -36,4 +36,4 @@ The `tree-sitter` binary itself has no dependencies, but specific commands have * `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. See [the documentation](http://tree-sitter.github.io/tree-sitter/creating-parsers) for more information. -* `parse` - The `tree-sitter parse` command will parse a file (or list of file) using Tree-sitter parsers. 
+* `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers. diff --git a/cli/build.rs b/cli/build.rs index 83be39a5..1986e023 100644 --- a/cli/build.rs +++ b/cli/build.rs @@ -6,7 +6,7 @@ fn main() { println!("cargo:rustc-env={}={}", "BUILD_SHA", git_sha); } - if wasm_files_present() { + if web_playground_files_present() { println!("cargo:rustc-cfg={}", "TREE_SITTER_EMBED_WASM_BINDING"); } @@ -16,15 +16,16 @@ fn main() { "RUST_BINDING_VERSION", rust_binding_version, ); - let emscripten_version = fs::read_to_string("../emscripten-version").unwrap(); + let emscripten_version = fs::read_to_string("emscripten-version").unwrap(); println!( "cargo:rustc-env={}={}", "EMSCRIPTEN_VERSION", emscripten_version, ); } -fn wasm_files_present() -> bool { +fn web_playground_files_present() -> bool { let paths = [ + "../docs/assets/js/playground.js", "../lib/binding_web/tree-sitter.js", "../lib/binding_web/tree-sitter.wasm", ]; @@ -81,10 +82,10 @@ fn read_git_sha() -> Option { } fn read_rust_binding_version() -> String { - let path = "../lib/Cargo.toml"; + let path = "Cargo.toml"; let text = fs::read_to_string(path).unwrap(); let cargo_toml = toml::from_str::(text.as_ref()).unwrap(); - cargo_toml["package"]["version"] + cargo_toml["dependencies"]["tree-sitter"]["version"] .as_str() .unwrap() .trim_matches('"') diff --git a/cli/config/src/lib.rs b/cli/config/src/lib.rs index 7979b969..a09694e4 100644 --- a/cli/config/src/lib.rs +++ b/cli/config/src/lib.rs @@ -1,6 +1,6 @@ //! Manages tree-sitter's configuration file. -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::path::PathBuf; @@ -14,6 +14,7 @@ use std::{env, fs}; /// This type holds the generic JSON content of the configuration file. Individual tree-sitter /// components will use the [`get`][] method to parse that JSON to extract configuration fields /// that are specific to that component. 
+#[derive(Debug)] pub struct Config { pub location: PathBuf, pub config: Value, @@ -64,8 +65,10 @@ impl Config { Some(location) => location, None => return Config::initial(), }; - let content = fs::read_to_string(&location)?; - let config = serde_json::from_str(&content)?; + let content = fs::read_to_string(&location) + .with_context(|| format!("Failed to read {}", &location.to_string_lossy()))?; + let config = serde_json::from_str(&content) + .with_context(|| format!("Bad JSON config {}", &location.to_string_lossy()))?; Ok(Config { location, config }) } diff --git a/emscripten-version b/cli/emscripten-version similarity index 100% rename from emscripten-version rename to cli/emscripten-version diff --git a/cli/loader/Cargo.toml b/cli/loader/Cargo.toml index 4d2c8a5f..da27cae4 100644 --- a/cli/loader/Cargo.toml +++ b/cli/loader/Cargo.toml @@ -25,13 +25,13 @@ version = "1.0" features = ["preserve_order"] [dependencies.tree-sitter] -version = ">= 0.19" +version = "0.20" path = "../../lib" [dependencies.tree-sitter-highlight] -version = ">= 0.19" +version = "0.20" path = "../../highlight" [dependencies.tree-sitter-tags] -version = ">= 0.19" +version = "0.20" path = "../../tags" diff --git a/cli/loader/src/lib.rs b/cli/loader/src/lib.rs index 7d26ab02..0002bf08 100644 --- a/cli/loader/src/lib.rs +++ b/cli/loader/src/lib.rs @@ -12,7 +12,7 @@ use std::process::Command; use std::sync::Mutex; use std::time::SystemTime; use std::{fs, mem}; -use tree_sitter::{Language, QueryError}; +use tree_sitter::{Language, QueryError, QueryErrorKind}; use tree_sitter_highlight::HighlightConfiguration; use tree_sitter_tags::{Error as TagsError, TagsConfiguration}; @@ -101,6 +101,7 @@ pub struct Loader { language_configuration_ids_by_file_type: HashMap>, highlight_names: Box>>, use_all_highlight_names: bool, + debug_build: bool, } unsafe impl Send for Loader {} @@ -122,6 +123,7 @@ impl Loader { language_configuration_ids_by_file_type: HashMap::new(), highlight_names: Box::new(Mutex::new(Vec::new())), use_all_highlight_names: true, + debug_build: false, } } @@ -347,7 +349,11 @@ impl Loader { parser_path: &Path, scanner_path: &Option, ) -> Result { - let mut library_path = self.parser_lib_path.join(name); + let mut lib_name = name.to_string(); + if self.debug_build { + lib_name.push_str(".debug._"); + } + let mut library_path = self.parser_lib_path.join(lib_name); library_path.set_extension(DYLIB_EXTENSION); let recompile = needs_recompile(&library_path, &parser_path, &scanner_path) @@ -369,11 +375,13 @@ impl Loader { } if cfg!(windows) { - command - .args(&["/nologo", "/LD", "/I"]) - .arg(header_path) - .arg("/Od") - .arg(parser_path); + command.args(&["/nologo", "/LD", "/I"]).arg(header_path); + if self.debug_build { + command.arg("/Od"); + } else { + command.arg("/O2"); + } + command.arg(parser_path); if let Some(scanner_path) = scanner_path.as_ref() { command.arg(scanner_path); } @@ -389,8 +397,18 @@ impl Loader { .arg("-I") .arg(header_path) .arg("-o") - .arg(&library_path) - .arg("-O2"); + .arg(&library_path); + + if self.debug_build { + command.arg("-O0"); + } else { + command.arg("-O2"); + } + + // For conditional compilation of external scanner code when + // used internally by `tree-siteer parse` and other sub commands. 
+ command.arg("-DTREE_SITTER_INTERNAL_BUILD"); + if let Some(scanner_path) = scanner_path.as_ref() { if scanner_path.extension() == Some("c".as_ref()) { command.arg("-xc").arg("-std=c99").arg(scanner_path); @@ -639,6 +657,10 @@ impl Loader { Err(anyhow!("No language found")) } } + + pub fn use_debug_build(&mut self, flag: bool) { + self.debug_build = flag; + } } impl<'a> LanguageConfiguration<'a> { @@ -662,28 +684,31 @@ impl<'a> LanguageConfiguration<'a> { &injections_query, &locals_query, ) - .map_err(|error| { - if error.offset < injections_query.len() { - Self::include_path_in_query_error( - error, - &injection_ranges, - &injections_query, - 0, - ) - } else if error.offset < injections_query.len() + locals_query.len() { - Self::include_path_in_query_error( - error, - &locals_ranges, - &locals_query, - injections_query.len(), - ) - } else { - Self::include_path_in_query_error( - error, - &highlight_ranges, - &highlights_query, - injections_query.len() + locals_query.len(), - ) + .map_err(|error| match error.kind { + QueryErrorKind::Language => Error::from(error), + _ => { + if error.offset < injections_query.len() { + Self::include_path_in_query_error( + error, + &injection_ranges, + &injections_query, + 0, + ) + } else if error.offset < injections_query.len() + locals_query.len() { + Self::include_path_in_query_error( + error, + &locals_ranges, + &locals_query, + injections_query.len(), + ) + } else { + Self::include_path_in_query_error( + error, + &highlight_ranges, + &highlights_query, + injections_query.len() + locals_query.len(), + ) + } } })?; let mut all_highlight_names = self.highlight_names.lock().unwrap(); diff --git a/cli/npm/.gitignore b/cli/npm/.gitignore index 2d3aa23a..942b33a1 100644 --- a/cli/npm/.gitignore +++ b/cli/npm/.gitignore @@ -2,3 +2,4 @@ tree-sitter tree-sitter.exe *.gz *.tgz +LICENSE diff --git a/cli/npm/package.json b/cli/npm/package.json index 66c7ccb2..cb0f30f7 100644 --- a/cli/npm/package.json +++ b/cli/npm/package.json @@ -14,7 +14,8 @@ ], "main": "lib/api/index.js", "scripts": { - "install": "node install.js" + "install": "node install.js", + "prepack": "cp ../../LICENSE ." 
}, "bin": { "tree-sitter": "cli.js" diff --git a/cli/src/generate/build_tables/build_lex_table.rs b/cli/src/generate/build_tables/build_lex_table.rs index b365feb1..d3ebb241 100644 --- a/cli/src/generate/build_tables/build_lex_table.rs +++ b/cli/src/generate/build_tables/build_lex_table.rs @@ -347,7 +347,7 @@ fn lex_states_differ( fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) { // Get a mapping of old state index -> new_state_index let mut old_ids_by_new_id = (0..table.states.len()).collect::>(); - &old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]); + old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]); // Get the inverse mapping let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()]; diff --git a/cli/src/generate/build_tables/build_parse_table.rs b/cli/src/generate/build_tables/build_parse_table.rs index bcce614a..59ee631d 100644 --- a/cli/src/generate/build_tables/build_parse_table.rs +++ b/cli/src/generate/build_tables/build_parse_table.rs @@ -11,10 +11,14 @@ use crate::generate::tables::{ ProductionInfo, ProductionInfoId, }; use anyhow::{anyhow, Result}; +use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::Write; +use std::hash::BuildHasherDefault; use std::u32; -use std::{cmp::Ordering, collections::hash_map::Entry}; + +use indexmap::{map::Entry, IndexMap}; +use rustc_hash::FxHasher; // For conflict reporting, each parse state is associated with an example // sequence of symbols that could lead to that parse state. @@ -49,7 +53,7 @@ struct ParseTableBuilder<'a> { lexical_grammar: &'a LexicalGrammar, variable_info: &'a Vec, core_ids_by_core: HashMap, usize>, - state_ids_by_item_set: HashMap, ParseStateId>, + state_ids_by_item_set: IndexMap, ParseStateId, BuildHasherDefault>, parse_state_info_by_id: Vec>, parse_state_queue: VecDeque, non_terminal_extra_states: Vec<(Symbol, usize)>, @@ -147,13 +151,7 @@ impl<'a> ParseTableBuilder<'a> { Entry::Vacant(v) => { let core = v.key().core(); let core_count = self.core_ids_by_core.len(); - let core_id = match self.core_ids_by_core.entry(core) { - Entry::Occupied(e) => *e.get(), - Entry::Vacant(e) => { - e.insert(core_count); - core_count - } - }; + let core_id = *self.core_ids_by_core.entry(core).or_insert(core_count); let state_id = self.parse_table.states.len(); self.parse_state_info_by_id @@ -163,8 +161,8 @@ impl<'a> ParseTableBuilder<'a> { id: state_id, lex_state_id: 0, external_lex_state_id: 0, - terminal_entries: HashMap::new(), - nonterminal_entries: HashMap::new(), + terminal_entries: IndexMap::default(), + nonterminal_entries: IndexMap::default(), core_id, }); self.parse_state_queue.push_back(ParseStateQueueEntry { @@ -981,7 +979,7 @@ pub(crate) fn build_parse_table<'a>( item_set_builder, variable_info, non_terminal_extra_states: Vec::new(), - state_ids_by_item_set: HashMap::new(), + state_ids_by_item_set: IndexMap::default(), core_ids_by_core: HashMap::new(), parse_state_info_by_id: Vec::new(), parse_state_queue: VecDeque::new(), diff --git a/cli/src/generate/build_tables/minimize_parse_table.rs b/cli/src/generate/build_tables/minimize_parse_table.rs index 4c2224c4..d10bea56 100644 --- a/cli/src/generate/build_tables/minimize_parse_table.rs +++ b/cli/src/generate/build_tables/minimize_parse_table.rs @@ -479,7 +479,7 @@ impl<'a> Minimizer<'a> { fn reorder_states_by_descending_size(&mut self) { // Get a mapping of old state index -> new_state_index let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::>(); - 
&old_ids_by_new_id.sort_unstable_by_key(|i| { + old_ids_by_new_id.sort_unstable_by_key(|i| { // Don't changes states 0 (the error state) or 1 (the start state). if *i <= 1 { return *i as i64 - 1_000_000; diff --git a/cli/src/generate/mod.rs b/cli/src/generate/mod.rs index 123e6ffa..141fdff0 100644 --- a/cli/src/generate/mod.rs +++ b/cli/src/generate/mod.rs @@ -169,6 +169,7 @@ fn load_grammar_file(grammar_path: &Path) -> Result { } fn load_js_grammar_file(grammar_path: &Path) -> Result { + let grammar_path = fs::canonicalize(grammar_path)?; let mut node_process = Command::new("node") .env("TREE_SITTER_GRAMMAR_PATH", grammar_path) .stdin(Stdio::piped()) diff --git a/cli/src/generate/prepare_grammar/expand_tokens.rs b/cli/src/generate/prepare_grammar/expand_tokens.rs index a5fe318b..4950348f 100644 --- a/cli/src/generate/prepare_grammar/expand_tokens.rs +++ b/cli/src/generate/prepare_grammar/expand_tokens.rs @@ -19,10 +19,16 @@ lazy_static! { serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap(); static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec> = serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap(); + static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> = + serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap(); + static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> = + serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap(); } const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json"); const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json"); +const UNICODE_CATEGORY_ALIASES_JSON: &'static str = include_str!("./unicode-category-aliases.json"); +const UNICODE_PROPERTY_ALIASES_JSON: &'static str = include_str!("./unicode-property-aliases.json"); const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/']; struct NfaBuilder { @@ -394,12 +400,16 @@ impl NfaBuilder { category_letter = le.to_string(); } ClassUnicodeKind::Named(class_name) => { - if class_name.len() == 1 { - category_letter = class_name.clone(); + let actual_class_name = UNICODE_CATEGORY_ALIASES + .get(class_name.as_str()) + .or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str())) + .unwrap_or(class_name); + if actual_class_name.len() == 1 { + category_letter = actual_class_name.clone(); } else { let code_points = UNICODE_CATEGORIES - .get(class_name.as_str()) - .or_else(|| UNICODE_PROPERTIES.get(class_name.as_str())) + .get(actual_class_name.as_str()) + .or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str())) .ok_or_else(|| { anyhow!( "Regex error: Unsupported unicode character class {}", diff --git a/cli/src/generate/prepare_grammar/unicode-category-aliases.json b/cli/src/generate/prepare_grammar/unicode-category-aliases.json new file mode 100644 index 00000000..c7091c05 --- /dev/null +++ b/cli/src/generate/prepare_grammar/unicode-category-aliases.json @@ -0,0 +1 @@ 
+{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"} \ No newline at end of file diff --git a/cli/src/generate/prepare_grammar/unicode-property-aliases.json b/cli/src/generate/prepare_grammar/unicode-property-aliases.json new file mode 100644 index 00000000..2dd2e28c --- /dev/null +++ b/cli/src/generate/prepare_grammar/unicode-property-aliases.json @@ -0,0 +1 @@ +{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Pre
sentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"} \ No newline at end of file diff --git a/cli/src/generate/render.rs b/cli/src/generate/render.rs index 78a07a22..613776bf 100644 --- a/cli/src/generate/render.rs +++ b/cli/src/generate/render.rs @@ -1057,7 +1057,7 @@ impl Generator { } fn add_parse_table(&mut self) { - let mut parse_table_entries = Vec::new(); + let mut parse_table_entries = HashMap::new(); let mut next_parse_action_list_index = 0; self.get_parse_action_list_id( @@ -1224,6 +1224,11 @@ impl Generator { add_line!(self, ""); } + let mut parse_table_entries: Vec<_> = parse_table_entries + .into_iter() + .map(|(entry, i)| (i, entry)) + .collect(); + parse_table_entries.sort_by_key(|(index, _)| *index); self.add_parse_action_list(parse_table_entries); } @@ -1404,17 +1409,17 @@ impl Generator { fn get_parse_action_list_id( &self, entry: &ParseTableEntry, - parse_table_entries: &mut Vec<(usize, ParseTableEntry)>, + parse_table_entries: &mut HashMap, next_parse_action_list_index: &mut usize, ) -> usize { - if let Some((index, _)) = parse_table_entries.iter().find(|(_, e)| *e == *entry) { - return *index; + if let Some(&index) = parse_table_entries.get(entry) { + index + } else { + let result = *next_parse_action_list_index; + parse_table_entries.insert(entry.clone(), result); + *next_parse_action_list_index += 1 + entry.actions.len(); + result } - - let result = *next_parse_action_list_index; - parse_table_entries.push((result, entry.clone())); - *next_parse_action_list_index += 1 + entry.actions.len(); - result } fn get_field_map_id( diff --git a/cli/src/generate/tables.rs b/cli/src/generate/tables.rs index ccbf8895..16bf1851 100644 --- a/cli/src/generate/tables.rs +++ b/cli/src/generate/tables.rs @@ -1,11 +1,16 @@ use super::nfa::CharacterSet; use super::rules::{Alias, Symbol, TokenSet}; -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; pub(crate) type ProductionInfoId = usize; pub(crate) type ParseStateId = usize; pub(crate) type LexStateId = usize; -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +use std::hash::BuildHasherDefault; + +use indexmap::IndexMap; +use rustc_hash::FxHasher; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) enum ParseAction { Accept, Shift { @@ -28,7 +33,7 @@ pub(crate) enum GotoAction { ShiftExtra, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] 
pub(crate) struct ParseTableEntry { pub actions: Vec, pub reusable: bool, @@ -37,8 +42,8 @@ pub(crate) struct ParseTableEntry { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub(crate) struct ParseState { pub id: ParseStateId, - pub terminal_entries: HashMap, - pub nonterminal_entries: HashMap, + pub terminal_entries: IndexMap>, + pub nonterminal_entries: IndexMap>, pub lex_state_id: usize, pub external_lex_state_id: usize, pub core_id: usize, diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 734b3e6a..7de4afc5 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -2,6 +2,7 @@ pub mod generate; pub mod highlight; pub mod logger; pub mod parse; +pub mod playground; pub mod query; pub mod query_testing; pub mod tags; @@ -9,7 +10,6 @@ pub mod test; pub mod test_highlight; pub mod util; pub mod wasm; -pub mod web_ui; #[cfg(test)] mod tests; diff --git a/cli/src/logger.rs b/cli/src/logger.rs index 6abe6470..ce4f74a3 100644 --- a/cli/src/logger.rs +++ b/cli/src/logger.rs @@ -1,5 +1,6 @@ use log::{LevelFilter, Log, Metadata, Record}; +#[allow(dead_code)] struct Logger { pub filter: Option, } diff --git a/cli/src/main.rs b/cli/src/main.rs index 8d701852..2c18f03f 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -4,7 +4,7 @@ use glob::glob; use std::path::Path; use std::{env, fs, u64}; use tree_sitter_cli::{ - generate, highlight, logger, parse, query, tags, test, test_highlight, util, wasm, web_ui, + generate, highlight, logger, parse, playground, query, tags, test, test_highlight, util, wasm, }; use tree_sitter_config::Config; use tree_sitter_loader as loader; @@ -35,6 +35,45 @@ fn run() -> Result<()> { BUILD_VERSION.to_string() }; + let debug_arg = Arg::with_name("debug") + .help("Show parsing debug log") + .long("debug") + .short("d"); + + let debug_graph_arg = Arg::with_name("debug-graph") + .help("Produce the log.html file with debug graphs") + .long("debug-graph") + .short("D"); + + let debug_build_arg = Arg::with_name("debug-build") + .help("Compile a parser in debug mode") + .long("debug-build") + .short("0"); + + let paths_file_arg = Arg::with_name("paths-file") + .help("The path to a file with paths to source file(s)") + .long("paths") + .takes_value(true); + + let paths_arg = Arg::with_name("paths") + .help("The source file(s) to use") + .multiple(true); + + let scope_arg = Arg::with_name("scope") + .help("Select a language by the scope instead of a file extension") + .long("scope") + .takes_value(true); + + let time_arg = Arg::with_name("time") + .help("Measure execution time") + .long("time") + .short("t"); + + let quiet_arg = Arg::with_name("quiet") + .help("Suppress main output") + .long("quiet") + .short("q"); + let matches = App::new("tree-sitter") .author("Max Brunsfeld ") .about("Generates and tests parsers") @@ -65,23 +104,30 @@ fn run() -> Result<()> { SubCommand::with_name("parse") .alias("p") .about("Parse files") - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) - .arg( - Arg::with_name("paths") - .index(1) - .multiple(true) - .required(false), - ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("debug").long("debug").short("d")) - .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")) + .arg(&paths_file_arg) + .arg(&paths_arg) + .arg(&scope_arg) + .arg(&debug_arg) + .arg(&debug_build_arg) + .arg(&debug_graph_arg) .arg(Arg::with_name("debug-xml").long("xml").short("x")) - .arg(Arg::with_name("quiet").long("quiet").short("q")) - .arg(Arg::with_name("stat").long("stat").short("s")) - 
.arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("timeout").long("timeout").takes_value(true)) + .arg( + Arg::with_name("stat") + .help("Show parsing statistic") + .long("stat") + .short("s"), + ) + .arg( + Arg::with_name("timeout") + .help("Interrupt the parsing process by timeout (µs)") + .long("timeout") + .takes_value(true), + ) + .arg(&time_arg) + .arg(&quiet_arg) .arg( Arg::with_name("edits") + .help("Apply edits in the format: \"row,col del_count insert_text\"") .long("edit") .short("edit") .takes_value(true) @@ -93,36 +139,32 @@ fn run() -> Result<()> { SubCommand::with_name("query") .alias("q") .about("Search files using a syntax tree query") - .arg(Arg::with_name("query-path").index(1).required(true)) - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("paths") - .index(2) - .multiple(true) - .required(false), + Arg::with_name("query-path") + .help("Path to a file with queries") + .index(1) + .required(true), ) + .arg(&paths_file_arg) + .arg(&paths_arg.clone().index(2)) .arg( Arg::with_name("byte-range") .help("The range of byte offsets in which the query will be executed") .long("byte-range") .takes_value(true), ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) + .arg(&scope_arg) .arg(Arg::with_name("captures").long("captures").short("c")) .arg(Arg::with_name("test").long("test")), ) .subcommand( SubCommand::with_name("tags") - .arg(Arg::with_name("quiet").long("quiet").short("q")) - .arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) - .arg( - Arg::with_name("paths") - .help("The source file to use") - .index(1) - .multiple(true), - ), + .about("Generate a list of tags") + .arg(&scope_arg) + .arg(&time_arg) + .arg(&quiet_arg) + .arg(&paths_file_arg) + .arg(&paths_arg), ) .subcommand( SubCommand::with_name("test") @@ -141,23 +183,24 @@ fn run() -> Result<()> { .short("u") .help("Update all syntax trees in corpus files with current parser output"), ) - .arg(Arg::with_name("debug").long("debug").short("d")) - .arg(Arg::with_name("debug-graph").long("debug-graph").short("D")), + .arg(&debug_arg) + .arg(&debug_build_arg) + .arg(&debug_graph_arg), ) .subcommand( SubCommand::with_name("highlight") .about("Highlight a file") - .arg(Arg::with_name("paths-file").long("paths").takes_value(true)) .arg( - Arg::with_name("paths") - .index(1) - .multiple(true) - .required(false), + Arg::with_name("html") + .help("Generate highlighting as an HTML document") + .long("html") + .short("H"), ) - .arg(Arg::with_name("scope").long("scope").takes_value(true)) - .arg(Arg::with_name("html").long("html").short("H")) - .arg(Arg::with_name("time").long("time").short("t")) - .arg(Arg::with_name("quiet").long("quiet").short("q")), + .arg(&scope_arg) + .arg(&time_arg) + .arg(&quiet_arg) + .arg(&paths_file_arg) + .arg(&paths_arg), ) .subcommand( SubCommand::with_name("build-wasm") @@ -180,7 +223,7 @@ fn run() -> Result<()> { Arg::with_name("quiet") .long("quiet") .short("q") - .help("open in default browser"), + .help("Don't open in default browser"), ), ) .subcommand( @@ -237,8 +280,12 @@ fn run() -> Result<()> { ("test", Some(matches)) => { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); + let debug_build = matches.is_present("debug-build"); let update = matches.is_present("update"); let filter = matches.value_of("filter"); + + 
loader.use_debug_build(debug_build); + let languages = loader.languages_at_path(¤t_dir)?; let language = languages .first() @@ -274,6 +321,7 @@ fn run() -> Result<()> { ("parse", Some(matches)) => { let debug = matches.is_present("debug"); let debug_graph = matches.is_present("debug-graph"); + let debug_build = matches.is_present("debug-build"); let debug_xml = matches.is_present("debug-xml"); let quiet = matches.is_present("quiet"); let time = matches.is_present("time"); @@ -287,6 +335,8 @@ fn run() -> Result<()> { env::set_var("TREE_SITTER_DEBUG", "1"); } + loader.use_debug_build(debug_build); + let timeout = matches .value_of("timeout") .map_or(0, |t| u64::from_str_radix(t, 10).unwrap()); @@ -418,11 +468,10 @@ fn run() -> Result<()> { if let Some(highlight_config) = language_config.highlight_config(language)? { let source = fs::read(path)?; - let theme_config = config.get()?; if html_mode { highlight::html( &loader, - &theme_config, + &theme_config.theme, &source, highlight_config, quiet, @@ -431,7 +480,7 @@ fn run() -> Result<()> { } else { highlight::ansi( &loader, - &theme_config, + &theme_config.theme, &source, highlight_config, time, @@ -455,7 +504,7 @@ fn run() -> Result<()> { ("playground", Some(matches)) => { let open_in_browser = !matches.is_present("quiet"); - web_ui::serve(¤t_dir, open_in_browser); + playground::serve(¤t_dir, open_in_browser); } ("dump-languages", Some(_)) => { diff --git a/cli/src/web_ui.html b/cli/src/playground.html similarity index 100% rename from cli/src/web_ui.html rename to cli/src/playground.html diff --git a/cli/src/web_ui.rs b/cli/src/playground.rs similarity index 74% rename from cli/src/web_ui.rs rename to cli/src/playground.rs index bbdbd381..f674ce11 100644 --- a/cli/src/web_ui.rs +++ b/cli/src/playground.rs @@ -9,28 +9,6 @@ use tiny_http::{Header, Response, Server}; use webbrowser; macro_rules! resource { - ($name: tt, $path: tt) => { - #[cfg(TREE_SITTER_EMBED_WASM_BINDING)] - fn $name(tree_sitter_dir: &Option) -> Vec { - if let Some(tree_sitter_dir) = tree_sitter_dir { - fs::read(tree_sitter_dir.join($path)).unwrap() - } else { - include_bytes!(concat!("../../", $path)).to_vec() - } - } - - #[cfg(not(TREE_SITTER_EMBED_WASM_BINDING))] - fn $name(tree_sitter_dir: &Option) -> Vec { - if let Some(tree_sitter_dir) = tree_sitter_dir { - fs::read(tree_sitter_dir.join($path)).unwrap() - } else { - include_bytes!(concat!("../../", $path)).to_vec() - } - } - }; -} - -macro_rules! optional_resource { ($name: tt, $path: tt) => { #[cfg(TREE_SITTER_EMBED_WASM_BINDING)] fn $name(tree_sitter_dir: &Option) -> Vec { @@ -52,15 +30,15 @@ macro_rules! 
optional_resource { }; } -resource!(get_main_html, "cli/src/web_ui.html"); +resource!(get_main_html, "cli/src/playground.html"); resource!(get_playground_js, "docs/assets/js/playground.js"); -optional_resource!(get_lib_js, "lib/binding_web/tree-sitter.js"); -optional_resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); +resource!(get_lib_js, "lib/binding_web/tree-sitter.js"); +resource!(get_lib_wasm, "lib/binding_web/tree-sitter.wasm"); pub fn serve(grammar_path: &Path, open_in_browser: bool) { let port = get_available_port().expect("Couldn't find an available port"); - let url = format!("127.0.0.1:{}", port); - let server = Server::http(&url).expect("Failed to start web server"); + let addr = format!("127.0.0.1:{}", port); + let server = Server::http(&addr).expect("Failed to start web server"); let grammar_name = wasm::get_grammar_name(&grammar_path.join("src")) .with_context(|| "Failed to get wasm filename") .unwrap(); @@ -73,8 +51,10 @@ pub fn serve(grammar_path: &Path, open_in_browser: bool) { ) }) .unwrap(); + let url = format!("http://{}", addr); + println!("Started playground on: {}", url); if open_in_browser { - if let Err(_) = webbrowser::open(&format!("http://127.0.0.1:{}", port)) { + if let Err(_) = webbrowser::open(&url) { eprintln!("Failed to open '{}' in a web browser", url); } } @@ -95,17 +75,23 @@ pub fn serve(grammar_path: &Path, open_in_browser: bool) { for request in server.incoming_requests() { let res = match request.url() { "/" => response(&main_html, &html_header), - "/playground.js" => response(&playground_js, &js_header), "/tree-sitter-parser.wasm" => response(&language_wasm, &wasm_header), + "/playground.js" => { + if playground_js.is_empty() { + redirect("https://tree-sitter.github.io/tree-sitter/assets/js/playground.js") + } else { + response(&playground_js, &js_header) + } + } "/tree-sitter.js" => { - if cfg!(windows) { + if lib_js.is_empty() { redirect("https://tree-sitter.github.io/tree-sitter.js") } else { response(&lib_js, &js_header) } } "/tree-sitter.wasm" => { - if cfg!(windows) { + if lib_wasm.is_empty() { redirect("https://tree-sitter.github.io/tree-sitter.wasm") } else { response(&lib_wasm, &wasm_header) diff --git a/cli/src/query.rs b/cli/src/query.rs index 9039f751..73d6dd28 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -48,10 +48,12 @@ pub fn query_files_at_paths( let capture_name = &query.capture_names()[capture.index as usize]; writeln!( &mut stdout, - " pattern: {}, capture: {}, row: {}, text: {:?}", + " pattern: {:>2}, capture: {} - {}, start: {}, end: {}, text: `{}`", mat.pattern_index, + capture.index, capture_name, - capture.node.start_position().row, + capture.node.start_position(), + capture.node.end_position(), capture.node.utf8_text(&source_code).unwrap_or("") )?; results.push(query_testing::CaptureInfo { @@ -70,9 +72,11 @@ pub fn query_files_at_paths( if end.row == start.row { writeln!( &mut stdout, - " capture: {}, start: {}, text: {:?}", + " capture: {} - {}, start: {}, end: {}, text: `{}`", + capture.index, capture_name, start, + end, capture.node.utf8_text(&source_code).unwrap_or("") )?; } else { diff --git a/cli/src/query_testing.rs b/cli/src/query_testing.rs index 6dc35c8d..9950f12f 100644 --- a/cli/src/query_testing.rs +++ b/cli/src/query_testing.rs @@ -48,40 +48,38 @@ pub fn parse_position_comments( if node.kind().contains("comment") { if let Ok(text) = node.utf8_text(source) { let mut position = node.start_position(); - if position.row == 0 { - continue; - } - - // Find the arrow character ("^" or '<-") in the 
comment. A left arrow - // refers to the column where the comment node starts. An up arrow refers - // to its own column. - let mut has_left_caret = false; - let mut has_arrow = false; - let mut arrow_end = 0; - for (i, c) in text.char_indices() { - arrow_end = i + 1; - if c == '-' && has_left_caret { - has_arrow = true; - break; + if position.row > 0 { + // Find the arrow character ("^" or '<-") in the comment. A left arrow + // refers to the column where the comment node starts. An up arrow refers + // to its own column. + let mut has_left_caret = false; + let mut has_arrow = false; + let mut arrow_end = 0; + for (i, c) in text.char_indices() { + arrow_end = i + 1; + if c == '-' && has_left_caret { + has_arrow = true; + break; + } + if c == '^' { + has_arrow = true; + position.column += i; + break; + } + has_left_caret = c == '<'; } - if c == '^' { - has_arrow = true; - position.column += i; - break; - } - has_left_caret = c == '<'; - } - // If the comment node contains an arrow and a highlight name, record the - // highlight name and the position. - if let (true, Some(mat)) = - (has_arrow, CAPTURE_NAME_REGEX.find(&text[arrow_end..])) - { - assertion_ranges.push((node.start_position(), node.end_position())); - result.push(Assertion { - position: position, - expected_capture_name: mat.as_str().to_string(), - }); + // If the comment node contains an arrow and a highlight name, record the + // highlight name and the position. + if let (true, Some(mat)) = + (has_arrow, CAPTURE_NAME_REGEX.find(&text[arrow_end..])) + { + assertion_ranges.push((node.start_position(), node.end_position())); + result.push(Assertion { + position: position, + expected_capture_name: mat.as_str().to_string(), + }); + } } } } diff --git a/cli/src/test.rs b/cli/src/test.rs index 9c6987d7..4374f527 100644 --- a/cli/src/test.rs +++ b/cli/src/test.rs @@ -5,7 +5,6 @@ use difference::{Changeset, Difference}; use lazy_static::lazy_static; use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder}; use regex::Regex; -use std::char; use std::ffi::OsStr; use std::fmt::Write as FmtWrite; use std::fs; @@ -16,11 +15,12 @@ use tree_sitter::{Language, LogType, Parser, Query}; use walkdir::WalkDir; lazy_static! { - static ref HEADER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^===+\r?\n([^=]*)\r?\n===+\r?\n") - .multi_line(true) - .build() - .unwrap(); - static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n") + static ref HEADER_REGEX: ByteRegex = + ByteRegexBuilder::new(r"^===+(?P[^=\r\n][^\r\n]*)?\r?\n(?P[^=\r\n][^\r\n]*)\r?\n===+(?P[^=\r\n][^\r\n]*)?\r?\n") + .multi_line(true) + .build() + .unwrap(); + static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+(?P[^-\r\n][^\r\n]*)?\r?\n") .multi_line(true) .build() .unwrap(); @@ -114,7 +114,9 @@ pub fn run_tests_at_path( print_diff_key(); for (i, (name, actual, expected)) in failures.iter().enumerate() { println!("\n {}. 
{}:", i + 1, name); - print_diff(actual, expected); + let actual = format_sexp_indented(&actual, 2); + let expected = format_sexp_indented(&expected, 2); + print_diff(&actual, &expected); } Err(anyhow!("")) } @@ -153,8 +155,7 @@ pub fn print_diff_key() { } pub fn print_diff(actual: &String, expected: &String) { - let changeset = Changeset::new(actual, expected, " "); - print!(" "); + let changeset = Changeset::new(actual, expected, "\n"); for diff in &changeset.diffs { match diff { Difference::Same(part) => { @@ -263,9 +264,13 @@ fn run_tests( } fn format_sexp(sexp: &String) -> String { + format_sexp_indented(sexp, 0) +} + +fn format_sexp_indented(sexp: &String, initial_indent_level: u32) -> String { let mut formatted = String::new(); - let mut indent_level = 0; + let mut indent_level = initial_indent_level; let mut has_field = false; let mut s_iter = sexp.split(|c| c == ' ' || c == ')'); while let Some(s) = s_iter.next() { @@ -375,22 +380,58 @@ fn parse_test_content(name: String, content: String, file_path: Option) let mut prev_name = String::new(); let mut prev_header_end = 0; - // Identify all of the test descriptions using the `======` headers. - for (header_start, header_end) in HEADER_REGEX - .find_iter(&bytes) - .map(|m| (m.start(), m.end())) - .chain(Some((bytes.len(), bytes.len()))) - { - // Find the longest line of dashes following each test description. - // That is the divider between input and expected output. + // Find the first test header in the file, and determine if it has a + // custom suffix. If so, then this suffix will be used to identify + // all subsequent headers and divider lines in the file. + let first_suffix = HEADER_REGEX + .captures(bytes) + .and_then(|c| c.name("suffix1")) + .map(|m| String::from_utf8_lossy(m.as_bytes())); + + // Find all of the `===` test headers, which contain the test names. + // Ignore any matches whose suffix does not match the first header + // suffix in the file. + let header_matches = HEADER_REGEX.captures_iter(&bytes).filter_map(|c| { + let suffix1 = c + .name("suffix1") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + let suffix2 = c + .name("suffix2") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + if suffix1 == first_suffix && suffix2 == first_suffix { + let header_range = c.get(0).unwrap().range(); + let test_name = c + .name("test_name") + .map(|c| String::from_utf8_lossy(c.as_bytes()).to_string()); + Some((header_range, test_name)) + } else { + None + } + }); + + for (header_range, test_name) in header_matches.chain(Some((bytes.len()..bytes.len(), None))) { + // Find the longest line of dashes following each test description. That line + // separates the input from the expected output. Ignore any matches whose suffix + // does not match the first suffix in the file. 
if prev_header_end > 0 { - let divider_match = DIVIDER_REGEX - .find_iter(&bytes[prev_header_end..header_start]) - .map(|m| (prev_header_end + m.start(), prev_header_end + m.end())) - .max_by_key(|(start, end)| end - start); - if let Some((divider_start, divider_end)) = divider_match { - if let Ok(output) = str::from_utf8(&bytes[divider_end..header_start]) { - let mut input = bytes[prev_header_end..divider_start].to_vec(); + let divider_range = DIVIDER_REGEX + .captures_iter(&bytes[prev_header_end..header_range.start]) + .filter_map(|m| { + let suffix = m + .name("suffix") + .map(|m| String::from_utf8_lossy(m.as_bytes())); + if suffix == first_suffix { + let range = m.get(0).unwrap().range(); + Some((prev_header_end + range.start)..(prev_header_end + range.end)) + } else { + None + } + }) + .max_by_key(|range| range.len()); + + if let Some(divider_range) = divider_range { + if let Ok(output) = str::from_utf8(&bytes[divider_range.end..header_range.start]) { + let mut input = bytes[prev_header_end..divider_range.start].to_vec(); // Remove trailing newline from the input. input.pop(); @@ -400,6 +441,7 @@ fn parse_test_content(name: String, content: String, file_path: Option) // Remove all comments let output = COMMENT_REGEX.replace_all(output, "").to_string(); + // Normalize the whitespace in the expected output. let output = WHITESPACE_REGEX.replace_all(output.trim(), " "); let output = output.replace(" )", ")"); @@ -417,10 +459,8 @@ fn parse_test_content(name: String, content: String, file_path: Option) } } } - prev_name = String::from_utf8_lossy(&bytes[header_start..header_end]) - .trim_matches(|c| char::is_whitespace(c) || c == '=') - .to_string(); - prev_header_end = header_end; + prev_name = test_name.unwrap_or(String::new()); + prev_header_end = header_range.end; } TestEntry::Group { name, @@ -434,7 +474,7 @@ mod tests { use super::*; #[test] - fn test_parse_test_content() { + fn test_parse_test_content_simple() { let entry = parse_test_content( "the-filename".to_string(), r#" @@ -664,4 +704,88 @@ code } ); } + + #[test] + fn test_parse_test_content_with_suffixes() { + let entry = parse_test_content( + "the-filename".to_string(), + r#" +==================asdf\()[]|{}*+?^$.- +First test +==================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + +==================asdf\()[]|{}*+?^$.- +Second test +==================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + +=========================asdf\()[]|{}*+?^$.- +Test name with = symbol +=========================asdf\()[]|{}*+?^$.- + +========================= +NOT A TEST HEADER +========================= +------------------------- + +---asdf\()[]|{}*+?^$.- + +(a) + "# + .trim() + .to_string(), + None, + ); + + let expected_input = "\n=========================\n\ + NOT A TEST HEADER\n\ + =========================\n\ + -------------------------\n" + .as_bytes() + .to_vec(); + assert_eq!( + entry, + TestEntry::Group { + name: "the-filename".to_string(), + children: vec![ + TestEntry::Example { + name: "First test".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + }, + TestEntry::Example { + name: "Second test".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + }, + TestEntry::Example { + name: "Test name with = 
symbol".to_string(), + input: expected_input.clone(), + output: "(a)".to_string(), + has_fields: false, + } + ], + file_path: None, + } + ); + } } diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs index 58cd1880..a24ed4bb 100644 --- a/cli/src/tests/parser_test.rs +++ b/cli/src/tests/parser_test.rs @@ -63,9 +63,14 @@ fn test_parsing_with_logging() { ))); assert!(messages.contains(&(LogType::Lex, "skip character:' '".to_string()))); + let mut row_starts_from_0 = false; for (_, m) in &messages { - assert!(!m.contains("row:0")); + if m.contains("row:0") { + row_starts_from_0 = true; + break; + } } + assert!(row_starts_from_0); } #[test] @@ -849,7 +854,10 @@ fn test_parsing_with_multiple_included_ranges() { hello_text_node.start_byte(), source_code.find("Hello").unwrap() ); - assert_eq!(hello_text_node.end_byte(), source_code.find("").unwrap()); + assert_eq!( + hello_text_node.end_byte(), + source_code.find(" ").unwrap() + ); assert_eq!(b_start_tag_node.kind(), "start_tag"); assert_eq!( diff --git a/cli/src/tests/test_highlight_test.rs b/cli/src/tests/test_highlight_test.rs index 1a658281..af2c15c5 100644 --- a/cli/src/tests/test_highlight_test.rs +++ b/cli/src/tests/test_highlight_test.rs @@ -17,6 +17,7 @@ fn test_highlight_test_with_basic_test() { ], ); let source = [ + "// hi", "var abc = function(d) {", " // ^ function", " // ^ keyword", @@ -32,15 +33,15 @@ fn test_highlight_test_with_basic_test() { assertions, &[ Assertion { - position: Point::new(0, 5), + position: Point::new(1, 5), expected_capture_name: "function".to_string() }, Assertion { - position: Point::new(0, 11), + position: Point::new(1, 11), expected_capture_name: "keyword".to_string() }, Assertion { - position: Point::new(3, 9), + position: Point::new(4, 9), expected_capture_name: "variable.parameter".to_string() }, ] @@ -53,12 +54,12 @@ fn test_highlight_test_with_basic_test() { assert_eq!( highlight_positions, &[ - (Point::new(0, 0), Point::new(0, 3), Highlight(2)), // "var" - (Point::new(0, 4), Point::new(0, 7), Highlight(0)), // "abc" - (Point::new(0, 10), Point::new(0, 18), Highlight(2)), // "function" - (Point::new(0, 19), Point::new(0, 20), Highlight(1)), // "d" - (Point::new(3, 2), Point::new(3, 8), Highlight(2)), // "return" - (Point::new(3, 9), Point::new(3, 10), Highlight(1)), // "d" + (Point::new(1, 0), Point::new(1, 3), Highlight(2)), // "var" + (Point::new(1, 4), Point::new(1, 7), Highlight(0)), // "abc" + (Point::new(1, 10), Point::new(1, 18), Highlight(2)), // "function" + (Point::new(1, 19), Point::new(1, 20), Highlight(1)), // "d" + (Point::new(4, 2), Point::new(4, 8), Highlight(2)), // "return" + (Point::new(4, 9), Point::new(4, 10), Highlight(1)), // "d" ] ); } diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 6a18f64c..d22264e7 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -6,8 +6,8 @@ GEM minitest (~> 5.1) thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.5.2) - public_suffix (>= 2.0.2, < 4.0) + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs @@ -16,12 +16,27 @@ GEM commonmarker (0.17.8) ruby-enum (~> 0.5) concurrent-ruby (1.0.5) - ethon (0.11.0) - ffi (>= 1.3.0) + ethon (0.14.0) + ffi (>= 1.15.0) execjs (2.7.0) - faraday (0.14.0) + faraday (1.5.1) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0.1) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.1) + faraday-patron (~> 1.0) multipart-post (>= 1.2, < 3) - ffi 
(1.9.23) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + ffi (1.15.3) forwardable-extended (2.6.0) gemoji (3.0.0) github-pages (177) @@ -195,33 +210,35 @@ GEM minima (2.1.1) jekyll (~> 3.3) minitest (5.11.3) - multipart-post (2.0.0) - net-dns (0.8.0) + multipart-post (2.1.1) + net-dns (0.9.0) nokogiri (1.11.4) mini_portile2 (~> 2.5.0) racc (~> 1.4) - octokit (4.8.0) + octokit (4.21.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.1) + pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (2.0.5) racc (1.5.2) - rb-fsevent (0.10.2) - rb-inotify (0.9.10) - ffi (>= 0.5.0, < 2) + rb-fsevent (0.11.0) + rb-inotify (0.10.1) + ffi (~> 1.0) rouge (2.2.1) ruby-enum (0.7.2) i18n + ruby2_keywords (0.0.4) rubyzip (2.0.0) - safe_yaml (1.0.4) - sass (3.5.5) + safe_yaml (1.0.5) + sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.1) - addressable (>= 2.3.5, < 2.6) - faraday (~> 0.8, < 1.0) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) diff --git a/docs/index.md b/docs/index.md index 81c6ae7d..bb87a2a8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,12 +15,13 @@ Tree-sitter is a parser generator tool and an incremental parsing library. It ca There are currently bindings that allow Tree-sitter to be used from the following languages: -* [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) -* [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) +* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) * [JavaScript (Node.js)](https://github.com/tree-sitter/node-tree-sitter) +* [JavaScript (Wasm)](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_web) +* [OCaml](https://github.com/returntocorp/ocaml-tree-sitter-core) * [Python](https://github.com/tree-sitter/py-tree-sitter) * [Ruby](https://github.com/tree-sitter/ruby-tree-sitter) -* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter) +* [Rust](https://github.com/tree-sitter/tree-sitter/tree/master/lib/binding_rust) ### Available Parsers @@ -31,11 +32,13 @@ Parsers for these languages are fairly complete: * [C#](https://github.com/tree-sitter/tree-sitter-c-sharp) * [C++](https://github.com/tree-sitter/tree-sitter-cpp) * [CSS](https://github.com/tree-sitter/tree-sitter-css) +* [DOT](https://github.com/rydesun/tree-sitter-dot) * [Elm](https://github.com/elm-tooling/tree-sitter-elm) * [Eno](https://github.com/eno-lang/tree-sitter-eno) * [ERB / EJS](https://github.com/tree-sitter/tree-sitter-embedded-template) * [Fennel](https://github.com/travonted/tree-sitter-fennel) * [Go](https://github.com/tree-sitter/tree-sitter-go) +* [HCL](https://github.com/MichaHoffmann/tree-sitter-hcl) * [HTML](https://github.com/tree-sitter/tree-sitter-html) * [Java](https://github.com/tree-sitter/tree-sitter-java) * [JavaScript](https://github.com/tree-sitter/tree-sitter-javascript) @@ -60,6 +63,7 @@ Parsers for these languages are fairly complete: * [Vue](https://github.com/ikatyang/tree-sitter-vue) * [YAML](https://github.com/ikatyang/tree-sitter-yaml) * [WASM](https://github.com/wasm-lsp/tree-sitter-wasm) +* [WGSL WebGPU Shading Language](https://github.com/mehmetoguzderin/tree-sitter-wgsl) 
Parsers for these languages are in development: @@ -67,10 +71,12 @@ Parsers for these languages are in development: * [Erlang](https://github.com/AbstractMachinesLab/tree-sitter-erlang/) * [Dockerfile](https://github.com/camdencheek/tree-sitter-dockerfile) * [Go mod](https://github.com/camdencheek/tree-sitter-go-mod) +* [Hack](https://github.com/slackhq/tree-sitter-hack) * [Haskell](https://github.com/tree-sitter/tree-sitter-haskell) * [Julia](https://github.com/tree-sitter/tree-sitter-julia) * [Kotlin](https://github.com/fwcd/tree-sitter-kotlin) * [Nix](https://github.com/cstrahan/tree-sitter-nix) +* [Objective-C](https://github.com/jiyee/tree-sitter-objc) * [Perl](https://github.com/ganezdragon/tree-sitter-perl) * [Scala](https://github.com/tree-sitter/tree-sitter-scala) * [Sourcepawn](https://github.com/nilshelmig/tree-sitter-sourcepawn) @@ -89,8 +95,8 @@ Parsers for these languages are in development: The design of Tree-sitter was greatly influenced by the following research papers: - [Practical Algorithms for Incremental Software Development Environments](https://www2.eecs.berkeley.edu/Pubs/TechRpts/1997/CSD-97-946.pdf) -- [Context Aware Scanning for Parsing Extensible Languages](http://www.umsec.umn.edu/publications/Context-Aware-Scanning-Parsing-Extensible) -- [Efficient and Flexible Incremental Parsing](http://ftp.cs.berkeley.edu/sggs/toplas-parsing.ps) -- [Incremental Analysis of Real Programming Languages](https://pdfs.semanticscholar.org/ca69/018c29cc415820ed207d7e1d391e2da1656f.pdf) +- [Context Aware Scanning for Parsing Extensible Languages](https://www-users.cse.umn.edu/~evw/pubs/vanwyk07gpce/vanwyk07gpce.pdf) +- [Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.pdf) +- [Incremental Analysis of Real Programming Languages](http://harmonia.cs.berkeley.edu/papers/twagner-glr.pdf) - [Error Detection and Recovery in LR Parsers](http://what-when-how.com/compiler-writing/bottom-up-parsing-compiler-writing-part-13) -- [Error Recovery for LR Parsers](http://www.dtic.mil/dtic/tr/fulltext/u2/a043470.pdf) +- [Error Recovery for LR Parsers](https://apps.dtic.mil/sti/pdfs/ADA043470.pdf) diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md index 06aa0c00..d3734018 100644 --- a/docs/section-2-using-parsers.md +++ b/docs/section-2-using-parsers.md @@ -464,7 +464,7 @@ In general, it's a good idea to make patterns more specific by specifying [field #### Negated Fields -You can also constrain a pattern so that it only mathces nodes that *lack* a certain field. To do this, add a field name prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters: +You can also constrain a pattern so that it only matches nodes that *lack* a certain field. To do this, add a field name prefixed by a `!` within the parent pattern. For example, this pattern would match a class declaration with no type parameters: ``` (class_declaration @@ -586,8 +586,10 @@ This pattern would match a set of possible keyword tokens, capturing them as `@k #### Wildcard Node -A wildcard node is represented with an underscore (`(_)`), it matches any node. +A wildcard node is represented with an underscore (`_`), it matches any node. This is similar to `.` in regular expressions. +There are two types, `(_)` will match any named node, +and `_` will match any named or anonymous node. 
For example, this pattern would match any node inside a call: diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index 83e5a1c9..f5f7c933 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -84,7 +84,7 @@ tree-sitter parse example-file This should print the following: ``` -(source_file [1, 0] - [1, 5]) +(source_file [0, 0] - [1, 0]) ``` You now have a working parser. @@ -95,7 +95,7 @@ Let's go over all of the functionality of the `tree-sitter` command line tool. ### Command: `generate` -The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter` generate again. +The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter generate` again. The first time you run `tree-sitter generate`, it will also generate a few other files: @@ -674,7 +674,7 @@ This function is responsible for recognizing external tokens. It should return ` * **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. * **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. * **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token. -* **`uint32_t (*get_column)(TSLexer *)`** - **(Experimental)** A function for querying the current column position of the lexer. It returns the number of unicode code points (not bytes) since the start of the current line. +* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of bytes (not characters) since the start of the current line. * **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic. 
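The external-scanner hunk above lists the `TSLexer` callbacks (`result_symbol`, `advance`, `mark_end`, the valid-symbols array) individually; a small scanner makes it easier to see how they are normally combined. The sketch below is illustrative only and is not part of this patch: the grammar name `example` and the invented `RAW_TEXT` token are placeholders for whatever a real grammar's `externals` list declares.

```c
// Hypothetical external scanner (not from this diff): recognizes one invented
// RAW_TEXT token, to illustrate the TSLexer callbacks described above.
#include <tree_sitter/parser.h>

enum TokenType { RAW_TEXT };  // must mirror the order of `externals` in grammar.js

void *tree_sitter_example_external_scanner_create(void) { return NULL; }
void tree_sitter_example_external_scanner_destroy(void *payload) {}
unsigned tree_sitter_example_external_scanner_serialize(void *payload, char *buffer) { return 0; }
void tree_sitter_example_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {}

bool tree_sitter_example_external_scanner_scan(
  void *payload,
  TSLexer *lexer,
  const bool *valid_symbols
) {
  // Only look for RAW_TEXT when the parser says it is currently valid.
  if (!valid_symbols[RAW_TEXT]) return false;

  // Treat leading whitespace as skipped rather than as part of the token.
  while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\n') {
    lexer->advance(lexer, true);
  }

  // Consume up to (but not including) '<', calling mark_end after each
  // accepted character so later lookahead is not included in the token size.
  bool found = false;
  while (lexer->lookahead != 0 && lexer->lookahead != '<') {
    lexer->advance(lexer, false);
    lexer->mark_end(lexer);
    found = true;
  }

  if (found) lexer->result_symbol = RAW_TEXT;
  return found;
}
```

Consistent with the guidance in that section, the sketch returns `false` whenever its token is not in `valid_symbols`, since the lexer cannot backtrack once it has advanced.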
diff --git a/docs/section-6-contributing.md b/docs/section-6-contributing.md index 685fe5e7..36f5f499 100644 --- a/docs/section-6-contributing.md +++ b/docs/section-6-contributing.md @@ -29,7 +29,7 @@ git clone https://github.com/tree-sitter/tree-sitter cd tree-sitter ``` -Optionally, build the WASM library. If you skip this step, then the `tree-sitter web-ui` command will require an internet connection. If you have emscripten installed, this will use your `emcc` compiler. Otherwise, it will use Docker: +Optionally, build the WASM library. If you skip this step, then the `tree-sitter playground` command will require an internet connection. If you have emscripten installed, this will use your `emcc` compiler. Otherwise, it will use Docker: ```sh ./script/build-wasm diff --git a/highlight/Cargo.toml b/highlight/Cargo.toml index 78281df7..7acc10b9 100644 --- a/highlight/Cargo.toml +++ b/highlight/Cargo.toml @@ -4,7 +4,7 @@ description = "Library for performing syntax highlighting with Tree-sitter" version = "0.20.0" authors = [ "Max Brunsfeld ", - "Tim Clem " + "Tim Clem ", ] license = "MIT" readme = "README.md" @@ -21,5 +21,5 @@ regex = "1" thiserror = "1.0" [dependencies.tree-sitter] -version = ">= 0.3.7" +version = "0.20" path = "../lib" diff --git a/highlight/src/lib.rs b/highlight/src/lib.rs index b7bfeba8..58d7e88c 100644 --- a/highlight/src/lib.rs +++ b/highlight/src/lib.rs @@ -586,7 +586,7 @@ where break; } if i > 0 { - &self.layers[0..(i + 1)].rotate_left(1); + self.layers[0..(i + 1)].rotate_left(1); } break; } else { diff --git a/lib/binding_rust/bindings.rs b/lib/binding_rust/bindings.rs index 5c032a36..881780e4 100644 --- a/lib/binding_rust/bindings.rs +++ b/lib/binding_rust/bindings.rs @@ -133,6 +133,7 @@ pub const TSQueryError_TSQueryErrorNodeType: TSQueryError = 2; pub const TSQueryError_TSQueryErrorField: TSQueryError = 3; pub const TSQueryError_TSQueryErrorCapture: TSQueryError = 4; pub const TSQueryError_TSQueryErrorStructure: TSQueryError = 5; +pub const TSQueryError_TSQueryErrorLanguage: TSQueryError = 6; pub type TSQueryError = u32; extern "C" { #[doc = " Create a new parser."] diff --git a/lib/binding_rust/lib.rs b/lib/binding_rust/lib.rs index 08dd7b11..4385014d 100644 --- a/lib/binding_rust/lib.rs +++ b/lib/binding_rust/lib.rs @@ -202,6 +202,7 @@ pub enum QueryErrorKind { Capture, Predicate, Structure, + Language, } #[derive(Debug)] @@ -629,7 +630,7 @@ impl Parser { /// If a pointer is assigned, then the parser will periodically read from /// this pointer during parsing. If it reads a non-zero value, it will halt early, /// returning `None`. See [parse](Parser::parse) for more information. - pub unsafe fn set_cancellation_flag(&self, flag: Option<&AtomicUsize>) { + pub unsafe fn set_cancellation_flag(&mut self, flag: Option<&AtomicUsize>) { if let Some(flag) = flag { ffi::ts_parser_set_cancellation_flag( self.0.as_ptr(), @@ -1231,6 +1232,19 @@ impl Query { // On failure, build an error based on the error code and offset. 
if ptr.is_null() { + if error_type == ffi::TSQueryError_TSQueryErrorLanguage { + return Err(QueryError { + row: 0, + column: 0, + offset: 0, + message: LanguageError { + version: language.version(), + } + .to_string(), + kind: QueryErrorKind::Language, + }); + } + let offset = error_offset as usize; let mut line_start = 0; let mut row = 0; @@ -1739,6 +1753,10 @@ impl QueryCursor { } impl<'a, 'tree> QueryMatch<'a, 'tree> { + pub fn id(&self) -> u32 { + self.id + } + pub fn remove(self) { unsafe { ffi::ts_query_cursor_remove_match(self.cursor, self.id) } } @@ -1803,21 +1821,36 @@ impl<'a, 'tree> QueryMatch<'a, 'tree> { .iter() .all(|predicate| match predicate { TextPredicate::CaptureEqCapture(i, j, is_positive) => { - let node1 = self.nodes_for_capture_index(*i).next().unwrap(); - let node2 = self.nodes_for_capture_index(*j).next().unwrap(); - let text1 = get_text(buffer1, text_provider.text(node1)); - let text2 = get_text(buffer2, text_provider.text(node2)); - (text1 == text2) == *is_positive + let node1 = self.nodes_for_capture_index(*i).next(); + let node2 = self.nodes_for_capture_index(*j).next(); + match (node1, node2) { + (Some(node1), Some(node2)) => { + let text1 = get_text(buffer1, text_provider.text(node1)); + let text2 = get_text(buffer2, text_provider.text(node2)); + (text1 == text2) == *is_positive + } + _ => true, + } } TextPredicate::CaptureEqString(i, s, is_positive) => { - let node = self.nodes_for_capture_index(*i).next().unwrap(); - let text = get_text(buffer1, text_provider.text(node)); - (text == s.as_bytes()) == *is_positive + let node = self.nodes_for_capture_index(*i).next(); + match node { + Some(node) => { + let text = get_text(buffer1, text_provider.text(node)); + (text == s.as_bytes()) == *is_positive + } + None => true, + } } TextPredicate::CaptureMatchString(i, r, is_positive) => { - let node = self.nodes_for_capture_index(*i).next().unwrap(); - let text = get_text(buffer1, text_provider.text(node)); - r.is_match(text) == *is_positive + let node = self.nodes_for_capture_index(*i).next(); + match node { + Some(node) => { + let text = get_text(buffer1, text_provider.text(node)); + r.is_match(text) == *is_positive + } + None => true, + } } }) } @@ -2105,21 +2138,27 @@ impl fmt::Display for LanguageError { impl fmt::Display for QueryError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "Query error at {}:{}. {}{}", - self.row + 1, - self.column + 1, - match self.kind { - QueryErrorKind::Field => "Invalid field name ", - QueryErrorKind::NodeType => "Invalid node type ", - QueryErrorKind::Capture => "Invalid capture name ", - QueryErrorKind::Predicate => "Invalid predicate: ", - QueryErrorKind::Structure => "Impossible pattern:\n", - QueryErrorKind::Syntax => "Invalid syntax:\n", - }, - self.message - ) + let msg = match self.kind { + QueryErrorKind::Field => "Invalid field name ", + QueryErrorKind::NodeType => "Invalid node type ", + QueryErrorKind::Capture => "Invalid capture name ", + QueryErrorKind::Predicate => "Invalid predicate: ", + QueryErrorKind::Structure => "Impossible pattern:\n", + QueryErrorKind::Syntax => "Invalid syntax:\n", + QueryErrorKind::Language => "", + }; + if msg.len() > 0 { + write!( + f, + "Query error at {}:{}. 
{}{}", + self.row + 1, + self.column + 1, + msg, + self.message + ) + } else { + write!(f, "{}", self.message) + } } } diff --git a/lib/binding_web/.gitignore b/lib/binding_web/.gitignore index 1a4530c9..eec0cfe6 100644 --- a/lib/binding_web/.gitignore +++ b/lib/binding_web/.gitignore @@ -3,3 +3,4 @@ package-lock.json node_modules *.tgz +LICENSE diff --git a/lib/binding_web/binding.js b/lib/binding_web/binding.js index bf0a91ce..5352cb18 100644 --- a/lib/binding_web/binding.js +++ b/lib/binding_web/binding.js @@ -17,24 +17,15 @@ var MIN_COMPATIBLE_VERSION; var TRANSFER_BUFFER; var currentParseCallback; var currentLogCallback; -var initPromise = new Promise(resolve => { - Module.onRuntimeInitialized = resolve -}).then(() => { - TRANSFER_BUFFER = C._ts_init(); - VERSION = getValue(TRANSFER_BUFFER, 'i32'); - MIN_COMPATIBLE_VERSION = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); -}); -class Parser { +class ParserImpl { static init() { - return initPromise; + TRANSFER_BUFFER = C._ts_init(); + VERSION = getValue(TRANSFER_BUFFER, 'i32'); + MIN_COMPATIBLE_VERSION = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); } - constructor() { - if (TRANSFER_BUFFER == null) { - throw new Error('You must first call Parser.init() and wait for it to resolve.'); - } - + initialize() { C._ts_parser_new_wasm(); this[0] = getValue(TRANSFER_BUFFER, 'i32'); this[1] = getValue(TRANSFER_BUFFER + SIZE_OF_INT, 'i32'); @@ -794,6 +785,7 @@ class Language { if (c.name === captureName1) node1 = c.node; if (c.name === captureName2) node2 = c.node; } + if(node1 === undefined || node2 === undefined) return true; return (node1.text === node2.text) === isPositive; }); } else { @@ -805,7 +797,7 @@ class Language { return (c.node.text === stringValue) === isPositive; }; } - return false; + return true; }); } break; @@ -828,7 +820,7 @@ class Language { for (const c of captures) { if (c.name === captureName) return regex.test(c.node.text) === isPositive; } - return false; + return true; }); break; @@ -1203,6 +1195,3 @@ function marshalEdit(edit) { setValue(address, edit.oldEndIndex, 'i32'); address += SIZE_OF_INT; setValue(address, edit.newEndIndex, 'i32'); address += SIZE_OF_INT; } - -Parser.Language = Language; -Parser.Parser = Parser; diff --git a/lib/binding_web/exports.json b/lib/binding_web/exports.json index 0313f799..e0b3f718 100644 --- a/lib/binding_web/exports.json +++ b/lib/binding_web/exports.json @@ -23,6 +23,7 @@ "_memchr", "_memcmp", "_memcpy", + "_memmove", "_strlen", "_towupper", diff --git a/lib/binding_web/package.json b/lib/binding_web/package.json index d13552e0..f140d46a 100644 --- a/lib/binding_web/package.json +++ b/lib/binding_web/package.json @@ -9,6 +9,7 @@ }, "scripts": { "test": "mocha", + "prepack": "cp ../../LICENSE .", "prepublishOnly": "node check-artifacts-fresh.js" }, "repository": { diff --git a/lib/binding_web/prefix.js b/lib/binding_web/prefix.js index 3653e99d..382035e1 100644 --- a/lib/binding_web/prefix.js +++ b/lib/binding_web/prefix.js @@ -1,9 +1,15 @@ -(function (root, factory) { - if (typeof define === 'function' && define.amd) { - define([], factory); - } else if (typeof exports === 'object') { - module.exports = factory(); - } else { - window.TreeSitter = factory(); - } -}(this, function () { +var TreeSitter = function() { + var initPromise; + class Parser { + constructor() { + this.initialize(); + } + + initialize() { + throw new Error("cannot construct a Parser before calling `init()`"); + } + + static init(moduleOptions) { + if (initPromise) return initPromise; + Module = 
Object.assign({ }, Module, moduleOptions); + return initPromise = new Promise((resolveInitPromise) => { diff --git a/lib/binding_web/suffix.js b/lib/binding_web/suffix.js index 0e9fe021..cd91f919 100644 --- a/lib/binding_web/suffix.js +++ b/lib/binding_web/suffix.js @@ -1,2 +1,23 @@ -return Parser; -})); + for (const name of Object.getOwnPropertyNames(ParserImpl.prototype)) { + Object.defineProperty(Parser.prototype, name, { + value: ParserImpl.prototype[name], + enumerable: false, + writable: false, + }) + } + + Parser.Language = Language; + Module.onRuntimeInitialized = () => { + ParserImpl.init(); + resolveInitPromise(); + }; + }); + } + } + + return Parser; +}(); + +if (typeof exports === 'object') { + module.exports = TreeSitter; +} \ No newline at end of file diff --git a/lib/binding_web/tree-sitter-web.d.ts b/lib/binding_web/tree-sitter-web.d.ts index 2127fa41..016af4ae 100644 --- a/lib/binding_web/tree-sitter-web.d.ts +++ b/lib/binding_web/tree-sitter-web.d.ts @@ -1,12 +1,19 @@ declare module 'web-tree-sitter' { class Parser { - static init(): Promise; + /** + * + * @param moduleOptions Optional emscripten module-object, see https://emscripten.org/docs/api_reference/module.html + */ + static init(moduleOptions?: object): Promise; delete(): void; parse(input: string | Parser.Input, previousTree?: Parser.Tree, options?: Parser.Options): Parser.Tree; - getLanguage(): any; - setLanguage(language: any): void; + reset(): void; + getLanguage(): Parser.Language; + setLanguage(language?: Parser.Language | undefined | null): void; getLogger(): Parser.Logger; - setLogger(logFunc: Parser.Logger): void; + setLogger(logFunc?: Parser.Logger | undefined | null): void; + setTimeoutMicros(value: number): void; + getTimeoutMicros(): number; } namespace Parser { @@ -96,8 +103,11 @@ declare module 'web-tree-sitter' { export interface TreeCursor { nodeType: string; + nodeTypeId: number; nodeText: string; + nodeId: number; nodeIsNamed: boolean; + nodeIsMissing: boolean; startPosition: Point; endPosition: Point; startIndex: number; @@ -123,7 +133,7 @@ declare module 'web-tree-sitter' { walk(): TreeCursor; getChangedRanges(other: Tree): Range[]; getEditedRange(other: Tree): Range; - getLanguage(): any; + getLanguage(): Language; } class Language { diff --git a/lib/include/tree_sitter/api.h b/lib/include/tree_sitter/api.h index f02789ee..ede1bc99 100644 --- a/lib/include/tree_sitter/api.h +++ b/lib/include/tree_sitter/api.h @@ -131,6 +131,7 @@ typedef enum { TSQueryErrorField, TSQueryErrorCapture, TSQueryErrorStructure, + TSQueryErrorLanguage, } TSQueryError; /********************/ @@ -618,7 +619,7 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *); const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); /** - * Get the field name of the tree cursor's current node. + * Get the field id of the tree cursor's current node. * * This returns zero if the current node doesn't have a field. * See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`. 
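The `TSQueryErrorLanguage` variant added to the `TSQueryError` enum in the `api.h` hunk above is reported by `ts_query_new` when the language is null or was generated for an incompatible ABI version (the corresponding guard appears in the `lib/src/query.c` hunk later in this diff). Below is a minimal sketch of checking for it at query-construction time; the `tree_sitter_json` language and the query string are placeholder choices, not part of this patch.

```c
// Sketch only: handling the new TSQueryErrorLanguage case from ts_query_new.
// tree_sitter_json and the query text are placeholders.
#include <stdio.h>
#include <string.h>
#include <tree_sitter/api.h>

const TSLanguage *tree_sitter_json(void);  // assumed to be linked in separately

int main(void) {
  const char *pattern = "(pair key: (string) @key)";
  uint32_t error_offset = 0;
  TSQueryError error_type = TSQueryErrorNone;

  TSQuery *query = ts_query_new(
    tree_sitter_json(), pattern, (uint32_t)strlen(pattern), &error_offset, &error_type
  );

  if (!query) {
    if (error_type == TSQueryErrorLanguage) {
      // Null language, or one built with an incompatible ABI version.
      fprintf(stderr, "query error: incompatible language\n");
    } else {
      fprintf(stderr, "query error %u at offset %u\n", (unsigned)error_type, error_offset);
    }
    return 1;
  }

  ts_query_delete(query);
  return 0;
}
```

This is the C-side counterpart of the new `QueryErrorKind::Language` arm in the Rust binding shown earlier in this diff.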
diff --git a/lib/src/parser.c b/lib/src/parser.c index 0f0b4ac4..bf9b7f3b 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -417,7 +417,7 @@ static Subtree ts_parser__lex( LOG( "lex_external state:%d, row:%u, column:%u", lex_mode.external_lex_state, - current_position.extent.row + 1, + current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); @@ -456,7 +456,7 @@ static Subtree ts_parser__lex( LOG( "lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, - current_position.extent.row + 1, + current_position.extent.row, current_position.extent.column ); ts_lexer_start(&self->lexer); @@ -1884,7 +1884,7 @@ TSTree *ts_parser_parse( LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", version, ts_stack_version_count(self->stack), ts_stack_state(self->stack, version), - ts_stack_position(self->stack, version).extent.row + 1, + ts_stack_position(self->stack, version).extent.row, ts_stack_position(self->stack, version).extent.column); if (!ts_parser__advance(self, version, allow_node_reuse)) return NULL; diff --git a/lib/src/query.c b/lib/src/query.c index 2e8e4b79..86a9dfea 100644 --- a/lib/src/query.c +++ b/lib/src/query.c @@ -2069,6 +2069,15 @@ TSQuery *ts_query_new( uint32_t *error_offset, TSQueryError *error_type ) { + if ( + !language || + language->version > TREE_SITTER_LANGUAGE_VERSION || + language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION + ) { + *error_type = TSQueryErrorLanguage; + return NULL; + } + TSQuery *self = ts_malloc(sizeof(TSQuery)); *self = (TSQuery) { .steps = array_new(), @@ -2552,6 +2561,7 @@ static void ts_query_cursor__add_state( pattern->step_index ); array_insert(&self->states, index, ((QueryState) { + .id = UINT32_MAX, .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, @@ -2716,7 +2726,6 @@ static inline bool ts_query_cursor__advance( if (step->depth == PATTERN_DONE_MARKER) { if (state->start_depth > self->depth || self->halted) { LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; array_push(&self->finished_states, *state); did_match = true; deleted_count++; @@ -3105,7 +3114,6 @@ static inline bool ts_query_cursor__advance( LOG(" defer finishing pattern %u\n", state->pattern_index); } else { LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; array_push(&self->finished_states, *state); array_erase(&self->states, state - self->states.contents); did_match = true; @@ -3160,6 +3168,7 @@ bool ts_query_cursor_next_match( } QueryState *state = &self->finished_states.contents[0]; + if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( @@ -3269,6 +3278,7 @@ bool ts_query_cursor_next_capture( } if (state) { + if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; const CaptureList *captures = capture_list_pool_get( diff --git a/script/build-wasm b/script/build-wasm index 201d0b50..19c7aa13 100755 --- a/script/build-wasm +++ b/script/build-wasm @@ -33,7 +33,7 @@ web_dir=lib/binding_web emscripten_flags="-O3" minify_js=1 force_docker=0 -emscripen_version=$(cat "$(dirname "$0")"/../emscripten-version) +emscripen_version=$(cat "$(dirname "$0")"/../cli/emscripten-version) while [[ $# > 0 ]]; do case "$1" in diff --git a/script/fetch-emscripten b/script/fetch-emscripten index 
157d0cae..4b579df0 100755 --- a/script/fetch-emscripten +++ b/script/fetch-emscripten @@ -2,7 +2,7 @@ set -e -EMSCRIPTEN_VERSION=$(cat "$(dirname "$0")/../emscripten-version") +EMSCRIPTEN_VERSION=$(cat "$(dirname "$0")/../cli/emscripten-version") mkdir -p target EMSDK_DIR="./target/emsdk" diff --git a/script/generate-unicode-categories-json b/script/generate-unicode-categories-json index 2dd36c3a..a106862e 100755 --- a/script/generate-unicode-categories-json +++ b/script/generate-unicode-categories-json @@ -4,10 +4,14 @@ const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json' const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json' +const CATEGORY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-category-aliases.json' +const PROPERTY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-property-aliases.json' const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt' const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt' const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt' +const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt' +const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyAliases.txt' const fs = require('fs'); const path = require('path'); @@ -16,7 +20,9 @@ const {spawnSync} = require('child_process'); // Download the unicode data files, caching them inside the 'target' directory. const categoryData = cachedDownload(CATEGORY_URL); const propertyData = cachedDownload(PROPERTY_URL); -const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL); +const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL); +const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL); +const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL); function cachedDownload(url) { let downloadPath = path.join('.', 'target', path.basename(url)) if (fs.existsSync(downloadPath)) { @@ -30,10 +36,12 @@ function cachedDownload(url) { const categories = {}; const properties = {}; +const categoryAliases = {}; +const propertyAliases = {} let data, row, lineStart, lineEnd; // Parse the properties -data = propertyData + derivedPopertyData; +data = propertyData + derivedPropertyData; row = 0; lineStart = 0; lineEnd = -1; @@ -106,7 +114,7 @@ while (lineStart < data.length) { if ( nameStart === 0 || categoryStart == 0 || - categoryEnd === 0 + categoryEnd === -1 ) { throw new Error(`Unexpected format on line ${row}`); } @@ -124,5 +132,110 @@ while (lineStart < data.length) { categories[category].push(codePoint); } +// Parse the category aliases +data = categoryAliasData; +row = 0; +lineStart = 0; +lineEnd = -1; +const IGNORE = /[#\s]/ +while (lineStart < data.length) { + row++; + lineStart = lineEnd + 1; + lineEnd = data.indexOf('\n', lineStart); + if (lineEnd === -1) break; + + // Skip over blank and comment lines + if (IGNORE.test(data[lineStart])) continue; + + // Parse the first three semicolon-separated fields: + // * property value type + // * short name + // * long name + // Other aliases may be listed in additional fields + const propertyValueTypeEnd = data.indexOf(';', lineStart); + const shortNameStart = propertyValueTypeEnd + 1; + const shortNameEnd = data.indexOf(';', shortNameStart); + const longNameStart = shortNameEnd + 1; + if ( + shortNameStart === 0 || + longNameStart === 0 + ) { + throw new Error(`Unexpected format on line ${row}`); + } + + const propertyValueType = 
data.slice(lineStart, propertyValueTypeEnd).trim(); + const shortName = data.slice(shortNameStart, shortNameEnd).trim(); + + // Filter for General_Category lines + if (propertyValueType !== 'gc') continue; + + let aliasStart = longNameStart; + let lineDone = false; + do { + let aliasEnd = data.indexOf(';', aliasStart); + if (aliasEnd === -1 || aliasEnd > lineEnd) { + aliasEnd = data.indexOf('#', aliasStart); + if (aliasEnd === -1 || aliasEnd > lineEnd) { + aliasEnd = lineEnd; + } + lineDone = true; + } + const alias = data.slice(aliasStart, aliasEnd).trim(); + console.log(alias, shortName); + categoryAliases[alias] = shortName; + aliasStart = aliasEnd + 1; + } while (!lineDone); +} + +// Parse the property aliases +data = propertyAliasData; +row = 0; +lineStart = 0; +lineEnd = -1; +while (lineStart < data.length) { + row++; + lineStart = lineEnd + 1; + lineEnd = data.indexOf('\n', lineStart); + if (lineEnd === -1) break; + + // Skip over blank and comment lines + if (IGNORE.test(data[lineStart])) continue; + + // Parse the first two semicolon fields: + // * short name + // * long name + const shortNameEnd = data.indexOf(';', lineStart); + const longNameStart = shortNameEnd + 1; + + if (longNameStart == 0) { + throw new Error(`Unexpected format on line ${row}`); + } + + let alias = data.slice(lineStart, shortNameEnd).trim(); + let longName = null; + let nameStart = longNameStart; + let lineDone = false; + do { + let nameEnd = data.indexOf(';', nameStart); + if (nameEnd === -1 || nameEnd > lineEnd) { + nameEnd = data.indexOf('#', nameStart); + if (nameEnd === -1 || nameEnd > lineEnd) { + nameEnd = lineEnd; + } + lineDone = true; + } + if (longName == null) { + longName = data.slice(nameStart, nameEnd).trim(); + } else { + alias = data.slice(nameStart, nameEnd).trim(); + } + console.log(alias, longName); + propertyAliases[alias] = longName; + nameStart = nameEnd + 1; + } while (!lineDone); +} + fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8'); fs.writeFileSync(PROPERTY_OUTPUT_PATH, JSON.stringify(properties), 'utf8'); +fs.writeFileSync(CATEGORY_ALIAS_OUTPUT_PATH, JSON.stringify(categoryAliases), 'utf8'); +fs.writeFileSync(PROPERTY_ALIAS_OUTPUT_PATH, JSON.stringify(propertyAliases), 'utf8'); diff --git a/tags/Cargo.toml b/tags/Cargo.toml index e59c53e8..f458c00b 100644 --- a/tags/Cargo.toml +++ b/tags/Cargo.toml @@ -4,7 +4,7 @@ description = "Library for extracting tag information" version = "0.20.0" authors = [ "Max Brunsfeld ", - "Patrick Thomson " + "Patrick Thomson ", ] license = "MIT" readme = "README.md" @@ -22,5 +22,5 @@ memchr = "2.3" thiserror = "1.0" [dependencies.tree-sitter] -version = ">= 0.17.0" +version = "0.20" path = "../lib" diff --git a/test/fixtures/test_grammars/unicode_classes/corpus.txt b/test/fixtures/test_grammars/unicode_classes/corpus.txt index d28d1acb..9c35be27 100644 --- a/test/fixtures/test_grammars/unicode_classes/corpus.txt +++ b/test/fixtures/test_grammars/unicode_classes/corpus.txt @@ -30,3 +30,14 @@ Math symbols (program (math_sym) (math_sym) (math_sym) (math_sym) (math_sym)) + +================================ +Letterlike numeric characters +================================ + +ᛯ Ⅵ 〩 + +--- + +(program + (letter_number) (letter_number) (letter_number)) diff --git a/test/fixtures/test_grammars/unicode_classes/grammar.json b/test/fixtures/test_grammars/unicode_classes/grammar.json index 9b040867..7a36d0c1 100644 --- a/test/fixtures/test_grammars/unicode_classes/grammar.json +++ 
b/test/fixtures/test_grammars/unicode_classes/grammar.json @@ -13,7 +13,8 @@ "members": [ {"type": "SYMBOL", "name": "lower"}, {"type": "SYMBOL", "name": "upper"}, - {"type": "SYMBOL", "name": "math_sym"} + {"type": "SYMBOL", "name": "math_sym"}, + {"type": "SYMBOL", "name": "letter_number"} ] } }, @@ -31,6 +32,11 @@ "math_sym": { "type": "PATTERN", "value": "\\p{Sm}+" + }, + + "letter_number": { + "type": "PATTERN", + "value": "\\p{Letter_Number}" } } }
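One more consuming-side sketch, prompted by the `lib/src/query.c` hunk earlier in this diff: a match's `id` is now assigned lazily, the first time `ts_query_cursor_next_match` (or `ts_query_cursor_next_capture`) hands the finished match back, and the Rust binding exposes it through the new `QueryMatch::id()`. The C loop below shows where that `id` becomes observable; the `tree_sitter_json` language and the input text are placeholders, not part of this patch.

```c
// Sketch only: iterating query matches with the C API. `match.id` is the
// value that query.c now assigns when the finished match is returned.
// tree_sitter_json and the input text are placeholders.
#include <stdio.h>
#include <string.h>
#include <tree_sitter/api.h>

const TSLanguage *tree_sitter_json(void);  // assumed to be linked in separately

int main(void) {
  const char *source = "{\"a\": 1, \"b\": 2}";
  const char *pattern = "(pair key: (string) @key)";

  TSParser *parser = ts_parser_new();
  ts_parser_set_language(parser, tree_sitter_json());
  TSTree *tree = ts_parser_parse_string(parser, NULL, source, (uint32_t)strlen(source));

  uint32_t error_offset;
  TSQueryError error_type;
  TSQuery *query = ts_query_new(
    tree_sitter_json(), pattern, (uint32_t)strlen(pattern), &error_offset, &error_type
  );
  if (!query) return 1;

  TSQueryCursor *cursor = ts_query_cursor_new();
  ts_query_cursor_exec(cursor, query, ts_tree_root_node(tree));

  TSQueryMatch match;
  while (ts_query_cursor_next_match(cursor, &match)) {
    // The match id is assigned the first time this match is handed back.
    printf("match %u (pattern %u) with %u captures\n",
           match.id, (unsigned)match.pattern_index, (unsigned)match.capture_count);
  }

  ts_query_cursor_delete(cursor);
  ts_query_delete(query);
  ts_tree_delete(tree);
  ts_parser_delete(parser);
  return 0;
}
```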