Expand regex support to include emojis and binary ops

The `Emoji` property alias is already present, but the actual property is not available since it lives in a new file. This adds that file to the `generate-unicode-categories-json` script. The `emoji-data` file follows the same format as the ones we already consume in `generate-unicode-categories-json`, so adding emoji support is fairly easy. Without this, grammars would need to hard-code a set of Unicode ranges in their own regex. The JavaScript library `emoji-regex` cannot be used because of #451. For unclear reasons, the characters #, *, and 0-9 are marked as `Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes to use emojis is likely to want to exclude those characters. For that reason, this change also adds support for binary operations in regexes, e.g. `[\p{Emoji}&&[^#*0-9]]`. Lastly (and perhaps controversially), this change introduces new variables available at grammar compile time, for the major, minor, and patch versions of the tree-sitter CLI used to compile the grammar. This will allow grammars to conditionally adopt these new regex features while remaining backward compatible with older versions of the CLI. Without this part of the change, grammar authors who do not precompile and check in their `grammar.json` would need to wait for downstream systems to adopt a newer tree-sitter CLI version before they could begin to use these features.
2022-02-14 21:46:12 -08:00 · 2022-02-14 21:46:12 -08:00 · 8fadf18655
commit 8fadf18655
parent 2346570901
7 changed files with 172 additions and 22 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -553,6 +553,12 @@ dependencies = [
 "winapi-util",
 ]

+[[package]]
+name = "semver"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7"
+
 [[package]]
 name = "serde"
 version = "1.0.130"
@ -737,6 +743,7 @@ dependencies = [
 "regex",
 "regex-syntax",
 "rustc-hash",
+ "semver",
 "serde",
 "serde_json",
 "smallbitvec",
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -32,6 +32,7 @@ lazy_static = "1.2.0"
 regex = "1"
 regex-syntax = "0.6.4"
 rustc-hash = "1"
+semver = "1.0"
 serde = { version = "1.0.130", features = ["derive"] }
 smallbitvec = "2.5.1"
 tiny_http = "0.8"
--- a/cli/src/generate/mod.rs
+++ b/cli/src/generate/mod.rs
@ -20,6 +20,7 @@ use self::rules::AliasMap;
 use anyhow::{anyhow, Context, Result};
 use lazy_static::lazy_static;
 use regex::{Regex, RegexBuilder};
+use semver::Version;
 use std::fs;
 use std::io::Write;
 use std::path::{Path, PathBuf};
@ -181,10 +182,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
        .stdin
        .take()
        .expect("Failed to open stdin for node");
+    let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
+        .expect("Could not parse this package's version as semver.");
+    write!(
+        node_stdin,
+        "global.TREE_SITTER_CLI_VERSION_MAJOR = {};
+        global.TREE_SITTER_CLI_VERSION_MINOR = {};
+        global.TREE_SITTER_CLI_VERSION_PATCH = {};",
+        cli_version.major, cli_version.minor, cli_version.patch,
+    )
+    .expect("Failed to write tree-sitter version to node's stdin");
    let javascript_code = include_bytes!("./dsl.js");
    node_stdin
        .write(javascript_code)
-        .expect("Failed to write to node's stdin");
+        .expect("Failed to write grammar dsl to node's stdin");
    drop(node_stdin);
    let output = node_process
        .wait_with_output()
--- a/cli/src/generate/nfa.rs
+++ b/cli/src/generate/nfa.rs
@ -276,6 +276,20 @@ impl CharacterSet {
        }
    }

+    /// Produces a `CharacterSet` containing every character in `self` that is not present in
+    /// `other`.
+    pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
+        self.remove_intersection(&mut other);
+        self
+    }
+
+    /// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
+    /// `other`, but is not present in both sets.
+    pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
+        self.remove_intersection(&mut other);
+        self.add(&other)
+    }
+
    pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
        self.ranges.iter().flat_map(|r| r.clone())
    }
@ -817,7 +831,7 @@ mod tests {
    }

    #[test]
-    fn test_character_set_remove_intersection() {
+    fn test_character_set_intersection_difference_ops() {
        struct Row {
            left: CharacterSet,
            right: CharacterSet,
@ -942,6 +956,25 @@ mod tests {
                "row {}b: {:?} - {:?}",
                i, row.right, row.left
            );
+
+            assert_eq!(
+                row.left.clone().difference(row.right.clone()),
+                row.left_only,
+                "row {}b: {:?} -- {:?}",
+                i,
+                row.left,
+                row.right
+            );
+
+            let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
+            assert_eq!(
+                row.left.clone().symmetric_difference(row.right.clone()),
+                symm_difference,
+                "row {}b: {:?} ~~ {:?}",
+                i,
+                row.left,
+                row.right
+            )
        }
    }

--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
 use regex_syntax::ast::{
-    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
-    RepetitionRange,
+    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
+    ClassUnicodeKind, RepetitionKind, RepetitionRange,
 };
 use std::collections::HashMap;
 use std::i32;
@ -240,19 +240,14 @@ impl NfaBuilder {
                    self.push_advance(chars, next_state_id);
                    Ok(true)
                }
-                Class::Bracketed(class) => match &class.kind {
-                    ClassSet::Item(item) => {
-                        let mut chars = self.expand_character_class(&item)?;
-                        if class.negated {
-                            chars = chars.negate();
-                        }
-                        self.push_advance(chars, next_state_id);
-                        Ok(true)
+                Class::Bracketed(class) => {
+                    let mut chars = self.translate_class_set(&class.kind)?;
+                    if class.negated {
+                        chars = chars.negate();
                    }
-                    ClassSet::BinaryOp(_) => Err(anyhow!(
-                        "Regex error: Binary operators in character classes aren't supported"
-                    )),
-                },
+                    self.push_advance(chars, next_state_id);
+                    Ok(true)
+                }
            },
            Ast::Repetition(repetition) => match repetition.op.kind {
                RepetitionKind::ZeroOrOne => {
@ -319,6 +314,27 @@ impl NfaBuilder {
        }
    }

+    fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
+        match &class_set {
+            ClassSet::Item(item) => self.expand_character_class(&item),
+            ClassSet::BinaryOp(binary_op) => {
+                let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
+                let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
+                match binary_op.kind {
+                    ClassSetBinaryOpKind::Intersection => {
+                        Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
+                    }
+                    ClassSetBinaryOpKind::Difference => {
+                        Ok(lhs_char_class.difference(rhs_char_class))
+                    }
+                    ClassSetBinaryOpKind::SymmetricDifference => {
+                        Ok(lhs_char_class.symmetric_difference(rhs_char_class))
+                    }
+                }
+            }
+        }
+    }
+
    fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
        self.nfa.states.push(NfaState::Accept {
            variable_index: 0,
@ -384,6 +400,13 @@ impl NfaBuilder {
                }
                Ok(set)
            }
+            ClassSetItem::Bracketed(class) => {
+                let mut set = self.translate_class_set(&class.kind)?;
+                if class.negated {
+                    set = set.negate();
+                }
+                Ok(set)
+            }
            _ => Err(anyhow!(
                "Regex error: Unsupported character class syntax {:?}",
                item
@ -782,6 +805,79 @@ mod tests {
                    ("\u{1000b}", Some((3, "\u{1000b}"))),
                ],
            },
+            // Emojis
+            Row {
+                rules: vec![Rule::pattern(r"\p{Emoji}+")],
+                separators: vec![],
+                examples: vec![
+                    ("🐎", Some((0, "🐎"))),
+                    ("🐴🐴", Some((0, "🐴🐴"))),
+                    ("#0", Some((0, "#0"))), // These chars are technically emojis!
+                    ("⻢", None),
+                    ("♞", None),
+                    ("horse", None),
+                ],
+            },
+            // Intersection
+            Row {
+                rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("456", Some((0, "456"))),
+                    ("64", Some((0, "64"))),
+                    ("452", Some((0, "45"))),
+                    ("91", None),
+                    ("8", None),
+                    ("3", None),
+                ],
+            },
+            // Difference
+            Row {
+                rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("123", Some((0, "123"))),
+                    ("83", Some((0, "83"))),
+                    ("9", Some((0, "9"))),
+                    ("124", Some((0, "12"))),
+                    ("67", None),
+                    ("4", None),
+                ],
+            },
+            // Symmetric difference
+            Row {
+                rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("123", Some((0, "123"))),
+                    ("83", Some((0, "83"))),
+                    ("9", Some((0, "9"))),
+                    ("124", Some((0, "12"))),
+                    ("67", None),
+                    ("4", None),
+                ],
+            },
+            // Nested set operations
+            Row {
+                //               0 1 2 3 4 5 6 7 8 9
+                // [0-5]:        y y y y y y
+                // [2-4]:            y y y
+                // [0-5]--[2-4]: y y       y
+                // [3-9]:              y y y y y y y
+                // [6-7]:                    y y
+                // [3-9]--[6-7]:       y y y     y y
+                // final regex:  y y   y y       y y
+                rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("01", Some((0, "01"))),
+                    ("432", Some((0, "43"))),
+                    ("8", Some((0, "8"))),
+                    ("9", Some((0, "9"))),
+                    ("2", None),
+                    ("567", None),
+                ],
+            },
        ];

        for Row {
--- a/cli/src/generate/prepare_grammar/unicode-properties.json
+++ b/cli/src/generate/prepare_grammar/unicode-properties.json
--- a/script/generate-unicode-categories-json
+++ b/script/generate-unicode-categories-json
@ -12,6 +12,7 @@ const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
 const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
 const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
 const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
+const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'

 const fs = require('fs');
 const path = require('path');
@ -23,6 +24,7 @@ const propertyData = cachedDownload(PROPERTY_URL);
 const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
 const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
 const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
+const emojiData = cachedDownload(EMOJI_DATA_URL);
 function cachedDownload(url) {
    let downloadPath = path.join('.', 'target', path.basename(url))
    if (fs.existsSync(downloadPath)) {
@ -41,7 +43,7 @@ const propertyAliases = {}
 let data, row, lineStart, lineEnd;

 // Parse the properties
-data = propertyData + derivedPropertyData;
+data = propertyData + derivedPropertyData + emojiData;
 row = 0;
 lineStart = 0;
 lineEnd = -1;
@ -79,7 +81,7 @@ while (lineStart < data.length) {

    const property = data.slice(propertyStart, propertyEnd).trim();

-    console.log(codePoints, property);
+    console.log("Property:", codePoints, property);


    for (let c = codePoints[0]; c <= codePoints[1]; c++) {
@ -123,7 +125,7 @@ while (lineStart < data.length) {
    const name = data.slice(nameStart, nameEnd);
    const category = data.slice(categoryStart, categoryEnd);

-    console.log(codePoint, category, name);
+    console.log("Category:", codePoint, category, name);

    // Group the code points by their category.
    if (!categories[category]) {
@ -181,7 +183,7 @@ while (lineStart < data.length) {
            lineDone = true;
        }
        const alias = data.slice(aliasStart, aliasEnd).trim();
-        console.log(alias, shortName);
+        console.log("Category alias:", alias, shortName);
        categoryAliases[alias] = shortName;
        aliasStart = aliasEnd + 1;
    } while (!lineDone);
@ -229,7 +231,7 @@ while (lineStart < data.length) {
        } else {
            alias = data.slice(nameStart, nameEnd).trim();
        }
-        console.log(alias, longName);
+        console.log("Property alias:", alias, longName);
        propertyAliases[alias] = longName;
        nameStart = nameEnd + 1;
    } while (!lineDone);