Merge pull request #1660 from alex-pinkus/expanded-regex-support

Expand regex support to include emojis and binary ops
2022-02-24 17:14:23 -08:00 · 2022-02-24 17:14:23 -08:00 · 9866674cf8
commit 9866674cf8
parent 5eb0a3090f 8fadf18655
7 changed files with 172 additions and 22 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -553,6 +553,12 @@ dependencies = [
 "winapi-util",
 ]

+[[package]]
+name = "semver"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7"
+
 [[package]]
 name = "serde"
 version = "1.0.130"
@ -737,6 +743,7 @@ dependencies = [
 "regex",
 "regex-syntax",
 "rustc-hash",
+ "semver",
 "serde",
 "serde_json",
 "smallbitvec",
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -32,6 +32,7 @@ lazy_static = "1.2.0"
 regex = "1"
 regex-syntax = "0.6.4"
 rustc-hash = "1"
+semver = "1.0"
 serde = { version = "1.0.130", features = ["derive"] }
 smallbitvec = "2.5.1"
 tiny_http = "0.8"
--- a/cli/src/generate/mod.rs
+++ b/cli/src/generate/mod.rs
@ -20,6 +20,7 @@ use self::rules::AliasMap;
 use anyhow::{anyhow, Context, Result};
 use lazy_static::lazy_static;
 use regex::{Regex, RegexBuilder};
+use semver::Version;
 use std::fs;
 use std::io::Write;
 use std::path::{Path, PathBuf};
@ -178,10 +179,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
        .stdin
        .take()
        .expect("Failed to open stdin for node");
+    let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
+        .expect("Could not parse this package's version as semver.");
+    write!(
+        node_stdin,
+        "global.TREE_SITTER_CLI_VERSION_MAJOR = {};
+        global.TREE_SITTER_CLI_VERSION_MINOR = {};
+        global.TREE_SITTER_CLI_VERSION_PATCH = {};",
+        cli_version.major, cli_version.minor, cli_version.patch,
+    )
+    .expect("Failed to write tree-sitter version to node's stdin");
    let javascript_code = include_bytes!("./dsl.js");
    node_stdin
        .write(javascript_code)
-        .expect("Failed to write to node's stdin");
+        .expect("Failed to write grammar dsl to node's stdin");
    drop(node_stdin);
    let output = node_process
        .wait_with_output()
--- a/cli/src/generate/nfa.rs
+++ b/cli/src/generate/nfa.rs
@ -276,6 +276,20 @@ impl CharacterSet {
        }
    }

+    /// Produces a `CharacterSet` containing every character in `self` that is not present in
+    /// `other`.
+    pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
+        self.remove_intersection(&mut other);
+        self
+    }
+
+    /// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
+    /// `other`, but is not present in both sets.
+    pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
+        self.remove_intersection(&mut other);
+        self.add(&other)
+    }
+
    pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
        self.ranges.iter().flat_map(|r| r.clone())
    }
@ -817,7 +831,7 @@ mod tests {
    }

    #[test]
-    fn test_character_set_remove_intersection() {
+    fn test_character_set_intersection_difference_ops() {
        struct Row {
            left: CharacterSet,
            right: CharacterSet,
@ -942,6 +956,25 @@ mod tests {
                "row {}b: {:?} - {:?}",
                i, row.right, row.left
            );
+
+            assert_eq!(
+                row.left.clone().difference(row.right.clone()),
+                row.left_only,
+                "row {}b: {:?} -- {:?}",
+                i,
+                row.left,
+                row.right
+            );
+
+            let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
+            assert_eq!(
+                row.left.clone().symmetric_difference(row.right.clone()),
+                symm_difference,
+                "row {}b: {:?} ~~ {:?}",
+                i,
+                row.left,
+                row.right
+            )
        }
    }

--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
 use regex_syntax::ast::{
-    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
-    RepetitionRange,
+    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
+    ClassUnicodeKind, RepetitionKind, RepetitionRange,
 };
 use std::collections::HashMap;
 use std::i32;
@ -240,19 +240,14 @@ impl NfaBuilder {
                    self.push_advance(chars, next_state_id);
                    Ok(true)
                }
-                Class::Bracketed(class) => match &class.kind {
-                    ClassSet::Item(item) => {
-                        let mut chars = self.expand_character_class(&item)?;
-                        if class.negated {
-                            chars = chars.negate();
-                        }
-                        self.push_advance(chars, next_state_id);
-                        Ok(true)
+                Class::Bracketed(class) => {
+                    let mut chars = self.translate_class_set(&class.kind)?;
+                    if class.negated {
+                        chars = chars.negate();
                    }
-                    ClassSet::BinaryOp(_) => Err(anyhow!(
-                        "Regex error: Binary operators in character classes aren't supported"
-                    )),
-                },
+                    self.push_advance(chars, next_state_id);
+                    Ok(true)
+                }
            },
            Ast::Repetition(repetition) => match repetition.op.kind {
                RepetitionKind::ZeroOrOne => {
@ -319,6 +314,27 @@ impl NfaBuilder {
        }
    }

+    fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
+        match &class_set {
+            ClassSet::Item(item) => self.expand_character_class(&item),
+            ClassSet::BinaryOp(binary_op) => {
+                let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
+                let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
+                match binary_op.kind {
+                    ClassSetBinaryOpKind::Intersection => {
+                        Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
+                    }
+                    ClassSetBinaryOpKind::Difference => {
+                        Ok(lhs_char_class.difference(rhs_char_class))
+                    }
+                    ClassSetBinaryOpKind::SymmetricDifference => {
+                        Ok(lhs_char_class.symmetric_difference(rhs_char_class))
+                    }
+                }
+            }
+        }
+    }
+
    fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
        self.nfa.states.push(NfaState::Accept {
            variable_index: 0,
@ -384,6 +400,13 @@ impl NfaBuilder {
                }
                Ok(set)
            }
+            ClassSetItem::Bracketed(class) => {
+                let mut set = self.translate_class_set(&class.kind)?;
+                if class.negated {
+                    set = set.negate();
+                }
+                Ok(set)
+            }
            _ => Err(anyhow!(
                "Regex error: Unsupported character class syntax {:?}",
                item
@ -782,6 +805,79 @@ mod tests {
                    ("\u{1000b}", Some((3, "\u{1000b}"))),
                ],
            },
+            // Emojis
+            Row {
+                rules: vec![Rule::pattern(r"\p{Emoji}+")],
+                separators: vec![],
+                examples: vec![
+                    ("🐎", Some((0, "🐎"))),
+                    ("🐴🐴", Some((0, "🐴🐴"))),
+                    ("#0", Some((0, "#0"))), // These chars are technically emojis!
+                    ("⻢", None),
+                    ("♞", None),
+                    ("horse", None),
+                ],
+            },
+            // Intersection
+            Row {
+                rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("456", Some((0, "456"))),
+                    ("64", Some((0, "64"))),
+                    ("452", Some((0, "45"))),
+                    ("91", None),
+                    ("8", None),
+                    ("3", None),
+                ],
+            },
+            // Difference
+            Row {
+                rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("123", Some((0, "123"))),
+                    ("83", Some((0, "83"))),
+                    ("9", Some((0, "9"))),
+                    ("124", Some((0, "12"))),
+                    ("67", None),
+                    ("4", None),
+                ],
+            },
+            // Symmetric difference
+            Row {
+                rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("123", Some((0, "123"))),
+                    ("83", Some((0, "83"))),
+                    ("9", Some((0, "9"))),
+                    ("124", Some((0, "12"))),
+                    ("67", None),
+                    ("4", None),
+                ],
+            },
+            // Nested set operations
+            Row {
+                //               0 1 2 3 4 5 6 7 8 9
+                // [0-5]:        y y y y y y
+                // [2-4]:            y y y
+                // [0-5]--[2-4]: y y       y
+                // [3-9]:              y y y y y y y
+                // [6-7]:                    y y
+                // [3-9]--[6-7]:       y y y     y y
+                // final regex:  y y   y y       y y
+                rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
+                separators: vec![],
+                examples: vec![
+                    ("01", Some((0, "01"))),
+                    ("432", Some((0, "43"))),
+                    ("8", Some((0, "8"))),
+                    ("9", Some((0, "9"))),
+                    ("2", None),
+                    ("567", None),
+                ],
+            },
        ];

        for Row {
--- a/cli/src/generate/prepare_grammar/unicode-properties.json
+++ b/cli/src/generate/prepare_grammar/unicode-properties.json
--- a/script/generate-unicode-categories-json
+++ b/script/generate-unicode-categories-json
@ -12,6 +12,7 @@ const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
 const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
 const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
 const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
+const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'

 const fs = require('fs');
 const path = require('path');
@ -23,6 +24,7 @@ const propertyData = cachedDownload(PROPERTY_URL);
 const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
 const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
 const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
+const emojiData = cachedDownload(EMOJI_DATA_URL);
 function cachedDownload(url) {
    let downloadPath = path.join('.', 'target', path.basename(url))
    if (fs.existsSync(downloadPath)) {
@ -41,7 +43,7 @@ const propertyAliases = {}
 let data, row, lineStart, lineEnd;

 // Parse the properties
-data = propertyData + derivedPropertyData;
+data = propertyData + derivedPropertyData + emojiData;
 row = 0;
 lineStart = 0;
 lineEnd = -1;
@ -79,7 +81,7 @@ while (lineStart < data.length) {

    const property = data.slice(propertyStart, propertyEnd).trim();

-    console.log(codePoints, property);
+    console.log("Property:", codePoints, property);


    for (let c = codePoints[0]; c <= codePoints[1]; c++) {
@ -123,7 +125,7 @@ while (lineStart < data.length) {
    const name = data.slice(nameStart, nameEnd);
    const category = data.slice(categoryStart, categoryEnd);

-    console.log(codePoint, category, name);
+    console.log("Category:", codePoint, category, name);

    // Group the code points by their category.
    if (!categories[category]) {
@ -181,7 +183,7 @@ while (lineStart < data.length) {
            lineDone = true;
        }
        const alias = data.slice(aliasStart, aliasEnd).trim();
-        console.log(alias, shortName);
+        console.log("Category alias:", alias, shortName);
        categoryAliases[alias] = shortName;
        aliasStart = aliasEnd + 1;
    } while (!lineDone);
@ -229,7 +231,7 @@ while (lineStart < data.length) {
        } else {
            alias = data.slice(nameStart, nameEnd).trim();
        }
-        console.log(alias, longName);
+        console.log("Property alias:", alias, longName);
        propertyAliases[alias] = longName;
        nameStart = nameEnd + 1;
    } while (!lineDone);