Merge pull request #1660 from alex-pinkus/expanded-regex-support

Expand regex support to include emojis and binary ops
This commit is contained in:
Max Brunsfeld 2022-02-24 17:14:23 -08:00 committed by GitHub
commit 9866674cf8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 172 additions and 22 deletions

7
Cargo.lock generated
View file

@ -553,6 +553,12 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "semver"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7"
[[package]]
name = "serde"
version = "1.0.130"
@ -737,6 +743,7 @@ dependencies = [
"regex",
"regex-syntax",
"rustc-hash",
"semver",
"serde",
"serde_json",
"smallbitvec",

View file

@ -32,6 +32,7 @@ lazy_static = "1.2.0"
regex = "1"
regex-syntax = "0.6.4"
rustc-hash = "1"
semver = "1.0"
serde = { version = "1.0.130", features = ["derive"] }
smallbitvec = "2.5.1"
tiny_http = "0.8"

View file

@ -20,6 +20,7 @@ use self::rules::AliasMap;
use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use semver::Version;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
@ -178,10 +179,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
.stdin
.take()
.expect("Failed to open stdin for node");
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
.expect("Could not parse this package's version as semver.");
write!(
node_stdin,
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
global.TREE_SITTER_CLI_VERSION_MINOR = {};
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
cli_version.major, cli_version.minor, cli_version.patch,
)
.expect("Failed to write tree-sitter version to node's stdin");
let javascript_code = include_bytes!("./dsl.js");
node_stdin
.write(javascript_code)
.expect("Failed to write to node's stdin");
.expect("Failed to write grammar dsl to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()

View file

@ -276,6 +276,20 @@ impl CharacterSet {
}
}
/// Returns the set difference `self - other`: the characters of `self` that do not also
/// occur in `other`.
///
/// Both operands are consumed; the result reuses `self`.
pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
    // Only the side effect on `self` matters here; whatever `remove_intersection`
    // returns is deliberately discarded.
    let _ = self.remove_intersection(&mut other);
    self
}
/// Returns the symmetric difference of `self` and `other`: the characters that belong to
/// _exactly one_ of the two sets, never to both.
///
/// Both operands are consumed.
pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
    // Drop the shared characters first; the value returned by `remove_intersection`
    // is not needed.
    let _ = self.remove_intersection(&mut other);
    // What is left of `other` is then merged into what is left of `self`.
    let remainder_of_other = &other;
    self.add(remainder_of_other)
}
/// Iterates over every `u32` value covered by this set, walking the stored ranges in order
/// and yielding each range's members in turn.
pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
    let ranges = self.ranges.iter();
    // Each stored range is cloned so it can be consumed as an iterator of its own.
    ranges.cloned().flatten()
}
@ -817,7 +831,7 @@ mod tests {
}
#[test]
fn test_character_set_remove_intersection() {
fn test_character_set_intersection_difference_ops() {
struct Row {
left: CharacterSet,
right: CharacterSet,
@ -942,6 +956,25 @@ mod tests {
"row {}b: {:?} - {:?}",
i, row.right, row.left
);
assert_eq!(
row.left.clone().difference(row.right.clone()),
row.left_only,
"row {}b: {:?} -- {:?}",
i,
row.left,
row.right
);
let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
assert_eq!(
row.left.clone().symmetric_difference(row.right.clone()),
symm_difference,
"row {}b: {:?} ~~ {:?}",
i,
row.left,
row.right
)
}
}

View file

@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
RepetitionRange,
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
ClassUnicodeKind, RepetitionKind, RepetitionRange,
};
use std::collections::HashMap;
use std::i32;
@ -240,19 +240,14 @@ impl NfaBuilder {
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
Class::Bracketed(class) => {
let mut chars = self.translate_class_set(&class.kind)?;
if class.negated {
chars = chars.negate();
}
ClassSet::BinaryOp(_) => Err(anyhow!(
"Regex error: Binary operators in character classes aren't supported"
)),
},
self.push_advance(chars, next_state_id);
Ok(true)
}
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
@ -319,6 +314,27 @@ impl NfaBuilder {
}
}
/// Converts a `ClassSet` from the regex AST into the `CharacterSet` it describes.
///
/// A plain item is delegated to `expand_character_class`. A binary operation is resolved by
/// recursively translating both operands and combining the two resulting sets according to
/// the operator kind (`&&`, `--` or `~~`).
///
/// Any error raised while translating an operand or item is propagated to the caller.
fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
    // Simple items need no combining; handle them up front.
    let operation = match class_set {
        ClassSet::Item(item) => return self.expand_character_class(item),
        ClassSet::BinaryOp(operation) => operation,
    };

    // Operands are translated left to right, so a failure on the left is reported first.
    let mut left = self.translate_class_set(&operation.lhs)?;
    let mut right = self.translate_class_set(&operation.rhs)?;

    let combined = match operation.kind {
        // The value returned by `remove_intersection` is used as the intersection.
        ClassSetBinaryOpKind::Intersection => left.remove_intersection(&mut right),
        ClassSetBinaryOpKind::Difference => left.difference(right),
        ClassSetBinaryOpKind::SymmetricDifference => left.symmetric_difference(right),
    };
    Ok(combined)
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
@ -384,6 +400,13 @@ impl NfaBuilder {
}
Ok(set)
}
ClassSetItem::Bracketed(class) => {
let mut set = self.translate_class_set(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
_ => Err(anyhow!(
"Regex error: Unsupported character class syntax {:?}",
item
@ -782,6 +805,79 @@ mod tests {
("\u{1000b}", Some((3, "\u{1000b}"))),
],
},
// Emojis
Row {
rules: vec![Rule::pattern(r"\p{Emoji}+")],
separators: vec![],
examples: vec![
("🐎", Some((0, "🐎"))),
("🐴🐴", Some((0, "🐴🐴"))),
("#0", Some((0, "#0"))), // These chars are technically emojis!
("", None),
("", None),
("horse", None),
],
},
// Intersection
Row {
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
separators: vec![],
examples: vec![
("456", Some((0, "456"))),
("64", Some((0, "64"))),
("452", Some((0, "45"))),
("91", None),
("8", None),
("3", None),
],
},
// Difference
Row {
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Symmetric difference
Row {
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Nested set operations
Row {
// 0 1 2 3 4 5 6 7 8 9
// [0-5]: y y y y y y
// [2-4]: y y y
// [0-5]--[2-4]: y y y
// [3-9]: y y y y y y y
// [6-7]: y y
// [3-9]--[6-7]: y y y y y
// final regex: y y y y y y
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
separators: vec![],
examples: vec![
("01", Some((0, "01"))),
("432", Some((0, "43"))),
("8", Some((0, "8"))),
("9", Some((0, "9"))),
("2", None),
("567", None),
],
},
];
for Row {

File diff suppressed because one or more lines are too long

View file

@ -12,6 +12,7 @@ const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'
const fs = require('fs');
const path = require('path');
@ -23,6 +24,7 @@ const propertyData = cachedDownload(PROPERTY_URL);
const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
const emojiData = cachedDownload(EMOJI_DATA_URL);
function cachedDownload(url) {
let downloadPath = path.join('.', 'target', path.basename(url))
if (fs.existsSync(downloadPath)) {
@ -41,7 +43,7 @@ const propertyAliases = {}
let data, row, lineStart, lineEnd;
// Parse the properties
data = propertyData + derivedPropertyData;
data = propertyData + derivedPropertyData + emojiData;
row = 0;
lineStart = 0;
lineEnd = -1;
@ -79,7 +81,7 @@ while (lineStart < data.length) {
const property = data.slice(propertyStart, propertyEnd).trim();
console.log(codePoints, property);
console.log("Property:", codePoints, property);
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
@ -123,7 +125,7 @@ while (lineStart < data.length) {
const name = data.slice(nameStart, nameEnd);
const category = data.slice(categoryStart, categoryEnd);
console.log(codePoint, category, name);
console.log("Category:", codePoint, category, name);
// Group the code points by their category.
if (!categories[category]) {
@ -181,7 +183,7 @@ while (lineStart < data.length) {
lineDone = true;
}
const alias = data.slice(aliasStart, aliasEnd).trim();
console.log(alias, shortName);
console.log("Category alias:", alias, shortName);
categoryAliases[alias] = shortName;
aliasStart = aliasEnd + 1;
} while (!lineDone);
@ -229,7 +231,7 @@ while (lineStart < data.length) {
} else {
alias = data.slice(nameStart, nameEnd).trim();
}
console.log(alias, longName);
console.log("Property alias:", alias, longName);
propertyAliases[alias] = longName;
nameStart = nameEnd + 1;
} while (!lineDone);