Merge pull request #1660 from alex-pinkus/expanded-regex-support

Expand regex support to include emojis and binary ops
This commit is contained in:
Max Brunsfeld 2022-02-24 17:14:23 -08:00 committed by GitHub
commit 9866674cf8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 172 additions and 22 deletions

View file

@ -32,6 +32,7 @@ lazy_static = "1.2.0"
regex = "1"
regex-syntax = "0.6.4"
rustc-hash = "1"
semver = "1.0"
serde = { version = "1.0.130", features = ["derive"] }
smallbitvec = "2.5.1"
tiny_http = "0.8"

View file

@ -20,6 +20,7 @@ use self::rules::AliasMap;
use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use semver::Version;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
@ -178,10 +179,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
.stdin
.take()
.expect("Failed to open stdin for node");
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
.expect("Could not parse this package's version as semver.");
write!(
node_stdin,
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
global.TREE_SITTER_CLI_VERSION_MINOR = {};
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
cli_version.major, cli_version.minor, cli_version.patch,
)
.expect("Failed to write tree-sitter version to node's stdin");
let javascript_code = include_bytes!("./dsl.js");
node_stdin
.write(javascript_code)
.expect("Failed to write to node's stdin");
.expect("Failed to write grammar dsl to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()

View file

@ -276,6 +276,20 @@ impl CharacterSet {
}
}
/// Returns the characters of `self` that do not also appear in `other`.
///
/// Both operands are taken by value; `other` is only used as scratch space and is dropped
/// when the call returns.
pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
    // The value handed back by `remove_intersection` is not needed here: only what is
    // left of `self` afterwards is returned.
    let _ = self.remove_intersection(&mut other);
    self
}
/// Returns the characters that belong to `self` or to `other`, but not to both of them.
///
/// Both operands are taken by value.
pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
    // First let `remove_intersection` process the pair, then merge whatever remains of
    // `other` into `self`.
    let _ = self.remove_intersection(&mut other);
    let remainder_of_other = &other;
    self.add(remainder_of_other)
}
/// Iterates over every `u32` value covered by the stored ranges, visiting the ranges in
/// the order they are stored.
pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
    // Each stored range is cloned so it can be walked without mutating `self`.
    self.ranges.iter().cloned().flatten()
}
@ -817,7 +831,7 @@ mod tests {
}
#[test]
fn test_character_set_remove_intersection() {
fn test_character_set_intersection_difference_ops() {
struct Row {
left: CharacterSet,
right: CharacterSet,
@ -942,6 +956,25 @@ mod tests {
"row {}b: {:?} - {:?}",
i, row.right, row.left
);
assert_eq!(
row.left.clone().difference(row.right.clone()),
row.left_only,
"row {}b: {:?} -- {:?}",
i,
row.left,
row.right
);
let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
assert_eq!(
row.left.clone().symmetric_difference(row.right.clone()),
symm_difference,
"row {}b: {:?} ~~ {:?}",
i,
row.left,
row.right
)
}
}

View file

@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
RepetitionRange,
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
ClassUnicodeKind, RepetitionKind, RepetitionRange,
};
use std::collections::HashMap;
use std::i32;
@ -240,19 +240,14 @@ impl NfaBuilder {
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => match &class.kind {
ClassSet::Item(item) => {
let mut chars = self.expand_character_class(&item)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
Class::Bracketed(class) => {
let mut chars = self.translate_class_set(&class.kind)?;
if class.negated {
chars = chars.negate();
}
ClassSet::BinaryOp(_) => Err(anyhow!(
"Regex error: Binary operators in character classes aren't supported"
)),
},
self.push_advance(chars, next_state_id);
Ok(true)
}
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
@ -319,6 +314,27 @@ impl NfaBuilder {
}
}
/// Converts a regex `ClassSet` into the `CharacterSet` it denotes.
///
/// A plain item is delegated to `expand_character_class`. A binary operation has both of
/// its operands translated recursively (left first, then right) and is then combined
/// according to the operator kind. Any error from translating an operand is propagated.
fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
    // Handle the simple case up front so the rest of the body deals only with operators.
    let binary_op = match class_set {
        ClassSet::Item(item) => return self.expand_character_class(item),
        ClassSet::BinaryOp(binary_op) => binary_op,
    };

    let mut lhs = self.translate_class_set(&binary_op.lhs)?;
    let mut rhs = self.translate_class_set(&binary_op.rhs)?;

    let combined = match binary_op.kind {
        // The value returned by `remove_intersection` is used as the intersection.
        ClassSetBinaryOpKind::Intersection => lhs.remove_intersection(&mut rhs),
        ClassSetBinaryOpKind::Difference => lhs.difference(rhs),
        ClassSetBinaryOpKind::SymmetricDifference => lhs.symmetric_difference(rhs),
    };
    Ok(combined)
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
@ -384,6 +400,13 @@ impl NfaBuilder {
}
Ok(set)
}
ClassSetItem::Bracketed(class) => {
let mut set = self.translate_class_set(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
_ => Err(anyhow!(
"Regex error: Unsupported character class syntax {:?}",
item
@ -782,6 +805,79 @@ mod tests {
("\u{1000b}", Some((3, "\u{1000b}"))),
],
},
// Emojis
Row {
rules: vec![Rule::pattern(r"\p{Emoji}+")],
separators: vec![],
examples: vec![
("🐎", Some((0, "🐎"))),
("🐴🐴", Some((0, "🐴🐴"))),
("#0", Some((0, "#0"))), // These chars are technically emojis!
("", None),
("", None),
("horse", None),
],
},
// Intersection
Row {
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
separators: vec![],
examples: vec![
("456", Some((0, "456"))),
("64", Some((0, "64"))),
("452", Some((0, "45"))),
("91", None),
("8", None),
("3", None),
],
},
// Difference
Row {
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Symmetric difference
Row {
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Nested set operations
Row {
// 0 1 2 3 4 5 6 7 8 9
// [0-5]: y y y y y y
// [2-4]: y y y
// [0-5]--[2-4]: y y y
// [3-9]: y y y y y y y
// [6-7]: y y
// [3-9]--[6-7]: y y y y y
// final regex: y y y y y y
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
separators: vec![],
examples: vec![
("01", Some((0, "01"))),
("432", Some((0, "43"))),
("8", Some((0, "8"))),
("9", Some((0, "9"))),
("2", None),
("567", None),
],
},
];
for Row {

File diff suppressed because one or more lines are too long