Merge pull request #1660 from alex-pinkus/expanded-regex-support
Expand regex support to include emojis and binary ops
This commit is contained in:
commit
9866674cf8
7 changed files with 172 additions and 22 deletions
|
|
@ -32,6 +32,7 @@ lazy_static = "1.2.0"
|
|||
regex = "1"
|
||||
regex-syntax = "0.6.4"
|
||||
rustc-hash = "1"
|
||||
semver = "1.0"
|
||||
serde = { version = "1.0.130", features = ["derive"] }
|
||||
smallbitvec = "2.5.1"
|
||||
tiny_http = "0.8"
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ use self::rules::AliasMap;
|
|||
use anyhow::{anyhow, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use semver::Version;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
|
@ -178,10 +179,20 @@ fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
|
|||
.stdin
|
||||
.take()
|
||||
.expect("Failed to open stdin for node");
|
||||
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
|
||||
.expect("Could not parse this package's version as semver.");
|
||||
write!(
|
||||
node_stdin,
|
||||
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
|
||||
global.TREE_SITTER_CLI_VERSION_MINOR = {};
|
||||
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
|
||||
cli_version.major, cli_version.minor, cli_version.patch,
|
||||
)
|
||||
.expect("Failed to write tree-sitter version to node's stdin");
|
||||
let javascript_code = include_bytes!("./dsl.js");
|
||||
node_stdin
|
||||
.write(javascript_code)
|
||||
.expect("Failed to write to node's stdin");
|
||||
.expect("Failed to write grammar dsl to node's stdin");
|
||||
drop(node_stdin);
|
||||
let output = node_process
|
||||
.wait_with_output()
|
||||
|
|
|
|||
|
|
@ -276,6 +276,20 @@ impl CharacterSet {
|
|||
}
|
||||
}
|
||||
|
||||
/// Produces a `CharacterSet` containing every character in `self` that is not present in
|
||||
/// `other`.
|
||||
pub fn difference(mut self, mut other: CharacterSet) -> CharacterSet {
|
||||
self.remove_intersection(&mut other);
|
||||
self
|
||||
}
|
||||
|
||||
/// Produces a `CharacterSet` containing every character that is in _exactly one_ of `self` or
|
||||
/// `other`, but is not present in both sets.
|
||||
pub fn symmetric_difference(mut self, mut other: CharacterSet) -> CharacterSet {
|
||||
self.remove_intersection(&mut other);
|
||||
self.add(&other)
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
|
||||
self.ranges.iter().flat_map(|r| r.clone())
|
||||
}
|
||||
|
|
@ -817,7 +831,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_character_set_remove_intersection() {
|
||||
fn test_character_set_intersection_difference_ops() {
|
||||
struct Row {
|
||||
left: CharacterSet,
|
||||
right: CharacterSet,
|
||||
|
|
@ -942,6 +956,25 @@ mod tests {
|
|||
"row {}b: {:?} - {:?}",
|
||||
i, row.right, row.left
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
row.left.clone().difference(row.right.clone()),
|
||||
row.left_only,
|
||||
"row {}b: {:?} -- {:?}",
|
||||
i,
|
||||
row.left,
|
||||
row.right
|
||||
);
|
||||
|
||||
let symm_difference = row.left_only.clone().add(&mut row.right_only.clone());
|
||||
assert_eq!(
|
||||
row.left.clone().symmetric_difference(row.right.clone()),
|
||||
symm_difference,
|
||||
"row {}b: {:?} ~~ {:?}",
|
||||
i,
|
||||
row.left,
|
||||
row.right
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@ use anyhow::{anyhow, Context, Result};
|
|||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use regex_syntax::ast::{
|
||||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
|
||||
RepetitionRange,
|
||||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
|
||||
ClassUnicodeKind, RepetitionKind, RepetitionRange,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::i32;
|
||||
|
|
@ -240,19 +240,14 @@ impl NfaBuilder {
|
|||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Class::Bracketed(class) => match &class.kind {
|
||||
ClassSet::Item(item) => {
|
||||
let mut chars = self.expand_character_class(&item)?;
|
||||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
Class::Bracketed(class) => {
|
||||
let mut chars = self.translate_class_set(&class.kind)?;
|
||||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
ClassSet::BinaryOp(_) => Err(anyhow!(
|
||||
"Regex error: Binary operators in character classes aren't supported"
|
||||
)),
|
||||
},
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
},
|
||||
Ast::Repetition(repetition) => match repetition.op.kind {
|
||||
RepetitionKind::ZeroOrOne => {
|
||||
|
|
@ -319,6 +314,27 @@ impl NfaBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
|
||||
match &class_set {
|
||||
ClassSet::Item(item) => self.expand_character_class(&item),
|
||||
ClassSet::BinaryOp(binary_op) => {
|
||||
let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
|
||||
let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
|
||||
match binary_op.kind {
|
||||
ClassSetBinaryOpKind::Intersection => {
|
||||
Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
|
||||
}
|
||||
ClassSetBinaryOpKind::Difference => {
|
||||
Ok(lhs_char_class.difference(rhs_char_class))
|
||||
}
|
||||
ClassSetBinaryOpKind::SymmetricDifference => {
|
||||
Ok(lhs_char_class.symmetric_difference(rhs_char_class))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
|
||||
self.nfa.states.push(NfaState::Accept {
|
||||
variable_index: 0,
|
||||
|
|
@ -384,6 +400,13 @@ impl NfaBuilder {
|
|||
}
|
||||
Ok(set)
|
||||
}
|
||||
ClassSetItem::Bracketed(class) => {
|
||||
let mut set = self.translate_class_set(&class.kind)?;
|
||||
if class.negated {
|
||||
set = set.negate();
|
||||
}
|
||||
Ok(set)
|
||||
}
|
||||
_ => Err(anyhow!(
|
||||
"Regex error: Unsupported character class syntax {:?}",
|
||||
item
|
||||
|
|
@ -782,6 +805,79 @@ mod tests {
|
|||
("\u{1000b}", Some((3, "\u{1000b}"))),
|
||||
],
|
||||
},
|
||||
// Emojis
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"\p{Emoji}+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("🐎", Some((0, "🐎"))),
|
||||
("🐴🐴", Some((0, "🐴🐴"))),
|
||||
("#0", Some((0, "#0"))), // These chars are technically emojis!
|
||||
("⻢", None),
|
||||
("♞", None),
|
||||
("horse", None),
|
||||
],
|
||||
},
|
||||
// Intersection
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("456", Some((0, "456"))),
|
||||
("64", Some((0, "64"))),
|
||||
("452", Some((0, "45"))),
|
||||
("91", None),
|
||||
("8", None),
|
||||
("3", None),
|
||||
],
|
||||
},
|
||||
// Difference
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("123", Some((0, "123"))),
|
||||
("83", Some((0, "83"))),
|
||||
("9", Some((0, "9"))),
|
||||
("124", Some((0, "12"))),
|
||||
("67", None),
|
||||
("4", None),
|
||||
],
|
||||
},
|
||||
// Symmetric difference
|
||||
Row {
|
||||
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("123", Some((0, "123"))),
|
||||
("83", Some((0, "83"))),
|
||||
("9", Some((0, "9"))),
|
||||
("124", Some((0, "12"))),
|
||||
("67", None),
|
||||
("4", None),
|
||||
],
|
||||
},
|
||||
// Nested set operations
|
||||
Row {
|
||||
// 0 1 2 3 4 5 6 7 8 9
|
||||
// [0-5]: y y y y y y
|
||||
// [2-4]: y y y
|
||||
// [0-5]--[2-4]: y y y
|
||||
// [3-9]: y y y y y y y
|
||||
// [6-7]: y y
|
||||
// [3-9]--[6-7]: y y y y y
|
||||
// final regex: y y y y y y
|
||||
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
|
||||
separators: vec![],
|
||||
examples: vec![
|
||||
("01", Some((0, "01"))),
|
||||
("432", Some((0, "43"))),
|
||||
("8", Some((0, "8"))),
|
||||
("9", Some((0, "9"))),
|
||||
("2", None),
|
||||
("567", None),
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for Row {
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue