Start work on handling unicode property escapes in regexes

This commit is contained in:
Max Brunsfeld 2021-01-29 16:37:45 -08:00
parent 38444ea7f9
commit e3ba701344
7 changed files with 246 additions and 10 deletions

View file

@ -13,8 +13,9 @@ impl Error {
Error(vec![format!("Grammar error: {}", message)])
}
pub fn regex(message: &str) -> Self {
Error(vec![format!("Regex error: {}", message)])
pub fn regex(mut message: String) -> Self {
message.insert_str(0, "Regex error: ");
Error(vec![message])
}
pub fn undefined_symbol(name: &str) -> Self {

View file

@ -6,15 +6,23 @@ use crate::generate::rules::Rule;
use lazy_static::lazy_static;
use regex::Regex;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
RepetitionRange,
};
use std::collections::HashMap;
use std::i32;
lazy_static! {
static ref CURLY_BRACE_REGEX: Regex =
Regex::new(r#"(^|[^\\])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
Regex::new(r#"(^|[^\\p])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
}
const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
struct NfaBuilder {
@ -196,7 +204,7 @@ impl NfaBuilder {
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
Ast::Flags(_) => Err(Error::regex("Flags are not supported".to_string())),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
Ok(true)
@ -205,10 +213,15 @@ impl NfaBuilder {
self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
Ok(true)
}
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported".to_string())),
Ast::Class(class) => match class {
Class::Unicode(_) => {
Err(Error::regex("Unicode character classes are not supported"))
Class::Unicode(class) => {
let mut chars = self.expand_unicode_character_class(&class.kind)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Perl(class) => {
let mut chars = self.expand_perl_character_class(&class.kind);
@ -228,7 +241,7 @@ impl NfaBuilder {
Ok(true)
}
ClassSet::BinaryOp(_) => Err(Error::regex(
"Binary operators in character classes aren't supported",
"Binary operators in character classes aren't supported".to_string(),
)),
},
},
@ -355,13 +368,63 @@ impl NfaBuilder {
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
_ => Err(Error::regex(&format!(
_ => Err(Error::regex(format!(
"Unsupported character class syntax {:?}",
item
))),
}
}
fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
let mut chars = CharacterSet::empty();
let category_letter;
match class {
ClassUnicodeKind::OneLetter(le) => {
category_letter = le.to_string();
}
ClassUnicodeKind::Named(class_name) => {
if class_name.len() == 1 {
category_letter = class_name.clone();
} else {
let code_points = UNICODE_CATEGORIES
.get(class_name.as_str())
.or_else(|| UNICODE_PROPERTIES.get(class_name.as_str()))
.ok_or_else(|| {
Error::regex(format!(
"Unsupported unicode character class {}",
class_name
))
})?;
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
return Ok(chars);
}
}
ClassUnicodeKind::NamedValue { .. } => {
return Err(Error::regex(
"Key-value unicode properties are not supported".to_string(),
))
}
}
for (category, code_points) in UNICODE_CATEGORIES.iter() {
if category.starts_with(&category_letter) {
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
}
}
Ok(chars)
}
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,128 @@
#!/usr/bin/env node
// This script generates the two JSON files (one for general categories, one for
// properties) that are used by the CLI to handle unicode property escapes.
//
// The output paths and the download cache directory below are relative to the current
// working directory.
const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json'
const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json'
// Source data files, pinned to Unicode 13.0.0.
const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt'
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt'
const fs = require('fs');
const path = require('path');
const {spawnSync} = require('child_process');
// Download the unicode data files, caching them inside the 'target' directory.
// `cachedDownload` is a function declaration, so it can be called here before its
// definition appears in the file.
const categoryData = cachedDownload(CATEGORY_URL);
const propertyData = cachedDownload(PROPERTY_URL);
const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL);
// Returns the contents of the file at `url` as a string.
//
// The file is cached at ./target/<basename of url>. When that file already exists it is
// read back instead of being downloaded again. Otherwise the file is fetched with `curl`,
// and the cache is written only after the download is known to have succeeded, so that a
// failed or truncated download is never mistaken for real data on later runs.
//
// Throws if `curl` cannot be started, exits with a non-zero status, or produces no output.
function cachedDownload(url) {
  const cacheDir = path.join('.', 'target');
  const downloadPath = path.join(cacheDir, path.basename(url));
  if (fs.existsSync(downloadPath)) {
    return fs.readFileSync(downloadPath, 'utf8');
  }

  // `--fail` turns HTTP errors into a non-zero exit status instead of an error page on
  // stdout. The explicit `maxBuffer` keeps large data files from being cut off by the
  // default output limit.
  const result = spawnSync('curl', ['--fail', '--silent', '--show-error', url], {
    encoding: 'utf8',
    maxBuffer: 64 * 1024 * 1024,
  });
  if (result.error) {
    throw new Error(`Failed to run curl for ${url}: ${result.error.message}`);
  }
  if (result.status !== 0 || !result.stdout) {
    throw new Error(
      `Failed to download ${url} (curl exit status ${result.status}): ${result.stderr}`
    );
  }

  // The cache directory may not exist yet (e.g. before the first build).
  fs.mkdirSync(cacheDir, {recursive: true});
  fs.writeFileSync(downloadPath, result.stdout, 'utf8');
  return result.stdout;
}
// Maps from category / property name to the list of code points that belong to it.
const categories = {};
const properties = {};
let data, row, lineStart, lineEnd;
// Parse the properties.
// Each data line has the form `<code point>[..<code point>] ; <property> # <comment>`.
// `row` counts lines of the two concatenated files, not of each file individually.
// A final line that is not terminated by a newline is not processed.
data = propertyData + derivedPopertyData;
row = 0;
lineStart = 0;
lineEnd = -1;
const CODE_POINT = /[0-9A-Fa-f]/
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;
  // Skip over blank and comment lines
  if (!CODE_POINT.test(data[lineStart])) continue;
  // Work on the current line only, so that a delimiter missing from this line cannot be
  // satisfied by one that appears on a later line.
  const line = data.slice(lineStart, lineEnd);
  // Parse the first two semicolon fields:
  // * code point or code point range
  // * property
  const codePointEnd = line.indexOf(';');
  const propertyEnd = line.indexOf('#', codePointEnd + 1);
  if (
    codePointEnd === -1 ||
    propertyEnd === -1
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  // Process ranges (separated by '..')
  const codePoints = line.slice(0, codePointEnd).trim()
    .split('..')
    .map(p => parseInt(p, 16));
  if (codePoints.length === 1) {
    codePoints.push(codePoints[0]);
  }
  // Reject anything that is not one or two hexadecimal numbers rather than silently
  // dropping the line.
  if (codePoints.length !== 2 || codePoints.some(Number.isNaN)) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const property = line.slice(codePointEnd + 1, propertyEnd).trim();
  console.log(codePoints, property);
  if (!properties[property]) {
    properties[property] = [];
  }
  // Ranges are inclusive on both ends.
  for (let c = codePoints[0]; c <= codePoints[1]; c++) {
    properties[property].push(c);
  }
}
// Parse the categories.
// Each line represents a code point.
// NOTE(review): per the UnicodeData.txt format, some large blocks are abbreviated as a pair
// of `<..., First>` / `<..., Last>` rows. Only those two endpoints are recorded here;
// confirm whether the code points in between should be expanded as well.
data = categoryData;
row = 0;
lineStart = 0;
lineEnd = -1;
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;
  // Work on the current line only, so that a delimiter missing from this line cannot be
  // satisfied by one that appears on a later line.
  const line = data.slice(lineStart, lineEnd);
  if (line.trim() === '') continue;
  // Parse the first three semicolon-separated fields:
  // * code point (hexadecimal)
  // * name
  // * category
  const codePointEnd = line.indexOf(';');
  const nameStart = codePointEnd + 1;
  const nameEnd = line.indexOf(';', nameStart);
  const categoryStart = nameEnd + 1;
  const categoryEnd = line.indexOf(';', categoryStart);
  // `indexOf` reports a missing delimiter as -1.
  if (
    codePointEnd === -1 ||
    nameEnd === -1 ||
    categoryEnd === -1
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const codePoint = parseInt(line.slice(0, codePointEnd), 16);
  // A NaN would be serialized as `null` in the generated JSON.
  if (Number.isNaN(codePoint)) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const name = line.slice(nameStart, nameEnd);
  const category = line.slice(categoryStart, categoryEnd);
  console.log(codePoint, category, name);
  // Group the code points by their category.
  if (!categories[category]) {
    categories[category] = [];
  }
  categories[category].push(codePoint);
}
// Write both tables as JSON objects mapping a name to its array of code points.
fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8');
fs.writeFileSync(PROPERTY_OUTPUT_PATH, JSON.stringify(properties), 'utf8');

View file

@ -0,0 +1,9 @@
============================
Letters
======================
Aბㄱ
---
(expression (upper))

View file

@ -0,0 +1,33 @@
{
"name": "unicode_classes",
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "lower"},
{"type": "SYMBOL", "name": "upper"},
{"type": "SYMBOL", "name": "math_sym"}
]
},
"lower": {
"type": "PATTERN",
"value": "\\p{Ll}\\p{L}*"
},
"upper": {
"type": "PATTERN",
"value": "\\p{Lu}\\p{L}*"
},
"math_sym": {
"type": "PATTERN",
"value": "\\p{Sm}+"
}
}
}