Handle aliases in unicode property escapes in regexes
This commit is contained in:
parent
82d4da553e
commit
e030434ca7
6 changed files with 150 additions and 8 deletions
|
|
@ -19,10 +19,16 @@ lazy_static! {
|
|||
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
|
||||
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
|
||||
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
|
||||
static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> =
|
||||
serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap();
|
||||
static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> =
|
||||
serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap();
|
||||
}
|
||||
|
||||
const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
|
||||
const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
|
||||
const UNICODE_CATEGORY_ALIASES_JSON: &'static str = include_str!("./unicode-category-aliases.json");
|
||||
const UNICODE_PROPERTY_ALIASES_JSON: &'static str = include_str!("./unicode-property-aliases.json");
|
||||
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
|
||||
|
||||
struct NfaBuilder {
|
||||
|
|
@ -394,12 +400,16 @@ impl NfaBuilder {
|
|||
category_letter = le.to_string();
|
||||
}
|
||||
ClassUnicodeKind::Named(class_name) => {
|
||||
if class_name.len() == 1 {
|
||||
category_letter = class_name.clone();
|
||||
let actual_class_name = UNICODE_CATEGORY_ALIASES
|
||||
.get(class_name.as_str())
|
||||
.or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str()))
|
||||
.unwrap_or(class_name);
|
||||
if actual_class_name.len() == 1 {
|
||||
category_letter = actual_class_name.clone();
|
||||
} else {
|
||||
let code_points = UNICODE_CATEGORIES
|
||||
.get(class_name.as_str())
|
||||
.or_else(|| UNICODE_PROPERTIES.get(class_name.as_str()))
|
||||
.get(actual_class_name.as_str())
|
||||
.or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str()))
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Regex error: Unsupported unicode character class {}",
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"}
|
||||
|
|
@ -0,0 +1 @@
|
|||
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion",
"CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"}
|
||||
|
|
@ -4,10 +4,14 @@
|
|||
|
||||
// Directory that receives every generated JSON table.
const OUTPUT_DIR = './cli/src/generate/prepare_grammar'

const CATEGORY_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-categories.json`
const PROPERTY_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-properties.json`
const CATEGORY_ALIAS_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-category-aliases.json`
const PROPERTY_ALIAS_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-property-aliases.json`

// All input files come from one Unicode Character Database release, so the
// version appears in exactly one place and the files cannot drift apart.
const UCD_BASE_URL = 'https://unicode.org/Public/13.0.0/ucd'

const CATEGORY_URL = `${UCD_BASE_URL}/UnicodeData.txt`
const PROPERTY_URL = `${UCD_BASE_URL}/PropList.txt`
const DERIVED_PROPERTY_URL = `${UCD_BASE_URL}/DerivedCoreProperties.txt`
const CATEGORY_ALIAS_URL = `${UCD_BASE_URL}/PropertyValueAliases.txt`
const PROPERTY_ALIAS_URL = `${UCD_BASE_URL}/PropertyAliases.txt`
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
|
@ -16,7 +20,9 @@ const {spawnSync} = require('child_process');
|
|||
// Download the unicode data files, caching them inside the 'target' directory.
|
||||
const categoryData = cachedDownload(CATEGORY_URL);
|
||||
const propertyData = cachedDownload(PROPERTY_URL);
|
||||
const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL);
|
||||
const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
|
||||
const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
|
||||
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
|
||||
function cachedDownload(url) {
|
||||
let downloadPath = path.join('.', 'target', path.basename(url))
|
||||
if (fs.existsSync(downloadPath)) {
|
||||
|
|
@ -30,10 +36,12 @@ function cachedDownload(url) {
|
|||
|
||||
const categories = {};
|
||||
const properties = {};
|
||||
const categoryAliases = {};
|
||||
const propertyAliases = {}
|
||||
let data, row, lineStart, lineEnd;
|
||||
|
||||
// Parse the properties
|
||||
data = propertyData + derivedPopertyData;
|
||||
data = propertyData + derivedPropertyData;
|
||||
row = 0;
|
||||
lineStart = 0;
|
||||
lineEnd = -1;
|
||||
|
|
@ -106,7 +114,7 @@ while (lineStart < data.length) {
|
|||
if (
|
||||
nameStart === 0 ||
|
||||
categoryStart == 0 ||
|
||||
categoryEnd === 0
|
||||
categoryEnd === -1
|
||||
) {
|
||||
throw new Error(`Unexpected format on line ${row}`);
|
||||
}
|
||||
|
|
@ -124,5 +132,110 @@ while (lineStart < data.length) {
|
|||
categories[category].push(codePoint);
|
||||
}
|
||||
|
||||
// Parse the category aliases.
//
// Fills `categoryAliases` so that the long name and every additional alias of
// a General_Category value map to that value's short name.
data = categoryAliasData;
row = 0;
lineStart = 0;
lineEnd = -1;
// Matches the first character of a line that is a comment ('#') or blank.
const IGNORE = /[#\s]/;
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  // NOTE: a final line without a trailing newline is not processed.
  if (lineEnd === -1) break;

  // Skip over blank and comment lines
  if (IGNORE.test(data[lineStart])) continue;

  // Parse the first three semicolon-separated fields:
  // * property value type
  // * short name
  // * long name
  // Other aliases may be listed in additional fields
  const propertyValueTypeEnd = data.indexOf(';', lineStart);
  const shortNameStart = propertyValueTypeEnd + 1;
  const shortNameEnd = data.indexOf(';', shortNameStart);
  const longNameStart = shortNameEnd + 1;

  // Both separators must exist on *this* line. `indexOf` searches to the end
  // of the data, so a hit past `lineEnd` belongs to a later line.
  if (
    propertyValueTypeEnd === -1 ||
    shortNameEnd === -1 ||
    shortNameEnd > lineEnd
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  const propertyValueType = data.slice(lineStart, propertyValueTypeEnd).trim();
  const shortName = data.slice(shortNameStart, shortNameEnd).trim();

  // Filter for General_Category lines
  if (propertyValueType !== 'gc') continue;

  // Walk the remaining fields (long name first, then any extra aliases).
  // The last field ends at a trailing '#' comment or at the end of the line.
  let aliasStart = longNameStart;
  let lineDone = false;
  do {
    let aliasEnd = data.indexOf(';', aliasStart);
    if (aliasEnd === -1 || aliasEnd > lineEnd) {
      aliasEnd = data.indexOf('#', aliasStart);
      if (aliasEnd === -1 || aliasEnd > lineEnd) {
        aliasEnd = lineEnd;
      }
      lineDone = true;
    }
    const alias = data.slice(aliasStart, aliasEnd).trim();
    categoryAliases[alias] = shortName;
    aliasStart = aliasEnd + 1;
  } while (!lineDone);
}
|
||||
|
||||
// Parse the property aliases.
//
// Fills `propertyAliases` so that the short name and every additional alias
// of a property map to that property's long name.
data = propertyAliasData;
row = 0;
lineStart = 0;
lineEnd = -1;
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  // NOTE: a final line without a trailing newline is not processed.
  if (lineEnd === -1) break;

  // Skip over blank and comment lines
  if (IGNORE.test(data[lineStart])) continue;

  // Parse the first two semicolon-separated fields:
  // * short name
  // * long name
  // Other aliases may be listed in additional fields
  const shortNameEnd = data.indexOf(';', lineStart);
  const longNameStart = shortNameEnd + 1;

  // The separator must exist on *this* line. `indexOf` searches to the end of
  // the data, so a hit past `lineEnd` belongs to a later line.
  if (shortNameEnd === -1 || shortNameEnd > lineEnd) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  let alias = data.slice(lineStart, shortNameEnd).trim();
  let longName = null;
  let nameStart = longNameStart;
  let lineDone = false;
  do {
    let nameEnd = data.indexOf(';', nameStart);
    if (nameEnd === -1 || nameEnd > lineEnd) {
      nameEnd = data.indexOf('#', nameStart);
      if (nameEnd === -1 || nameEnd > lineEnd) {
        nameEnd = lineEnd;
      }
      lineDone = true;
    }
    if (longName === null) {
      // First field after the short name: the long name. It is recorded
      // under the short name read above.
      longName = data.slice(nameStart, nameEnd).trim();
    } else {
      // Any further field is one more alias for the same long name.
      alias = data.slice(nameStart, nameEnd).trim();
    }
    propertyAliases[alias] = longName;
    nameStart = nameEnd + 1;
  } while (!lineDone);
}
|
||||
|
||||
// Serialize each generated table to its JSON output file, in a fixed order.
const generatedTables = [
  [CATEGORY_OUTPUT_PATH, categories],
  [PROPERTY_OUTPUT_PATH, properties],
  [CATEGORY_ALIAS_OUTPUT_PATH, categoryAliases],
  [PROPERTY_ALIAS_OUTPUT_PATH, propertyAliases],
];
for (const [outputPath, table] of generatedTables) {
  fs.writeFileSync(outputPath, JSON.stringify(table), 'utf8');
}
|
||||
|
|
|
|||
|
|
@ -30,3 +30,14 @@ Math symbols
|
|||
|
||||
(program
|
||||
(math_sym) (math_sym) (math_sym) (math_sym) (math_sym))
|
||||
|
||||
================================
|
||||
Letterlike numeric characters
|
||||
================================
|
||||
|
||||
ᛯ Ⅵ 〩
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(letter_number) (letter_number) (letter_number))
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@
|
|||
"members": [
|
||||
{"type": "SYMBOL", "name": "lower"},
|
||||
{"type": "SYMBOL", "name": "upper"},
|
||||
{"type": "SYMBOL", "name": "math_sym"}
|
||||
{"type": "SYMBOL", "name": "math_sym"},
|
||||
{"type": "SYMBOL", "name": "letter_number"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
|
@ -31,6 +32,11 @@
|
|||
"math_sym": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\p{Sm}+"
|
||||
},
|
||||
|
||||
"letter_number": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\p{Letter_Number}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue