feat(cli): bump unicode data to v15.1.0

This commit is contained in:
Amaan Qureshi 2024-05-06 11:05:22 -04:00
parent 6e6dcf1caf
commit b1fd3214db
4 changed files with 165 additions and 163 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"}
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","EqUIdeo":"Equivalent_Unified_Ideograph","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","NFKC_SCF":"NFKC_Simple_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InCB":"Indic_Conjunct_Break","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","ID_Compat_Math_Continue":"ID_Compat_Math_Continue","ID_Compat_Math_Start":"ID_Compat_Math_Start","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","IDSU":"IDS_Unary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On_NFKD"}

View file

@ -7,16 +7,17 @@ const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-propert
const CATEGORY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-category-aliases.json'
const PROPERTY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-property-aliases.json'
const CATEGORY_URL = 'https://unicode.org/Public/14.0.0/ucd/UnicodeData.txt'
const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'
const UNICODE_STANDARD_VERSION = '15.1.0';
const CATEGORY_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/UnicodeData.txt`
const PROPERTY_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/PropList.txt`
const DERIVED_PROPERTY_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/DerivedCoreProperties.txt`
const CATEGORY_ALIAS_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/PropertyValueAliases.txt`
const PROPERTY_ALIAS_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/PropertyAliases.txt`
const EMOJI_DATA_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd/emoji/emoji-data.txt`
const fs = require('fs');
const path = require('path');
const {spawnSync} = require('child_process');
const { spawnSync } = require('child_process');
// Download the unicode data files, caching them inside the 'target' directory.
const categoryData = cachedDownload(CATEGORY_URL);
@ -26,14 +27,15 @@ const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
const emojiData = cachedDownload(EMOJI_DATA_URL);
function cachedDownload(url) {
let downloadPath = path.join('.', 'target', path.basename(url))
if (fs.existsSync(downloadPath)) {
return fs.readFileSync(downloadPath, 'utf8');
} else {
const data = spawnSync('curl', [url], {encoding: 'utf8'}).stdout;
fs.writeFileSync(downloadPath, data, 'utf8');
return data;
}
console.log(`Downloading ${url}`);
let downloadPath = path.join('.', 'target', path.basename(url) + `.${UNICODE_STANDARD_VERSION}`)
if (fs.existsSync(downloadPath)) {
return fs.readFileSync(downloadPath, 'utf8');
} else {
const data = spawnSync('curl', [url], { encoding: 'utf8' }).stdout;
fs.writeFileSync(downloadPath, data, 'utf8');
return data;
}
}
const categories = {};
@ -49,47 +51,47 @@ lineStart = 0;
lineEnd = -1;
const CODE_POINT = /[0-9A-Fa-f]/
while (lineStart < data.length) {
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
// Skip over blank and comment lines
if (!CODE_POINT.test(data[lineStart])) continue;
// Skip over blank and comment lines
if (!CODE_POINT.test(data[lineStart])) continue;
// Parse the first two semicolon fields:
// * code point or code point range
// * property
const codePointEnd = data.indexOf(';', lineStart);
const propertyStart = codePointEnd + 1;
const propertyEnd = data.indexOf('#', propertyStart);
// Parse the first two semicolon fields:
// * code point or code point range
// * property
const codePointEnd = data.indexOf(';', lineStart);
const propertyStart = codePointEnd + 1;
const propertyEnd = data.indexOf('#', propertyStart);
if (
codePointEnd === -1 ||
propertyEnd === -1
) {
throw new Error(`Unexpected format on line ${row}`);
}
// Process ranges (separated by '..)
const codePoints = data.slice(lineStart, codePointEnd).trim()
.split('..')
.map(p => parseInt(p, 16));
if (codePoints.length === 1) {
codePoints.push(codePoints[0]);
}
const property = data.slice(propertyStart, propertyEnd).trim();
console.log("Property:", codePoints, property);
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
if (!properties[property]) {
properties[property] = [];
}
properties[property].push(c);
if (
codePointEnd === -1 ||
propertyEnd === -1
) {
throw new Error(`Unexpected format on line ${row}`);
}
// Process ranges (separated by '..)
const codePoints = data.slice(lineStart, codePointEnd).trim()
.split('..')
.map(p => parseInt(p, 16));
if (codePoints.length === 1) {
codePoints.push(codePoints[0]);
}
const property = data.slice(propertyStart, propertyEnd).trim();
console.log("Property:", codePoints, property);
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
if (!properties[property]) {
properties[property] = [];
}
properties[property].push(c);
}
}
// Parse the categories.
@ -99,39 +101,39 @@ row = 0;
lineStart = 0;
lineEnd = -1;
while (lineStart < data.length) {
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
// Parse the first three semicolon-separated fields:
// * code point (hexadecimal)
// * name
// * category
const codePointEnd = data.indexOf(';', lineStart);
const nameStart = codePointEnd + 1;
const nameEnd = data.indexOf(';', nameStart);
const categoryStart = nameEnd + 1;
const categoryEnd = data.indexOf(';', categoryStart)
if (
nameStart === 0 ||
categoryStart == 0 ||
categoryEnd === -1
) {
throw new Error(`Unexpected format on line ${row}`);
}
// Parse the first three semicolon-separated fields:
// * code point (hexadecimal)
// * name
// * category
const codePointEnd = data.indexOf(';', lineStart);
const nameStart = codePointEnd + 1;
const nameEnd = data.indexOf(';', nameStart);
const categoryStart = nameEnd + 1;
const categoryEnd = data.indexOf(';', categoryStart)
if (
nameStart === 0 ||
categoryStart == 0 ||
categoryEnd === -1
) {
throw new Error(`Unexpected format on line ${row}`);
}
const codePoint = parseInt(data.slice(lineStart, codePointEnd), 16);
const name = data.slice(nameStart, nameEnd);
const category = data.slice(categoryStart, categoryEnd);
const codePoint = parseInt(data.slice(lineStart, codePointEnd), 16);
const name = data.slice(nameStart, nameEnd);
const category = data.slice(categoryStart, categoryEnd);
console.log("Category:", codePoint, category, name);
console.log("Category:", codePoint, category, name);
// Group the code points by their category.
if (!categories[category]) {
categories[category] = [];
}
categories[category].push(codePoint);
// Group the code points by their category.
if (!categories[category]) {
categories[category] = [];
}
categories[category].push(codePoint);
}
// Parse the category aliases
@ -141,52 +143,52 @@ lineStart = 0;
lineEnd = -1;
const IGNORE = /[#\s]/
while (lineStart < data.length) {
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
// Skip over blank and comment lines
if (IGNORE.test(data[lineStart])) continue;
// Skip over blank and comment lines
if (IGNORE.test(data[lineStart])) continue;
// Parse the first three semicolon-separated fields:
// * property value type
// * short name
// * long name
// Other aliases may be listed in additional fields
const propertyValueTypeEnd = data.indexOf(';', lineStart);
const shortNameStart = propertyValueTypeEnd + 1;
const shortNameEnd = data.indexOf(';', shortNameStart);
const longNameStart = shortNameEnd + 1;
if (
shortNameStart === 0 ||
longNameStart === 0
) {
throw new Error(`Unexpected format on line ${row}`);
// Parse the first three semicolon-separated fields:
// * property value type
// * short name
// * long name
// Other aliases may be listed in additional fields
const propertyValueTypeEnd = data.indexOf(';', lineStart);
const shortNameStart = propertyValueTypeEnd + 1;
const shortNameEnd = data.indexOf(';', shortNameStart);
const longNameStart = shortNameEnd + 1;
if (
shortNameStart === 0 ||
longNameStart === 0
) {
throw new Error(`Unexpected format on line ${row}`);
}
const propertyValueType = data.slice(lineStart, propertyValueTypeEnd).trim();
const shortName = data.slice(shortNameStart, shortNameEnd).trim();
// Filter for General_Category lines
if (propertyValueType !== 'gc') continue;
let aliasStart = longNameStart;
let lineDone = false;
do {
let aliasEnd = data.indexOf(';', aliasStart);
if (aliasEnd === -1 || aliasEnd > lineEnd) {
aliasEnd = data.indexOf('#', aliasStart);
if (aliasEnd === -1 || aliasEnd > lineEnd) {
aliasEnd = lineEnd;
}
lineDone = true;
}
const propertyValueType = data.slice(lineStart, propertyValueTypeEnd).trim();
const shortName = data.slice(shortNameStart, shortNameEnd).trim();
// Filter for General_Category lines
if (propertyValueType !== 'gc') continue;
let aliasStart = longNameStart;
let lineDone = false;
do {
let aliasEnd = data.indexOf(';', aliasStart);
if (aliasEnd === -1 || aliasEnd > lineEnd) {
aliasEnd = data.indexOf('#', aliasStart);
if (aliasEnd === -1 || aliasEnd > lineEnd) {
aliasEnd = lineEnd;
}
lineDone = true;
}
const alias = data.slice(aliasStart, aliasEnd).trim();
console.log("Category alias:", alias, shortName);
categoryAliases[alias] = shortName;
aliasStart = aliasEnd + 1;
} while (!lineDone);
const alias = data.slice(aliasStart, aliasEnd).trim();
console.log("Category alias:", alias, shortName);
categoryAliases[alias] = shortName;
aliasStart = aliasEnd + 1;
} while (!lineDone);
}
// Parse the property aliases
@ -195,46 +197,46 @@ row = 0;
lineStart = 0;
lineEnd = -1;
while (lineStart < data.length) {
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
row++;
lineStart = lineEnd + 1;
lineEnd = data.indexOf('\n', lineStart);
if (lineEnd === -1) break;
// Skip over blank and comment lines
if (IGNORE.test(data[lineStart])) continue;
// Skip over blank and comment lines
if (IGNORE.test(data[lineStart])) continue;
// Parse the first two semicolon fields:
// * short name
// * long name
const shortNameEnd = data.indexOf(';', lineStart);
const longNameStart = shortNameEnd + 1;
// Parse the first two semicolon fields:
// * short name
// * long name
const shortNameEnd = data.indexOf(';', lineStart);
const longNameStart = shortNameEnd + 1;
if (longNameStart == 0) {
throw new Error(`Unexpected format on line ${row}`);
if (longNameStart == 0) {
throw new Error(`Unexpected format on line ${row}`);
}
let alias = data.slice(lineStart, shortNameEnd).trim();
let longName = null;
let nameStart = longNameStart;
let lineDone = false;
do {
let nameEnd = data.indexOf(';', nameStart);
if (nameEnd === -1 || nameEnd > lineEnd) {
nameEnd = data.indexOf('#', nameStart);
if (nameEnd === -1 || nameEnd > lineEnd) {
nameEnd = lineEnd;
}
lineDone = true;
}
let alias = data.slice(lineStart, shortNameEnd).trim();
let longName = null;
let nameStart = longNameStart;
let lineDone = false;
do {
let nameEnd = data.indexOf(';', nameStart);
if (nameEnd === -1 || nameEnd > lineEnd) {
nameEnd = data.indexOf('#', nameStart);
if (nameEnd === -1 || nameEnd > lineEnd) {
nameEnd = lineEnd;
}
lineDone = true;
}
if (longName == null) {
longName = data.slice(nameStart, nameEnd).trim();
} else {
alias = data.slice(nameStart, nameEnd).trim();
}
console.log("Property alias:", alias, longName);
propertyAliases[alias] = longName;
nameStart = nameEnd + 1;
} while (!lineDone);
if (longName == null) {
longName = data.slice(nameStart, nameEnd).trim();
} else {
alias = data.slice(nameStart, nameEnd).trim();
}
console.log("Property alias:", alias, longName);
propertyAliases[alias] = longName;
nameStart = nameEnd + 1;
} while (!lineDone);
}
fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8');