feat(cli): bump unicode data to v15.1.0
This commit is contained in:
parent
6e6dcf1caf
commit
b1fd3214db
4 changed files with 165 additions and 163 deletions
|
|
@ -7,16 +7,17 @@ const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-propert
|
|||
const CATEGORY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-category-aliases.json'
|
||||
const PROPERTY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-property-aliases.json'
|
||||
|
||||
// Version of the Unicode Standard whose data files are downloaded. Every URL
// below is derived from it, so bumping the version is a one-line change.
const UNICODE_STANDARD_VERSION = '15.1.0';

// Root of the Unicode Character Database (UCD) files for that version.
const UCD_BASE_URL = `https://unicode.org/Public/${UNICODE_STANDARD_VERSION}/ucd`;

const CATEGORY_URL = `${UCD_BASE_URL}/UnicodeData.txt`;
const PROPERTY_URL = `${UCD_BASE_URL}/PropList.txt`;
const DERIVED_PROPERTY_URL = `${UCD_BASE_URL}/DerivedCoreProperties.txt`;
const CATEGORY_ALIAS_URL = `${UCD_BASE_URL}/PropertyValueAliases.txt`;
const PROPERTY_ALIAS_URL = `${UCD_BASE_URL}/PropertyAliases.txt`;
const EMOJI_DATA_URL = `${UCD_BASE_URL}/emoji/emoji-data.txt`;
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const {spawnSync} = require('child_process');
|
||||
const { spawnSync } = require('child_process');
|
||||
|
||||
// Download the unicode data files, caching them inside the 'target' directory.
|
||||
const categoryData = cachedDownload(CATEGORY_URL);
|
||||
|
|
@ -26,14 +27,15 @@ const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
|
|||
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
|
||||
const emojiData = cachedDownload(EMOJI_DATA_URL);
|
||||
/**
 * Return the contents of a Unicode data file, downloading it with curl only
 * when no cached copy exists in the 'target' directory.
 *
 * The cache file is named after the URL's base name plus
 * UNICODE_STANDARD_VERSION, so a version bump never reuses older data.
 *
 * @param {string} url - Location of the data file.
 * @returns {string} The file contents decoded as UTF-8.
 * @throws {Error} If curl cannot be started or exits with a non-zero status;
 *   nothing is written to the cache in that case.
 */
function cachedDownload(url) {
  const downloadPath = path.join(
    '.',
    'target',
    `${path.basename(url)}.${UNICODE_STANDARD_VERSION}`,
  );
  if (fs.existsSync(downloadPath)) {
    return fs.readFileSync(downloadPath, 'utf8');
  }

  console.log(`Downloading ${url}`);
  // --fail turns HTTP errors into a non-zero exit status, so an error page is
  // never mistaken for data. maxBuffer is raised because spawnSync truncates
  // output beyond its 1 MiB default and some of the data files are larger.
  const result = spawnSync(
    'curl',
    ['--fail', '--silent', '--show-error', '--location', url],
    { encoding: 'utf8', maxBuffer: 64 * 1024 * 1024 },
  );
  if (result.error) {
    throw new Error(`Failed to run curl for ${url}`, { cause: result.error });
  }
  if (result.status !== 0) {
    throw new Error(
      `curl exited with status ${result.status} for ${url}: ${result.stderr}`,
    );
  }

  // Only cache once the download is known to be complete.
  fs.mkdirSync(path.dirname(downloadPath), { recursive: true });
  fs.writeFileSync(downloadPath, result.stdout, 'utf8');
  return result.stdout;
}
|
||||
|
||||
const categories = {};
|
||||
|
|
@ -49,47 +51,47 @@ lineStart = 0;
|
|||
lineEnd = -1;
|
||||
const CODE_POINT = /[0-9A-Fa-f]/
|
||||
// Walk `data` one line at a time, recording every code point of each line
// under the property named on that line.
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;

  // Skip over blank and comment lines
  if (!CODE_POINT.test(data[lineStart])) continue;

  // Parse the first two semicolon fields:
  // * code point or code point range
  // * property
  const codePointEnd = data.indexOf(';', lineStart);
  const propertyStart = codePointEnd + 1;
  const propertyEnd = data.indexOf('#', propertyStart);

  // A delimiter located past lineEnd belongs to a later line, so it counts
  // as missing rather than being silently borrowed.
  if (
    codePointEnd === -1 ||
    codePointEnd > lineEnd ||
    propertyEnd === -1 ||
    propertyEnd > lineEnd
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  // Process ranges (separated by '..')
  const codePoints = data.slice(lineStart, codePointEnd).trim()
    .split('..')
    .map(p => parseInt(p, 16));
  if (codePoints.some(Number.isNaN)) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  // A single code point is treated as a range of length one.
  if (codePoints.length === 1) {
    codePoints.push(codePoints[0]);
  }

  const property = data.slice(propertyStart, propertyEnd).trim();

  console.log("Property:", codePoints, property);

  for (let c = codePoints[0]; c <= codePoints[1]; c++) {
    if (!properties[property]) {
      properties[property] = [];
    }
    properties[property].push(c);
  }
}
|
||||
|
||||
// Parse the categories.
|
||||
|
|
@ -99,39 +101,39 @@ row = 0;
|
|||
lineStart = 0;
|
||||
lineEnd = -1;
|
||||
// Walk `data` one line at a time, grouping each line's code point under the
// category named in its third field.
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;

  // Parse the first three semicolon-separated fields:
  // * code point (hexadecimal)
  // * name
  // * category
  const codePointEnd = data.indexOf(';', lineStart);
  const nameStart = codePointEnd + 1;
  const nameEnd = data.indexOf(';', nameStart);
  const categoryStart = nameEnd + 1;
  const categoryEnd = data.indexOf(';', categoryStart);

  // Each search starts where the previous one ended, so checking that the
  // last delimiter lies within the current line covers the earlier ones too.
  if (
    nameStart === 0 ||
    categoryStart === 0 ||
    categoryEnd === -1 ||
    categoryEnd > lineEnd
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  const codePoint = parseInt(data.slice(lineStart, codePointEnd), 16);
  if (Number.isNaN(codePoint)) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const name = data.slice(nameStart, nameEnd);
  const category = data.slice(categoryStart, categoryEnd);

  console.log("Category:", codePoint, category, name);

  // Group the code points by their category.
  if (!categories[category]) {
    categories[category] = [];
  }
  categories[category].push(codePoint);
}
|
||||
|
||||
// Parse the category aliases
|
||||
|
|
@ -141,52 +143,52 @@ lineStart = 0;
|
|||
lineEnd = -1;
|
||||
const IGNORE = /[#\s]/
|
||||
// Walk `data` one line at a time; for every 'gc' line, map the long name and
// any further aliases to that line's short name.
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;

  // Skip over blank and comment lines
  if (IGNORE.test(data[lineStart])) continue;

  // Parse the first three semicolon-separated fields:
  // * property value type
  // * short name
  // * long name
  // Other aliases may be listed in additional fields
  const propertyValueTypeEnd = data.indexOf(';', lineStart);
  const shortNameStart = propertyValueTypeEnd + 1;
  const shortNameEnd = data.indexOf(';', shortNameStart);
  const longNameStart = shortNameEnd + 1;

  // A second delimiter found past lineEnd belongs to a later line, so the
  // current line is missing a field.
  if (
    shortNameStart === 0 ||
    longNameStart === 0 ||
    shortNameEnd > lineEnd
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  const propertyValueType = data.slice(lineStart, propertyValueTypeEnd).trim();
  const shortName = data.slice(shortNameStart, shortNameEnd).trim();

  // Filter for General_Category lines
  if (propertyValueType !== 'gc') continue;

  let aliasStart = longNameStart;
  let lineDone = false;
  do {
    // The final field ends at a trailing '#' comment if the line has one,
    // otherwise at the end of the line.
    let aliasEnd = data.indexOf(';', aliasStart);
    if (aliasEnd === -1 || aliasEnd > lineEnd) {
      aliasEnd = data.indexOf('#', aliasStart);
      if (aliasEnd === -1 || aliasEnd > lineEnd) {
        aliasEnd = lineEnd;
      }
      lineDone = true;
    }
    const alias = data.slice(aliasStart, aliasEnd).trim();
    console.log("Category alias:", alias, shortName);
    categoryAliases[alias] = shortName;
    aliasStart = aliasEnd + 1;
  } while (!lineDone);
}
|
||||
|
||||
// Parse the property aliases
|
||||
|
|
@ -195,46 +197,46 @@ row = 0;
|
|||
lineStart = 0;
|
||||
lineEnd = -1;
|
||||
// Walk `data` one line at a time, mapping the short name and every alias
// after the long name to that line's long name.
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;

  // Skip over blank and comment lines
  if (IGNORE.test(data[lineStart])) continue;

  // Parse the first two semicolon fields:
  // * short name
  // * long name
  const shortNameEnd = data.indexOf(';', lineStart);
  const longNameStart = shortNameEnd + 1;

  // A delimiter found past lineEnd belongs to a later line, so the current
  // line has no field separator at all.
  if (longNameStart === 0 || shortNameEnd > lineEnd) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  let alias = data.slice(lineStart, shortNameEnd).trim();
  let longName = null;
  let nameStart = longNameStart;
  let lineDone = false;
  do {
    // The final field ends at a trailing '#' comment if the line has one,
    // otherwise at the end of the line.
    let nameEnd = data.indexOf(';', nameStart);
    if (nameEnd === -1 || nameEnd > lineEnd) {
      nameEnd = data.indexOf('#', nameStart);
      if (nameEnd === -1 || nameEnd > lineEnd) {
        nameEnd = lineEnd;
      }
      lineDone = true;
    }
    // The first field after the short name is the long name; every later
    // field is another alias for it.
    if (longName == null) {
      longName = data.slice(nameStart, nameEnd).trim();
    } else {
      alias = data.slice(nameStart, nameEnd).trim();
    }
    console.log("Property alias:", alias, longName);
    propertyAliases[alias] = longName;
    nameStart = nameEnd + 1;
  } while (!lineDone);
}
|
||||
|
||||
fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8');
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue