tree-sitter/script/generate-unicode-categories-json
Alex Pinkus 8fadf18655 Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property
is not available since it lives in a new file. This adds that file to
the `generate-unicode-categories-json`.

The `emoji-data` file follows the same format as the ones we already
consume in `generate-unicode-categories-json`, so adding emoji support
is fairly easy. Without this, grammars would need to hard-code a set of
unicode ranges in their own regex. The Javascript library `emoji-regex`
cannot be used because of #451.

For unclear reasons, the characters #, *, and 0-9 are marked as
`Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes
to use emojis is likely to want to exclude those characters. For that
reason, this change also adds support for binary operations in regexes,
e.g. `[\p{Emoji}&&[^#*0-9]]`.

Lastly (and perhaps controversially), this change introduces new
variables available at grammar compile time, for the major, minor, and
patch versions of the tree-sitter CLI used to compile the grammar. This
will allow grammars to conditionally adopt these new regex features
while remaining backward compatible with older versions of the CLI.
Without this part of the change, grammar authors who do not precompile
and check-in their `grammar.json` would need to wait for downstream
systems to adopt a newer tree-sitter CLI version before they could begin
to use these features.
2022-02-19 11:41:36 -08:00

243 lines
7.9 KiB
JavaScript
Executable file

#!/usr/bin/env node
// This script generates the JSON files that are used by the CLI to handle unicode property escapes.

// Directory (relative to the current working directory) that receives the generated JSON files.
const OUTPUT_DIR = './cli/src/generate/prepare_grammar';
const CATEGORY_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-categories.json`;
const PROPERTY_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-properties.json`;
const CATEGORY_ALIAS_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-category-aliases.json`;
const PROPERTY_ALIAS_OUTPUT_PATH = `${OUTPUT_DIR}/unicode-property-aliases.json`;

// Every input file is taken from the same Unicode release, so the version is
// spelled out once; bump UNICODE_VERSION to regenerate against a newer release.
const UNICODE_VERSION = '14.0.0';
const UCD_BASE_URL = `https://unicode.org/Public/${UNICODE_VERSION}/ucd`;
const CATEGORY_URL = `${UCD_BASE_URL}/UnicodeData.txt`;
const PROPERTY_URL = `${UCD_BASE_URL}/PropList.txt`;
const DERIVED_PROPERTY_URL = `${UCD_BASE_URL}/DerivedCoreProperties.txt`;
const CATEGORY_ALIAS_URL = `${UCD_BASE_URL}/PropertyValueAliases.txt`;
const PROPERTY_ALIAS_URL = `${UCD_BASE_URL}/PropertyAliases.txt`;
const EMOJI_DATA_URL = `${UCD_BASE_URL}/emoji/emoji-data.txt`;
const fs = require('fs');
const path = require('path');
const {spawnSync} = require('child_process');
// Fetch all six unicode data files up front, in a fixed order. Each file is
// read from the 'target' directory when a copy is already cached there.
const [
  categoryData,
  propertyData,
  derivedPropertyData,
  categoryAliasData,
  propertyAliasData,
  emojiData,
] = [
  CATEGORY_URL,
  PROPERTY_URL,
  DERIVED_PROPERTY_URL,
  CATEGORY_ALIAS_URL,
  PROPERTY_ALIAS_URL,
  EMOJI_DATA_URL,
].map((url) => cachedDownload(url));
/**
 * Return the text of the file at `url`, downloading it with `curl` only when
 * no cached copy exists.
 *
 * The cache lives at `./target/<basename of url>`, relative to the current
 * working directory.
 *
 * @param {string} url - Location of the file to fetch.
 * @returns {string} The file contents, decoded as UTF-8.
 * @throws {Error} If `curl` cannot be started or exits unsuccessfully. Nothing
 *   is written to the cache in that case, so a later run retries the download.
 */
function cachedDownload(url) {
  const downloadPath = path.join('.', 'target', path.basename(url));
  if (fs.existsSync(downloadPath)) {
    return fs.readFileSync(downloadPath, 'utf8');
  }

  // --fail turns HTTP errors into a non-zero exit status instead of an error
  // page on stdout; --location follows redirects; --silent/--show-error keep
  // stderr limited to real error messages.
  const result = spawnSync(
    'curl',
    ['--fail', '--silent', '--show-error', '--location', url],
    // Some of the data files are several megabytes; lift the output cap so
    // that large downloads are not truncated.
    {encoding: 'utf8', maxBuffer: Infinity}
  );
  if (result.error) {
    throw new Error(`Failed to run curl for ${url}: ${result.error.message}`);
  }
  if (result.status !== 0) {
    const details = (result.stderr || '').trim();
    throw new Error(`curl exited with status ${result.status} for ${url}: ${details}`);
  }

  // The cache directory may not exist yet (e.g. before the first build).
  fs.mkdirSync(path.dirname(downloadPath), {recursive: true});
  fs.writeFileSync(downloadPath, result.stdout, 'utf8');
  return result.stdout;
}
// Accumulators for the four JSON files written at the end of the script.
const categories = {};       // category -> list of code points
const properties = {};       // property name -> list of code points
const categoryAliases = {};  // category alias -> short category name
const propertyAliases = {};  // property alias -> long property name

// Scratch variables shared by the parsing sections of this script.
let data, row, lineStart, lineEnd;

// Parse the properties.
// The three property files are parsed as one concatenated text; `row` is the
// 1-based line number within that concatenation (used in error messages).
data = propertyData + derivedPropertyData + emojiData;
row = 0;
const CODE_POINT = /^[0-9A-Fa-f]/;
for (const line of data.split('\n')) {
  row++;

  // Skip over blank and comment lines: data lines start with a hex digit.
  if (!CODE_POINT.test(line)) continue;

  // Drop the trailing comment, if there is one. Working on a single line keeps
  // a data line without a comment from swallowing the lines that follow it.
  const commentStart = line.indexOf('#');
  const content = commentStart === -1 ? line : line.slice(0, commentStart);

  // Parse the first two semicolon fields:
  // * code point or code point range
  // * property
  const codePointEnd = content.indexOf(';');
  if (codePointEnd === -1) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  // Process ranges (separated by '..'); a single code point is a range of one.
  const codePoints = content.slice(0, codePointEnd).trim()
    .split('..')
    .map(p => parseInt(p, 16));
  if (codePoints.length === 1) {
    codePoints.push(codePoints[0]);
  }

  const property = content.slice(codePointEnd + 1).trim();
  if (
    property === '' ||
    codePoints.length !== 2 ||
    codePoints.some(c => Number.isNaN(c)) ||
    codePoints[0] > codePoints[1]
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  console.log("Property:", codePoints, property);

  // Group the code points by their property.
  if (!properties[property]) {
    properties[property] = [];
  }
  for (let c = codePoints[0]; c <= codePoints[1]; c++) {
    properties[property].push(c);
  }
}
// Parse the categories.
// Most lines describe a single code point. Large blocks are instead written as
// a pair of consecutive lines whose names look like `<Some Name, First>` and
// `<Some Name, Last>`; every code point from the first to the last line of such
// a pair belongs to the category, so the pair is expanded into the full range.
data = categoryData;
row = 0;
const RANGE_FIRST = /^<.+, First>$/;
const RANGE_LAST = /^<.+, Last>$/;
let pendingRange = null; // set between a `First` line and its matching `Last` line
for (const line of data.split('\n')) {
  row++;

  // Tolerate blank lines (including the empty piece after a trailing newline).
  if (line.trim() === '') continue;

  // Parse the first three semicolon-separated fields:
  // * code point (hexadecimal)
  // * name
  // * category
  // The category field must itself be terminated by a semicolon.
  const fields = line.split(';');
  if (fields.length < 4) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const codePoint = parseInt(fields[0], 16);
  const name = fields[1];
  const category = fields[2];
  if (Number.isNaN(codePoint)) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  console.log("Category:", codePoint, category, name);

  if (RANGE_FIRST.test(name)) {
    if (pendingRange) {
      throw new Error(`Unexpected format on line ${row}`);
    }
    // Remember the start; the code points are added once the end is known.
    pendingRange = {first: codePoint, category};
    continue;
  }

  let first = codePoint;
  if (RANGE_LAST.test(name)) {
    if (
      !pendingRange ||
      pendingRange.category !== category ||
      pendingRange.first > codePoint
    ) {
      throw new Error(`Unexpected format on line ${row}`);
    }
    first = pendingRange.first;
    pendingRange = null;
  } else if (pendingRange) {
    // A `First` line must be immediately followed by its `Last` line.
    throw new Error(`Unexpected format on line ${row}`);
  }

  // Group the code points by their category.
  if (!categories[category]) {
    categories[category] = [];
  }
  for (let c = first; c <= codePoint; c++) {
    categories[category].push(c);
  }
}
if (pendingRange) {
  throw new Error(`Unexpected format on line ${row}`);
}
// Parse the category aliases.
// Every alias listed on a `gc` (General_Category) line is mapped to that
// line's short name.
data = categoryAliasData;
row = 0;
const IGNORE = /[#\s]/
for (const line of data.split('\n')) {
  row++;

  // Skip over blank and comment lines
  if (line === '' || IGNORE.test(line[0])) continue;

  // Drop the trailing comment first, so that neither a ';' inside a comment
  // nor text from a following line can be mistaken for a field.
  const commentStart = line.indexOf('#');
  const content = commentStart === -1 ? line : line.slice(0, commentStart);

  // Parse the semicolon-separated fields:
  // * property value type
  // * short name
  // * long name
  // Other aliases may be listed in additional fields
  const fields = content.split(';').map(field => field.trim());
  const [propertyValueType, shortName, ...aliases] = fields;

  // Filter for General_Category lines
  if (propertyValueType !== 'gc') continue;

  // A short name and at least a long name are required.
  if (fields.length < 3) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  for (const alias of aliases) {
    console.log("Category alias:", alias, shortName);
    categoryAliases[alias] = shortName;
  }
}
// Parse the property aliases.
// The short name and any additional aliases on a line are all mapped to that
// line's long name.
data = propertyAliasData;
row = 0;
for (const line of data.split('\n')) {
  row++;

  // Skip over blank and comment lines
  if (line === '' || IGNORE.test(line[0])) continue;

  // Drop the trailing comment first, so that the field search never runs past
  // the end of the current line.
  const commentStart = line.indexOf('#');
  const content = commentStart === -1 ? line : line.slice(0, commentStart);

  // Parse the semicolon-separated fields:
  // * short name
  // * long name
  // Other aliases may be listed in additional fields
  const fields = content.split(';').map(field => field.trim());
  if (fields.length < 2) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  const [shortName, longName, ...otherAliases] = fields;

  for (const alias of [shortName, ...otherAliases]) {
    console.log("Property alias:", alias, longName);
    propertyAliases[alias] = longName;
  }
}
// Serialize each collected table to its JSON output file, in a fixed order.
for (const [outputPath, table] of [
  [CATEGORY_OUTPUT_PATH, categories],
  [PROPERTY_OUTPUT_PATH, properties],
  [CATEGORY_ALIAS_OUTPUT_PATH, categoryAliases],
  [PROPERTY_ALIAS_OUTPUT_PATH, propertyAliases],
]) {
  fs.writeFileSync(outputPath, JSON.stringify(table), 'utf8');
}