Expand regex support to include emojis and binary ops
The `Emoji` property alias is already present, but the actual property is not available since it lives in a new file. This adds that file to the inputs of `generate-unicode-categories-json`. The `emoji-data` file follows the same format as the ones we already consume in `generate-unicode-categories-json`, so adding emoji support is fairly easy. Without this, grammars would need to hard-code a set of Unicode ranges in their own regex. The JavaScript library `emoji-regex` cannot be used because of #451. For unclear reasons, the characters #, *, and 0-9 are marked as `Emoji=Yes` by `emoji-data.txt`. Because of this, a grammar that wishes to use emojis is likely to want to exclude those characters. For that reason, this change also adds support for binary operations in regexes, e.g. `[\p{Emoji}&&[^#*0-9]]`. Lastly (and perhaps controversially), this change introduces new variables available at grammar compile time, for the major, minor, and patch versions of the tree-sitter CLI used to compile the grammar. This will allow grammars to conditionally adopt these new regex features while remaining backward compatible with older versions of the CLI. Without this part of the change, grammar authors who do not precompile and check in their `grammar.json` would need to wait for downstream systems to adopt a newer tree-sitter CLI version before they could begin to use these features.
This commit is contained in:
parent
2346570901
commit
8fadf18655
7 changed files with 172 additions and 22 deletions
|
|
@ -12,6 +12,7 @@ const PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/PropList.txt'
|
|||
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt'
|
||||
const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyValueAliases.txt'
|
||||
const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/14.0.0/ucd/PropertyAliases.txt'
|
||||
const EMOJI_DATA_URL = 'https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt'
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
|
@ -23,6 +24,7 @@ const propertyData = cachedDownload(PROPERTY_URL);
|
|||
const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
|
||||
const categoryAliasData = cachedDownload(CATEGORY_ALIAS_URL);
|
||||
const propertyAliasData = cachedDownload(PROPERTY_ALIAS_URL);
|
||||
const emojiData = cachedDownload(EMOJI_DATA_URL);
|
||||
function cachedDownload(url) {
|
||||
let downloadPath = path.join('.', 'target', path.basename(url))
|
||||
if (fs.existsSync(downloadPath)) {
|
||||
|
|
@ -41,7 +43,7 @@ const propertyAliases = {}
|
|||
let data, row, lineStart, lineEnd;
|
||||
|
||||
// Parse the properties
|
||||
data = propertyData + derivedPropertyData;
|
||||
data = propertyData + derivedPropertyData + emojiData;
|
||||
row = 0;
|
||||
lineStart = 0;
|
||||
lineEnd = -1;
|
||||
|
|
@ -79,7 +81,7 @@ while (lineStart < data.length) {
|
|||
|
||||
const property = data.slice(propertyStart, propertyEnd).trim();
|
||||
|
||||
console.log(codePoints, property);
|
||||
console.log("Property:", codePoints, property);
|
||||
|
||||
|
||||
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
|
||||
|
|
@ -123,7 +125,7 @@ while (lineStart < data.length) {
|
|||
const name = data.slice(nameStart, nameEnd);
|
||||
const category = data.slice(categoryStart, categoryEnd);
|
||||
|
||||
console.log(codePoint, category, name);
|
||||
console.log("Category:", codePoint, category, name);
|
||||
|
||||
// Group the code points by their category.
|
||||
if (!categories[category]) {
|
||||
|
|
@ -181,7 +183,7 @@ while (lineStart < data.length) {
|
|||
lineDone = true;
|
||||
}
|
||||
const alias = data.slice(aliasStart, aliasEnd).trim();
|
||||
console.log(alias, shortName);
|
||||
console.log("Category alias:", alias, shortName);
|
||||
categoryAliases[alias] = shortName;
|
||||
aliasStart = aliasEnd + 1;
|
||||
} while (!lineDone);
|
||||
|
|
@ -229,7 +231,7 @@ while (lineStart < data.length) {
|
|||
} else {
|
||||
alias = data.slice(nameStart, nameEnd).trim();
|
||||
}
|
||||
console.log(alias, longName);
|
||||
console.log("Property alias:", alias, longName);
|
||||
propertyAliases[alias] = longName;
|
||||
nameStart = nameEnd + 1;
|
||||
} while (!lineDone);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue