Merge pull request #906 from tree-sitter/unicode-property-escapes
Handle simple unicode property escapes in regexes
This commit is contained in:
commit
9d9eb2234f
11 changed files with 483 additions and 19 deletions
|
|
@ -13,8 +13,9 @@ impl Error {
|
|||
Error(vec![format!("Grammar error: {}", message)])
|
||||
}
|
||||
|
||||
pub fn regex(message: &str) -> Self {
|
||||
Error(vec![format!("Regex error: {}", message)])
|
||||
pub fn regex(mut message: String) -> Self {
|
||||
message.insert_str(0, "Regex error: ");
|
||||
Error(vec![message])
|
||||
}
|
||||
|
||||
pub fn undefined_symbol(name: &str) -> Self {
|
||||
|
|
|
|||
130
cli/src/generate/char_tree.rs
Normal file
130
cli/src/generate/char_tree.rs
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
use std::ops::Range;
|
||||
|
||||
/// A binary decision tree that tests whether a character belongs to a list
/// of character ranges by repeatedly splitting that list in half.
///
/// A missing (`None`) `consequence` or `alternative` means that the
/// character is *not* in the set.
#[derive(Debug, PartialEq, Eq)]
pub enum CharacterTree {
    /// The character is in the set.
    Yes,
    /// Compare the character against `value` using `operator`, then continue
    /// with `consequence` if the comparison holds, or with `alternative` if
    /// it does not.
    Compare {
        value: char,
        operator: Comparator,
        consequence: Option<Box<CharacterTree>>,
        alternative: Option<Box<CharacterTree>>,
    },
}

/// The comparison applied between the tested character (left-hand side) and
/// the `value` stored in a `CharacterTree::Compare` node (right-hand side).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Comparator {
    Less,
    LessOrEqual,
    Equal,
    GreaterOrEqual,
}

impl CharacterTree {
    /// Build a decision tree for the given ranges.
    ///
    /// Each range is treated as *inclusive* on both ends: `range.end` itself
    /// is accepted, unlike the usual half-open meaning of `Range`.
    ///
    /// The ranges are assumed to be sorted in ascending order and
    /// non-overlapping; this is not checked here.
    ///
    /// Returns `None` when `ranges` is empty.
    pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
        match ranges {
            [] => None,

            // A single character needs only one equality test.
            [range] if range.start == range.end => {
                Some(Self::accepting(Comparator::Equal, range.start))
            }

            // A single multi-character range: `c >= start && c <= end`.
            [range] => Some(CharacterTree::Compare {
                operator: Comparator::GreaterOrEqual,
                value: range.start,
                consequence: Some(Box::new(Self::accepting(
                    Comparator::LessOrEqual,
                    range.end,
                ))),
                alternative: None,
            }),

            // Two or more ranges: pivot on the middle range. Characters below
            // it are looked up in the lower half; characters that are not
            // inside it are looked up in the upper half.
            _ => {
                let mid = ranges.len() / 2;
                let mid_range = &ranges[mid];
                Some(CharacterTree::Compare {
                    operator: Comparator::Less,
                    value: mid_range.start,
                    consequence: Self::from_ranges(&ranges[..mid]).map(Box::new),
                    alternative: Some(Box::new(CharacterTree::Compare {
                        operator: Comparator::LessOrEqual,
                        value: mid_range.end,
                        consequence: Some(Box::new(CharacterTree::Yes)),
                        alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new),
                    })),
                })
            }
        }
    }

    /// A comparison node that accepts the character when the comparison
    /// holds and rejects it otherwise.
    fn accepting(operator: Comparator, value: char) -> Self {
        CharacterTree::Compare {
            value,
            operator,
            consequence: Some(Box::new(CharacterTree::Yes)),
            alternative: None,
        }
    }

    /// Evaluate the tree for the character `c`.
    #[cfg(test)]
    fn contains(&self, c: char) -> bool {
        match self {
            CharacterTree::Yes => true,
            CharacterTree::Compare {
                value,
                operator,
                alternative,
                consequence,
            } => {
                let condition = match operator {
                    Comparator::Less => c < *value,
                    Comparator::LessOrEqual => c <= *value,
                    Comparator::Equal => c == *value,
                    Comparator::GreaterOrEqual => c >= *value,
                };
                let next = if condition { consequence } else { alternative };
                // A missing branch means the character is not in the set.
                next.as_ref().map_or(false, |subtree| subtree.contains(c))
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Every character from 'a' through 'z' is checked against a tree built
    /// from a mix of multi-character and single-character ranges.
    #[test]
    fn test_character_tree_simple() {
        let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
            .unwrap();

        // Range ends are inclusive, so 'd', 'l' and 'r' are members.
        for c in "abcd".chars() {
            assert!(tree.contains(c), "{:?} should be in the tree", c);
        }
        for c in "efg".chars() {
            assert!(!tree.contains(c), "{:?} should not be in the tree", c);
        }

        for c in "hijkl".chars() {
            assert!(tree.contains(c), "{:?} should be in the tree", c);
        }
        for c in "mno".chars() {
            assert!(!tree.contains(c), "{:?} should not be in the tree", c);
        }

        for c in "pqr".chars() {
            assert!(tree.contains(c), "{:?} should be in the tree", c);
        }
        // Both characters in the gap before 'u' are checked.
        for c in "st".chars() {
            assert!(!tree.contains(c), "{:?} should not be in the tree", c);
        }

        assert!(tree.contains('u'));
        for c in "vwxy".chars() {
            assert!(!tree.contains(c), "{:?} should not be in the tree", c);
        }

        // The last range is a single character.
        assert!(tree.contains('z'));
    }
}
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
mod build_tables;
|
||||
mod char_tree;
|
||||
mod dedup;
|
||||
mod grammars;
|
||||
mod nfa;
|
||||
|
|
|
|||
|
|
@ -6,11 +6,13 @@ use std::fmt;
|
|||
use std::mem::swap;
|
||||
use std::ops::Range;
|
||||
|
||||
/// A set of characters represented as a vector of ranges.
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct CharacterSet {
|
||||
ranges: Vec<Range<u32>>,
|
||||
}
|
||||
|
||||
/// A state in an NFA representing a regular grammar.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum NfaState {
|
||||
Advance {
|
||||
|
|
@ -54,10 +56,12 @@ impl Default for Nfa {
|
|||
const END: u32 = char::MAX as u32 + 1;
|
||||
|
||||
impl CharacterSet {
|
||||
/// Create a character set with a single character.
|
||||
pub fn empty() -> Self {
|
||||
CharacterSet { ranges: Vec::new() }
|
||||
}
|
||||
|
||||
/// Create a character set with a given *inclusive* range of characters.
|
||||
pub fn from_range(mut first: char, mut last: char) -> Self {
|
||||
if first > last {
|
||||
swap(&mut first, &mut last);
|
||||
|
|
@ -67,12 +71,15 @@ impl CharacterSet {
|
|||
}
|
||||
}
|
||||
|
||||
/// Create a character set with a single character.
|
||||
pub fn from_char(c: char) -> Self {
|
||||
CharacterSet {
|
||||
ranges: vec![(c as u32)..(c as u32 + 1)],
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a character set containing all characters *not* present
|
||||
/// in this character set.
|
||||
pub fn negate(mut self) -> CharacterSet {
|
||||
let mut i = 0;
|
||||
let mut previous_end = 0;
|
||||
|
|
@ -146,6 +153,9 @@ impl CharacterSet {
|
|||
false
|
||||
}
|
||||
|
||||
/// Get the set of characters that are present in both this set
|
||||
/// and the other set. Remove those common characters from both
|
||||
/// of the operands.
|
||||
pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
|
||||
let mut intersection = Vec::new();
|
||||
let mut left_i = 0;
|
||||
|
|
@ -271,6 +281,8 @@ impl CharacterSet {
|
|||
self.ranges.is_empty()
|
||||
}
|
||||
|
||||
/// Get a reduced list of character ranges, assuming that a given
|
||||
/// set of characters can be safely ignored.
|
||||
pub fn simplify_ignoring<'a>(
|
||||
&'a self,
|
||||
ruled_out_characters: &'a HashSet<u32>,
|
||||
|
|
|
|||
|
|
@ -6,15 +6,23 @@ use crate::generate::rules::Rule;
|
|||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use regex_syntax::ast::{
|
||||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
|
||||
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
|
||||
RepetitionRange,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::i32;
|
||||
|
||||
lazy_static! {
|
||||
static ref CURLY_BRACE_REGEX: Regex =
|
||||
Regex::new(r#"(^|[^\\])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
|
||||
Regex::new(r#"(^|[^\\p])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
|
||||
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
|
||||
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
|
||||
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
|
||||
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
|
||||
}
|
||||
|
||||
const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
|
||||
const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
|
||||
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
|
||||
|
||||
struct NfaBuilder {
|
||||
|
|
@ -196,7 +204,7 @@ impl NfaBuilder {
|
|||
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
|
||||
match ast {
|
||||
Ast::Empty(_) => Ok(false),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
|
||||
Ast::Flags(_) => Err(Error::regex("Flags are not supported".to_string())),
|
||||
Ast::Literal(literal) => {
|
||||
self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
|
||||
Ok(true)
|
||||
|
|
@ -205,10 +213,15 @@ impl NfaBuilder {
|
|||
self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
|
||||
Ast::Assertion(_) => Err(Error::regex("Assertions are not supported".to_string())),
|
||||
Ast::Class(class) => match class {
|
||||
Class::Unicode(_) => {
|
||||
Err(Error::regex("Unicode character classes are not supported"))
|
||||
Class::Unicode(class) => {
|
||||
let mut chars = self.expand_unicode_character_class(&class.kind)?;
|
||||
if class.negated {
|
||||
chars = chars.negate();
|
||||
}
|
||||
self.push_advance(chars, next_state_id);
|
||||
Ok(true)
|
||||
}
|
||||
Class::Perl(class) => {
|
||||
let mut chars = self.expand_perl_character_class(&class.kind);
|
||||
|
|
@ -228,7 +241,7 @@ impl NfaBuilder {
|
|||
Ok(true)
|
||||
}
|
||||
ClassSet::BinaryOp(_) => Err(Error::regex(
|
||||
"Binary operators in character classes aren't supported",
|
||||
"Binary operators in character classes aren't supported".to_string(),
|
||||
)),
|
||||
},
|
||||
},
|
||||
|
|
@ -355,13 +368,63 @@ impl NfaBuilder {
|
|||
Ok(result)
|
||||
}
|
||||
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
|
||||
_ => Err(Error::regex(&format!(
|
||||
_ => Err(Error::regex(format!(
|
||||
"Unsupported character class syntax {:?}",
|
||||
item
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
|
||||
let mut chars = CharacterSet::empty();
|
||||
|
||||
let category_letter;
|
||||
match class {
|
||||
ClassUnicodeKind::OneLetter(le) => {
|
||||
category_letter = le.to_string();
|
||||
}
|
||||
ClassUnicodeKind::Named(class_name) => {
|
||||
if class_name.len() == 1 {
|
||||
category_letter = class_name.clone();
|
||||
} else {
|
||||
let code_points = UNICODE_CATEGORIES
|
||||
.get(class_name.as_str())
|
||||
.or_else(|| UNICODE_PROPERTIES.get(class_name.as_str()))
|
||||
.ok_or_else(|| {
|
||||
Error::regex(format!(
|
||||
"Unsupported unicode character class {}",
|
||||
class_name
|
||||
))
|
||||
})?;
|
||||
for c in code_points {
|
||||
if let Some(c) = std::char::from_u32(*c) {
|
||||
chars = chars.add_char(c);
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(chars);
|
||||
}
|
||||
}
|
||||
ClassUnicodeKind::NamedValue { .. } => {
|
||||
return Err(Error::regex(
|
||||
"Key-value unicode properties are not supported".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
for (category, code_points) in UNICODE_CATEGORIES.iter() {
|
||||
if category.starts_with(&category_letter) {
|
||||
for c in code_points {
|
||||
if let Some(c) = std::char::from_u32(*c) {
|
||||
chars = chars.add_char(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(chars)
|
||||
}
|
||||
|
||||
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
|
||||
match item {
|
||||
ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
|
||||
|
|
|
|||
1
cli/src/generate/prepare_grammar/unicode-categories.json
Normal file
1
cli/src/generate/prepare_grammar/unicode-categories.json
Normal file
File diff suppressed because one or more lines are too long
1
cli/src/generate/prepare_grammar/unicode-properties.json
Normal file
1
cli/src/generate/prepare_grammar/unicode-properties.json
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1,3 +1,4 @@
|
|||
use super::char_tree::{CharacterTree, Comparator};
|
||||
use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType};
|
||||
use super::rules::{Alias, AliasMap, Symbol, SymbolType};
|
||||
use super::tables::{
|
||||
|
|
@ -714,7 +715,7 @@ impl Generator {
|
|||
if info.usage_count > 1 {
|
||||
add_line!(
|
||||
self,
|
||||
"static inline bool {}_character_set_{}(int32_t lookahead) {{",
|
||||
"static inline bool {}_character_set_{}(int32_t c) {{",
|
||||
self.symbol_ids[&info.symbol],
|
||||
info.index
|
||||
);
|
||||
|
|
@ -722,7 +723,8 @@ impl Generator {
|
|||
add_line!(self, "return");
|
||||
indent!(self);
|
||||
add_whitespace!(self);
|
||||
self.add_character_range_conditions(&info.ranges, true, 0);
|
||||
let tree = CharacterTree::from_ranges(&info.ranges);
|
||||
self.add_character_tree(tree.as_ref());
|
||||
add!(self, ";\n");
|
||||
dedent!(self);
|
||||
dedent!(self);
|
||||
|
|
@ -844,16 +846,15 @@ impl Generator {
|
|||
ranges: &[Range<char>],
|
||||
is_included: bool,
|
||||
indent_count: usize,
|
||||
) -> bool {
|
||||
) {
|
||||
let mut line_break = "\n".to_string();
|
||||
for _ in 0..self.indent_level + indent_count {
|
||||
line_break.push_str(" ");
|
||||
}
|
||||
|
||||
let mut did_add = false;
|
||||
for range in ranges {
|
||||
for (i, range) in ranges.iter().enumerate() {
|
||||
if is_included {
|
||||
if did_add {
|
||||
if i > 0 {
|
||||
add!(self, " ||{}", line_break);
|
||||
}
|
||||
if range.end == range.start {
|
||||
|
|
@ -872,7 +873,7 @@ impl Generator {
|
|||
add!(self, ")");
|
||||
}
|
||||
} else {
|
||||
if did_add {
|
||||
if i > 0 {
|
||||
add!(self, " &&{}", line_break);
|
||||
}
|
||||
if range.end == range.start {
|
||||
|
|
@ -896,9 +897,67 @@ impl Generator {
|
|||
}
|
||||
}
|
||||
}
|
||||
did_add = true;
|
||||
}
|
||||
did_add
|
||||
}
|
||||
|
||||
fn add_character_tree(&mut self, tree: Option<&CharacterTree>) {
|
||||
match tree {
|
||||
Some(CharacterTree::Compare {
|
||||
value,
|
||||
operator,
|
||||
consequence,
|
||||
alternative,
|
||||
}) => {
|
||||
let op = match operator {
|
||||
Comparator::Less => "<",
|
||||
Comparator::LessOrEqual => "<=",
|
||||
Comparator::Equal => "==",
|
||||
Comparator::GreaterOrEqual => ">=",
|
||||
};
|
||||
let consequence = consequence.as_ref().map(Box::as_ref);
|
||||
let alternative = alternative.as_ref().map(Box::as_ref);
|
||||
|
||||
let simple = alternative.is_none() && consequence == Some(&CharacterTree::Yes);
|
||||
|
||||
if !simple {
|
||||
add!(self, "(");
|
||||
}
|
||||
|
||||
add!(self, "c {} ", op);
|
||||
self.add_character(*value);
|
||||
|
||||
if !simple {
|
||||
if alternative.is_none() {
|
||||
add!(self, " && ");
|
||||
self.add_character_tree(consequence);
|
||||
} else if consequence == Some(&CharacterTree::Yes) {
|
||||
add!(self, " || ");
|
||||
self.add_character_tree(alternative);
|
||||
} else {
|
||||
add!(self, "\n");
|
||||
indent!(self);
|
||||
add_whitespace!(self);
|
||||
add!(self, "? ");
|
||||
self.add_character_tree(consequence);
|
||||
add!(self, "\n");
|
||||
add_whitespace!(self);
|
||||
add!(self, ": ");
|
||||
self.add_character_tree(alternative);
|
||||
dedent!(self);
|
||||
}
|
||||
}
|
||||
|
||||
if !simple {
|
||||
add!(self, ")");
|
||||
}
|
||||
}
|
||||
Some(CharacterTree::Yes) => {
|
||||
add!(self, "true");
|
||||
}
|
||||
None => {
|
||||
add!(self, "false");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add_advance_action(&mut self, action: &AdvanceAction) {
|
||||
|
|
|
|||
128
script/generate-unicode-categories-json
Executable file
128
script/generate-unicode-categories-json
Executable file
|
|
@ -0,0 +1,128 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
// This script generates a JSON file that is used by the CLI to handle unicode property escapes.
|
||||
|
||||
const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json'
|
||||
const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json'
|
||||
|
||||
const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
|
||||
const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt'
|
||||
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt'
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const {spawnSync} = require('child_process');
|
||||
|
||||
// Download the unicode data files, caching them inside the 'target' directory.
|
||||
const categoryData = cachedDownload(CATEGORY_URL);
|
||||
const propertyData = cachedDownload(PROPERTY_URL);
|
||||
const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL);
|
||||
// Return the contents of the file at `url`, downloading it with `curl` on
// first use and caching it in the './target' directory under the URL's
// base name.
//
// Throws if `curl` cannot be run or exits with a non-zero status, so that a
// failed or truncated download is never written to the cache.
function cachedDownload(url) {
  const downloadPath = path.join('.', 'target', path.basename(url));
  if (fs.existsSync(downloadPath)) {
    return fs.readFileSync(downloadPath, 'utf8');
  }

  const result = spawnSync(
    'curl',
    ['--fail', '--silent', '--show-error', '--location', url],
    {
      encoding: 'utf8',
      // The unicode data files are larger than the default output limit of
      // some Node versions; exceeding the limit would truncate the output.
      maxBuffer: 64 * 1024 * 1024,
    }
  );
  if (result.error) {
    throw result.error;
  }
  if (result.status !== 0) {
    throw new Error(
      `Failed to download ${url} (curl exit status ${result.status}): ${result.stderr}`
    );
  }

  // The cache directory does not exist in a fresh checkout.
  fs.mkdirSync(path.dirname(downloadPath), {recursive: true});
  fs.writeFileSync(downloadPath, result.stdout, 'utf8');
  return result.stdout;
}
|
||||
|
||||
const categories = {};
|
||||
const properties = {};
|
||||
let data, row, lineStart, lineEnd;
|
||||
|
||||
// Parse the properties
|
||||
data = propertyData + derivedPopertyData;
|
||||
row = 0;
|
||||
lineStart = 0;
|
||||
lineEnd = -1;
|
||||
const CODE_POINT = /[0-9A-Fa-f]/
|
||||
while (lineStart < data.length) {
|
||||
row++;
|
||||
lineStart = lineEnd + 1;
|
||||
lineEnd = data.indexOf('\n', lineStart);
|
||||
if (lineEnd === -1) break;
|
||||
|
||||
// Skip over blank and comment lines
|
||||
if (!CODE_POINT.test(data[lineStart])) continue;
|
||||
|
||||
// Parse the first two semicolon fields:
|
||||
// * code point or code point range
|
||||
// * property
|
||||
const codePointEnd = data.indexOf(';', lineStart);
|
||||
const propertyStart = codePointEnd + 1;
|
||||
const propertyEnd = data.indexOf('#', propertyStart);
|
||||
|
||||
if (
|
||||
codePointEnd === -1 ||
|
||||
propertyEnd === -1
|
||||
) {
|
||||
throw new Error(`Unexpected format on line ${row}`);
|
||||
}
|
||||
|
||||
// Process ranges (separated by '..')
|
||||
const codePoints = data.slice(lineStart, codePointEnd).trim()
|
||||
.split('..')
|
||||
.map(p => parseInt(p, 16));
|
||||
if (codePoints.length === 1) {
|
||||
codePoints.push(codePoints[0]);
|
||||
}
|
||||
|
||||
const property = data.slice(propertyStart, propertyEnd).trim();
|
||||
|
||||
console.log(codePoints, property);
|
||||
|
||||
|
||||
for (let c = codePoints[0]; c <= codePoints[1]; c++) {
|
||||
if (!properties[property]) {
|
||||
properties[property] = [];
|
||||
}
|
||||
properties[property].push(c);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the categories.
// Each line normally represents a single code point. Large contiguous blocks
// (e.g. CJK ideographs, Hangul syllables) are instead written as a pair of
// lines whose names end in ', First>' and ', Last>'; every code point between
// the two belongs to the same category.
data = categoryData;
row = 0;
lineStart = 0;
lineEnd = -1;
let pendingRangeStart = null;
while (lineStart < data.length) {
  row++;
  lineStart = lineEnd + 1;
  lineEnd = data.indexOf('\n', lineStart);
  if (lineEnd === -1) break;

  // Skip over blank lines
  if (lineEnd === lineStart) continue;

  // Parse the first three semicolon-separated fields:
  // * code point (hexadecimal)
  // * name
  // * category
  const codePointEnd = data.indexOf(';', lineStart);
  const nameStart = codePointEnd + 1;
  const nameEnd = data.indexOf(';', nameStart);
  const categoryStart = nameEnd + 1;
  const categoryEnd = data.indexOf(';', categoryStart);
  if (
    codePointEnd === -1 ||
    nameEnd === -1 ||
    categoryEnd === -1 ||
    // `indexOf` searches to the end of the data, so make sure that all of
    // the separators were found on the current line.
    categoryEnd > lineEnd
  ) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  const codePoint = parseInt(data.slice(lineStart, codePointEnd), 16);
  const name = data.slice(nameStart, nameEnd);
  const category = data.slice(categoryStart, categoryEnd);

  console.log(codePoint, category, name);

  // The first line of a range only records where the range starts.
  if (name.endsWith(', First>')) {
    pendingRangeStart = codePoint;
    continue;
  }

  let firstCodePoint = codePoint;
  if (name.endsWith(', Last>')) {
    if (pendingRangeStart === null) {
      throw new Error(`Range end without a range start on line ${row}`);
    }
    firstCodePoint = pendingRangeStart;
    pendingRangeStart = null;
  }

  // Group the code points by their category.
  if (!categories[category]) {
    categories[category] = [];
  }
  for (let c = firstCodePoint; c <= codePoint; c++) {
    categories[category].push(c);
  }
}
|
||||
|
||||
fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8');
|
||||
fs.writeFileSync(PROPERTY_OUTPUT_PATH, JSON.stringify(properties), 'utf8');
|
||||
32
test/fixtures/test_grammars/unicode_classes/corpus.txt
vendored
Normal file
32
test/fixtures/test_grammars/unicode_classes/corpus.txt
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
===============
|
||||
Uppercase words
|
||||
===============
|
||||
|
||||
Δბㄱ Ψ Ɓƀ Ƒ Ɣ Śřř
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(upper) (upper) (upper) (upper) (upper) (upper))
|
||||
|
||||
================
|
||||
Lowercase words
|
||||
================
|
||||
|
||||
śś ťť ßß
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(lower) (lower) (lower))
|
||||
|
||||
================
|
||||
Math symbols
|
||||
================
|
||||
|
||||
≺ ≼ ≠ ≝ ⨔∑
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(math_sym) (math_sym) (math_sym) (math_sym) (math_sym))
|
||||
36
test/fixtures/test_grammars/unicode_classes/grammar.json
vendored
Normal file
36
test/fixtures/test_grammars/unicode_classes/grammar.json
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
{
|
||||
"name": "unicode_classes",
|
||||
|
||||
"extras": [
|
||||
{"type": "PATTERN", "value": "\\s"}
|
||||
],
|
||||
|
||||
"rules": {
|
||||
"program": {
|
||||
"type": "REPEAT",
|
||||
"content": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "lower"},
|
||||
{"type": "SYMBOL", "name": "upper"},
|
||||
{"type": "SYMBOL", "name": "math_sym"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
"lower": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\p{Ll}\\p{L}*"
|
||||
},
|
||||
|
||||
"upper": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\p{Lu}\\p{L}*"
|
||||
},
|
||||
|
||||
"math_sym": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\p{Sm}+"
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue