Merge pull request #906 from tree-sitter/unicode-property-escapes

Handle simple unicode property escapes in regexes
2021-02-17 16:14:42 -08:00 · 2021-02-17 16:14:42 -08:00 · 9d9eb2234f
commit 9d9eb2234f
parent 699af84259 dad8546776
11 changed files with 483 additions and 19 deletions
--- a/cli/src/error.rs
+++ b/cli/src/error.rs
@ -13,8 +13,9 @@ impl Error {
        Error(vec![format!("Grammar error: {}", message)])
    }

-    pub fn regex(message: &str) -> Self {
-        Error(vec![format!("Regex error: {}", message)])
+    pub fn regex(mut message: String) -> Self {
+        message.insert_str(0, "Regex error: ");
+        Error(vec![message])
    }

    pub fn undefined_symbol(name: &str) -> Self {
--- a/cli/src/generate/char_tree.rs
+++ b/cli/src/generate/char_tree.rs
@ -0,0 +1,130 @@
+use std::ops::Range;
+
+/// A binary decision tree that tests whether a character belongs to a set
+/// of character ranges.
+///
+/// A missing (`None`) branch of a `Compare` node means that the character
+/// being tested is not in the set.
+#[derive(Debug, PartialEq, Eq)]
+pub enum CharacterTree {
+    /// Leaf node: the character being tested is in the set.
+    Yes,
+    /// Interior node: compares the character being tested against `value`.
+    Compare {
+        /// The character to compare against.
+        value: char,
+        /// How the tested character is compared with `value`.
+        operator: Comparator,
+        /// Subtree to follow when the comparison holds.
+        consequence: Option<Box<CharacterTree>>,
+        /// Subtree to follow when the comparison does not hold.
+        alternative: Option<Box<CharacterTree>>,
+    },
+}
+
+/// The comparison performed by a `CharacterTree::Compare` node, with the
+/// character being tested on the left-hand side.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Comparator {
+    /// `c < value`
+    Less,
+    /// `c <= value`
+    LessOrEqual,
+    /// `c == value`
+    Equal,
+    /// `c >= value`
+    GreaterOrEqual,
+}
+
+impl CharacterTree {
+    /// Builds a decision tree that accepts the characters covered by
+    /// `ranges`, or returns `None` when `ranges` is empty.
+    ///
+    /// Both the `start` and the `end` of each range are treated as members
+    /// of the set; a range whose `start` equals its `end` denotes a single
+    /// character. The slice is split around its middle element at every
+    /// level, so the tree only makes sense for sorted, non-overlapping
+    /// ranges. NOTE(review): callers are presumed to guarantee that
+    /// ordering; confirm.
+    pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
+        // Every accepting branch ends in the same leaf.
+        let accept = || Some(Box::new(CharacterTree::Yes));
+
+        let tree = match ranges {
+            [] => return None,
+
+            // A single character: one equality test.
+            [single] if single.start == single.end => CharacterTree::Compare {
+                operator: Comparator::Equal,
+                value: single.start,
+                consequence: accept(),
+                alternative: None,
+            },
+
+            // A single range: `c >= start`, then `c <= end`.
+            [single] => CharacterTree::Compare {
+                operator: Comparator::GreaterOrEqual,
+                value: single.start,
+                consequence: Some(Box::new(CharacterTree::Compare {
+                    operator: Comparator::LessOrEqual,
+                    value: single.end,
+                    consequence: accept(),
+                    alternative: None,
+                })),
+                alternative: None,
+            },
+
+            // Several ranges: characters below the middle range are looked
+            // up among the ranges before it; the rest are either inside the
+            // middle range or looked up among the ranges after it.
+            _ => {
+                let pivot_index = ranges.len() / 2;
+                let pivot = &ranges[pivot_index];
+                let below = &ranges[..pivot_index];
+                let above = &ranges[pivot_index + 1..];
+                CharacterTree::Compare {
+                    operator: Comparator::Less,
+                    value: pivot.start,
+                    consequence: Self::from_ranges(below).map(Box::new),
+                    alternative: Some(Box::new(CharacterTree::Compare {
+                        operator: Comparator::LessOrEqual,
+                        value: pivot.end,
+                        consequence: accept(),
+                        alternative: Self::from_ranges(above).map(Box::new),
+                    })),
+                }
+            }
+        };
+
+        Some(tree)
+    }
+
+    /// Evaluates the tree for the character `c`, returning whether `c` is
+    /// accepted. Only compiled for tests.
+    #[cfg(test)]
+    fn contains(&self, c: char) -> bool {
+        let (value, operator, consequence, alternative) = match self {
+            CharacterTree::Yes => return true,
+            CharacterTree::Compare {
+                value,
+                operator,
+                consequence,
+                alternative,
+            } => (*value, operator, consequence, alternative),
+        };
+
+        let holds = match operator {
+            Comparator::Less => c < value,
+            Comparator::LessOrEqual => c <= value,
+            Comparator::Equal => c == value,
+            Comparator::GreaterOrEqual => c >= value,
+        };
+
+        // A missing branch rejects the character.
+        let branch = if holds { consequence } else { alternative };
+        match branch {
+            Some(subtree) => subtree.contains(c),
+            None => false,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks membership for every character in and around a mix of
+    /// multi-character ranges and single-character ranges.
+    #[test]
+    fn test_character_tree_simple() {
+        let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
+            .unwrap();
+
+        // Both endpoints of every range belong to the set.
+        for c in "abcdhijklpqruz".chars() {
+            assert!(tree.contains(c), "expected {:?} to be in the set", c);
+        }
+
+        // Characters in the gaps between ranges, and just outside the first
+        // and last range, do not.
+        for c in "`efgmnostvwxy{".chars() {
+            assert!(!tree.contains(c), "expected {:?} not to be in the set", c);
+        }
+    }
+
+    /// An empty list of ranges has no tree at all.
+    #[test]
+    fn test_character_tree_empty() {
+        assert!(CharacterTree::from_ranges(&[]).is_none());
+    }
+}
--- a/cli/src/generate/mod.rs
+++ b/cli/src/generate/mod.rs
@ -1,4 +1,5 @@
 mod build_tables;
+mod char_tree;
 mod dedup;
 mod grammars;
 mod nfa;
--- a/cli/src/generate/nfa.rs
+++ b/cli/src/generate/nfa.rs
@ -6,11 +6,13 @@ use std::fmt;
 use std::mem::swap;
 use std::ops::Range;

+/// A set of characters represented as a vector of ranges.
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct CharacterSet {
    ranges: Vec<Range<u32>>,
 }

+/// A state in an NFA representing a regular grammar.
 #[derive(Debug, PartialEq, Eq)]
 pub enum NfaState {
    Advance {
@ -54,10 +56,12 @@ impl Default for Nfa {
 const END: u32 = char::MAX as u32 + 1;

 impl CharacterSet {
+    /// Create an empty character set.
    pub fn empty() -> Self {
        CharacterSet { ranges: Vec::new() }
    }

+    /// Create a character set with a given *inclusive* range of characters.
    pub fn from_range(mut first: char, mut last: char) -> Self {
        if first > last {
            swap(&mut first, &mut last);
@ -67,12 +71,15 @@ impl CharacterSet {
        }
    }

+    /// Create a character set with a single character.
    pub fn from_char(c: char) -> Self {
        CharacterSet {
            ranges: vec![(c as u32)..(c as u32 + 1)],
        }
    }

+    /// Create a character set containing all characters *not* present
+    /// in this character set.
    pub fn negate(mut self) -> CharacterSet {
        let mut i = 0;
        let mut previous_end = 0;
@ -146,6 +153,9 @@ impl CharacterSet {
        false
    }

+    /// Get the set of characters that are present in both this set
+    /// and the other set. Remove those common characters from both
+    /// of the operands.
    pub fn remove_intersection(&mut self, other: &mut CharacterSet) -> CharacterSet {
        let mut intersection = Vec::new();
        let mut left_i = 0;
@ -271,6 +281,8 @@ impl CharacterSet {
        self.ranges.is_empty()
    }

+    /// Get a reduced list of character ranges, assuming that a given
+    /// set of characters can be safely ignored.
    pub fn simplify_ignoring<'a>(
        &'a self,
        ruled_out_characters: &'a HashSet<u32>,
--- a/cli/src/generate/prepare_grammar/expand_tokens.rs
+++ b/cli/src/generate/prepare_grammar/expand_tokens.rs
@ -6,15 +6,23 @@ use crate::generate::rules::Rule;
 use lazy_static::lazy_static;
 use regex::Regex;
 use regex_syntax::ast::{
-    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, RepetitionKind, RepetitionRange,
+    parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetItem, ClassUnicodeKind, RepetitionKind,
+    RepetitionRange,
 };
+use std::collections::HashMap;
 use std::i32;

 lazy_static! {
    static ref CURLY_BRACE_REGEX: Regex =
-        Regex::new(r#"(^|[^\\])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
+        Regex::new(r#"(^|[^\\p])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
+    static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
+        serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
+    static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
+        serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
 }

+const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
+const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
 const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];

 struct NfaBuilder {
@ -196,7 +204,7 @@ impl NfaBuilder {
    fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
        match ast {
            Ast::Empty(_) => Ok(false),
-            Ast::Flags(_) => Err(Error::regex("Flags are not supported")),
+            Ast::Flags(_) => Err(Error::regex("Flags are not supported".to_string())),
            Ast::Literal(literal) => {
                self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
                Ok(true)
@ -205,10 +213,15 @@ impl NfaBuilder {
                self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
                Ok(true)
            }
-            Ast::Assertion(_) => Err(Error::regex("Assertions are not supported")),
+            Ast::Assertion(_) => Err(Error::regex("Assertions are not supported".to_string())),
            Ast::Class(class) => match class {
-                Class::Unicode(_) => {
-                    Err(Error::regex("Unicode character classes are not supported"))
+                Class::Unicode(class) => {
+                    let mut chars = self.expand_unicode_character_class(&class.kind)?;
+                    if class.negated {
+                        chars = chars.negate();
+                    }
+                    self.push_advance(chars, next_state_id);
+                    Ok(true)
                }
                Class::Perl(class) => {
                    let mut chars = self.expand_perl_character_class(&class.kind);
@ -228,7 +241,7 @@ impl NfaBuilder {
                        Ok(true)
                    }
                    ClassSet::BinaryOp(_) => Err(Error::regex(
-                        "Binary operators in character classes aren't supported",
+                        "Binary operators in character classes aren't supported".to_string(),
                    )),
                },
            },
@ -355,13 +368,63 @@ impl NfaBuilder {
                Ok(result)
            }
            ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
-            _ => Err(Error::regex(&format!(
+            _ => Err(Error::regex(format!(
                "Unsupported character class syntax {:?}",
                item
            ))),
        }
    }

+    /// Expands a unicode property escape (`\pL`, `\p{Lu}`, `\p{Alphabetic}`)
+    /// into the set of characters that it matches.
+    ///
+    /// * A one-letter class (`\pL`, or a braced name that is a single byte
+    ///   long) matches the union of every entry of `UNICODE_CATEGORIES`
+    ///   whose name starts with that letter.
+    /// * Any other name is looked up first in `UNICODE_CATEGORIES` and then
+    ///   in `UNICODE_PROPERTIES`.
+    ///
+    /// Code points in the tables that are not valid `char`s are skipped.
+    ///
+    /// # Errors
+    ///
+    /// Returns a regex error for key-value properties (`\p{key=value}`) and
+    /// for any class name that matches no table entry. This includes
+    /// one-letter classes that match no category, which would otherwise
+    /// silently produce a character set that matches nothing.
+    fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
+        let (class_name, is_category_prefix) = match class {
+            ClassUnicodeKind::OneLetter(letter) => (letter.to_string(), true),
+            ClassUnicodeKind::Named(name) => (name.clone(), name.len() == 1),
+            ClassUnicodeKind::NamedValue { .. } => {
+                return Err(Error::regex(
+                    "Key-value unicode properties are not supported".to_string(),
+                ));
+            }
+        };
+
+        // Collect every list of code points that the class refers to.
+        let code_point_lists: Vec<&Vec<u32>> = if is_category_prefix {
+            UNICODE_CATEGORIES
+                .iter()
+                .filter(|(category, _)| category.starts_with(class_name.as_str()))
+                .map(|(_, code_points)| code_points)
+                .collect()
+        } else {
+            UNICODE_CATEGORIES
+                .get(class_name.as_str())
+                .or_else(|| UNICODE_PROPERTIES.get(class_name.as_str()))
+                .into_iter()
+                .collect()
+        };
+
+        if code_point_lists.is_empty() {
+            return Err(Error::regex(format!(
+                "Unsupported unicode character class {}",
+                class_name
+            )));
+        }
+
+        let mut chars = CharacterSet::empty();
+        for code_point in code_point_lists.into_iter().flatten() {
+            if let Some(c) = std::char::from_u32(*code_point) {
+                chars = chars.add_char(c);
+            }
+        }
+
+        Ok(chars)
+    }
+
    fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
        match item {
            ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
--- a/cli/src/generate/prepare_grammar/unicode-categories.json
+++ b/cli/src/generate/prepare_grammar/unicode-categories.json
--- a/cli/src/generate/prepare_grammar/unicode-properties.json
+++ b/cli/src/generate/prepare_grammar/unicode-properties.json
--- a/cli/src/generate/render.rs
+++ b/cli/src/generate/render.rs
@ -1,3 +1,4 @@
+use super::char_tree::{CharacterTree, Comparator};
 use super::grammars::{ExternalToken, LexicalGrammar, SyntaxGrammar, VariableType};
 use super::rules::{Alias, AliasMap, Symbol, SymbolType};
 use super::tables::{
@ -714,7 +715,7 @@ impl Generator {
            if info.usage_count > 1 {
                add_line!(
                    self,
-                    "static inline bool {}_character_set_{}(int32_t lookahead) {{",
+                    "static inline bool {}_character_set_{}(int32_t c) {{",
                    self.symbol_ids[&info.symbol],
                    info.index
                );
@ -722,7 +723,8 @@ impl Generator {
                add_line!(self, "return");
                indent!(self);
                add_whitespace!(self);
-                self.add_character_range_conditions(&info.ranges, true, 0);
+                let tree = CharacterTree::from_ranges(&info.ranges);
+                self.add_character_tree(tree.as_ref());
                add!(self, ";\n");
                dedent!(self);
                dedent!(self);
@ -844,16 +846,15 @@ impl Generator {
        ranges: &[Range<char>],
        is_included: bool,
        indent_count: usize,
-    ) -> bool {
+    ) {
        let mut line_break = "\n".to_string();
        for _ in 0..self.indent_level + indent_count {
            line_break.push_str("  ");
        }

-        let mut did_add = false;
-        for range in ranges {
+        for (i, range) in ranges.iter().enumerate() {
            if is_included {
-                if did_add {
+                if i > 0 {
                    add!(self, " ||{}", line_break);
                }
                if range.end == range.start {
@ -872,7 +873,7 @@ impl Generator {
                    add!(self, ")");
                }
            } else {
-                if did_add {
+                if i > 0 {
                    add!(self, " &&{}", line_break);
                }
                if range.end == range.start {
@ -896,9 +897,67 @@ impl Generator {
                    }
                }
            }
-            did_add = true;
        }
-        did_add
+    }
+
+    /// Writes `tree` to the output as a boolean expression over the
+    /// variable `c`.
+    ///
+    /// A missing tree is written as `false` and a `Yes` leaf as `true`.
+    /// A `Compare` node is written in one of four shapes:
+    ///
+    /// * `c <op> <value>` when a successful comparison accepts and there
+    ///   is no alternative;
+    /// * `(c <op> <value> && <consequence>)` when there is no alternative;
+    /// * `(c <op> <value> || <alternative>)` when a successful comparison
+    ///   accepts;
+    /// * a parenthesized `? :` expression, split over indented lines,
+    ///   otherwise.
+    fn add_character_tree(&mut self, tree: Option<&CharacterTree>) {
+        let (value, operator, consequence, alternative) = match tree {
+            None => {
+                add!(self, "false");
+                return;
+            }
+            Some(CharacterTree::Yes) => {
+                add!(self, "true");
+                return;
+            }
+            Some(CharacterTree::Compare {
+                value,
+                operator,
+                consequence,
+                alternative,
+            }) => (
+                *value,
+                operator,
+                consequence.as_ref().map(|subtree| &**subtree),
+                alternative.as_ref().map(|subtree| &**subtree),
+            ),
+        };
+
+        let comparison = match operator {
+            Comparator::Less => "<",
+            Comparator::LessOrEqual => "<=",
+            Comparator::Equal => "==",
+            Comparator::GreaterOrEqual => ">=",
+        };
+        let accepts_on_match = consequence == Some(&CharacterTree::Yes);
+
+        // The comparison alone decides membership: no parentheses needed.
+        if accepts_on_match && alternative.is_none() {
+            add!(self, "c {} ", comparison);
+            self.add_character(value);
+            return;
+        }
+
+        add!(self, "(");
+        add!(self, "c {} ", comparison);
+        self.add_character(value);
+
+        if alternative.is_none() {
+            add!(self, " && ");
+            self.add_character_tree(consequence);
+        } else if accepts_on_match {
+            add!(self, " || ");
+            self.add_character_tree(alternative);
+        } else {
+            add!(self, "\n");
+            indent!(self);
+            add_whitespace!(self);
+            add!(self, "? ");
+            self.add_character_tree(consequence);
+            add!(self, "\n");
+            add_whitespace!(self);
+            add!(self, ": ");
+            self.add_character_tree(alternative);
+            dedent!(self);
+        }
+
+        add!(self, ")");
     }

    fn add_advance_action(&mut self, action: &AdvanceAction) {
--- a/script/generate-unicode-categories-json
+++ b/script/generate-unicode-categories-json
@ -0,0 +1,128 @@
+#!/usr/bin/env node
+
+// This script generates the JSON files that are used by the CLI to handle
+// unicode property escapes.
+//
+// It reads three files from the Unicode Character Database:
+//
+// * UnicodeData.txt - the general category of every assigned code point
+// * PropList.txt and DerivedCoreProperties.txt - binary properties
+//
+// and writes two JSON objects that map a category or property name to the
+// list of code points that have it.
+
+const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json'
+const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json'
+
+const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
+const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt'
+const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt'
+
+const fs = require('fs');
+const path = require('path');
+const {spawnSync} = require('child_process');
+
+// Directory in which the downloaded files are cached.
+const CACHE_DIR = path.join('.', 'target');
+
+// Explicit output limit for curl, so that a multi-megabyte data file cannot
+// be truncated by a smaller default buffer size.
+const MAX_DOWNLOAD_BYTES = 64 * 1024 * 1024;
+
+// UnicodeData.txt describes a large block of code points that share their
+// properties with a pair of lines whose names end with these markers,
+// instead of one line per code point.
+const RANGE_FIRST_SUFFIX = ', First>';
+const RANGE_LAST_SUFFIX = ', Last>';
+
+// Lines whose first character is a hex digit carry data; all other lines in
+// the property files are blank or comments.
+const STARTS_WITH_CODE_POINT = /^[0-9A-Fa-f]/;
+
+// Download the unicode data files, caching them inside the 'target' directory.
+const categoryData = cachedDownload(CATEGORY_URL);
+const propertyData = cachedDownload(PROPERTY_URL);
+const derivedPropertyData = cachedDownload(DERIVED_PROPERTY_URL);
+
+const properties = {};
+parseProperties(propertyData, PROPERTY_URL, properties);
+parseProperties(derivedPropertyData, DERIVED_PROPERTY_URL, properties);
+
+const categories = parseCategories(categoryData, CATEGORY_URL);
+
+fs.writeFileSync(CATEGORY_OUTPUT_PATH, JSON.stringify(categories), 'utf8');
+fs.writeFileSync(PROPERTY_OUTPUT_PATH, JSON.stringify(properties), 'utf8');
+
+console.log(`Wrote ${Object.keys(categories).length} categories to ${CATEGORY_OUTPUT_PATH}`);
+console.log(`Wrote ${Object.keys(properties).length} properties to ${PROPERTY_OUTPUT_PATH}`);
+
+// Return the contents of the file at `url`, downloading it with curl unless
+// a copy already exists in the cache directory. A failed download throws
+// and is never cached.
+function cachedDownload(url) {
+    const downloadPath = path.join(CACHE_DIR, path.basename(url));
+    if (fs.existsSync(downloadPath)) {
+        return fs.readFileSync(downloadPath, 'utf8');
+    }
+
+    const result = spawnSync(
+        'curl',
+        ['--fail', '--silent', '--show-error', '--location', url],
+        {encoding: 'utf8', maxBuffer: MAX_DOWNLOAD_BYTES}
+    );
+    if (result.error) {
+        throw result.error;
+    }
+    if (result.status !== 0) {
+        throw new Error(`Failed to download ${url}: ${result.stderr}`);
+    }
+
+    fs.mkdirSync(CACHE_DIR, {recursive: true});
+    fs.writeFileSync(downloadPath, result.stdout, 'utf8');
+    return result.stdout;
+}
+
+// Parse a property file (PropList.txt or DerivedCoreProperties.txt) and add
+// its code points to `properties`, grouped by property name.
+//
+// Each data line starts with two semicolon-separated fields, followed by a
+// '#' comment:
+// * code point or code point range (separated by '..')
+// * property
+function parseProperties(data, source, properties) {
+    data.split(/\r?\n/).forEach((line, index) => {
+        // Skip over blank and comment lines
+        if (!STARTS_WITH_CODE_POINT.test(line)) return;
+
+        const fields = line.split('#', 1)[0].split(';');
+        if (fields.length < 2) {
+            throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+        }
+
+        // A single code point is treated as a range of length one.
+        const [first, last = first] = fields[0].trim()
+            .split('..')
+            .map(p => parseInt(p, 16));
+        const property = fields[1].trim();
+        if (Number.isNaN(first) || Number.isNaN(last) || property.length === 0) {
+            throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+        }
+
+        if (!properties[property]) {
+            properties[property] = [];
+        }
+        for (let c = first; c <= last; c++) {
+            properties[property].push(c);
+        }
+    });
+}
+
+// Parse UnicodeData.txt and return its code points grouped by category.
+//
+// Each line starts with three semicolon-separated fields:
+// * code point (hexadecimal)
+// * name
+// * category
+//
+// A line normally represents one code point. A pair of lines named
+// '<..., First>' and '<..., Last>' represents every code point from the
+// first to the last, inclusive.
+function parseCategories(data, source) {
+    const categories = {};
+    let pendingRange = null;
+
+    data.split(/\r?\n/).forEach((line, index) => {
+        if (line.length === 0) return;
+
+        const fields = line.split(';');
+        const codePoint = parseInt(fields[0], 16);
+        if (fields.length < 3 || Number.isNaN(codePoint)) {
+            throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+        }
+        const name = fields[1];
+        const category = fields[2];
+
+        // Remember the start of a range until its 'Last' line is reached.
+        if (name.endsWith(RANGE_FIRST_SUFFIX)) {
+            if (pendingRange !== null) {
+                throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+            }
+            pendingRange = {codePoint, category};
+            return;
+        }
+
+        let first = codePoint;
+        if (name.endsWith(RANGE_LAST_SUFFIX)) {
+            if (pendingRange === null || pendingRange.category !== category) {
+                throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+            }
+            first = pendingRange.codePoint;
+            pendingRange = null;
+        } else if (pendingRange !== null) {
+            throw new Error(`Unexpected format on line ${index + 1} of ${source}`);
+        }
+
+        // Group the code points by their category.
+        if (!categories[category]) {
+            categories[category] = [];
+        }
+        for (let c = first; c <= codePoint; c++) {
+            categories[category].push(c);
+        }
+    });
+
+    if (pendingRange !== null) {
+        throw new Error(`Unterminated code point range at the end of ${source}`);
+    }
+
+    return categories;
+}
--- a/test/fixtures/test_grammars/unicode_classes/corpus.txt
+++ b/test/fixtures/test_grammars/unicode_classes/corpus.txt
@ -0,0 +1,32 @@
+===============
+Uppercase words
+===============
+
+Δბㄱ  Ψ  Ɓƀ  Ƒ  Ɣ  Śřř
+
+---
+
+(program
+  (upper) (upper) (upper) (upper) (upper) (upper))
+
+================
+Lowercase words
+================
+
+śś  ťť  ßß
+
+---
+
+(program
+  (lower) (lower) (lower))
+
+================
+Math symbols
+================
+
+≺ ≼ ≠ ≝ ⨔∑
+
+---
+
+(program
+  (math_sym) (math_sym) (math_sym) (math_sym) (math_sym))
--- a/test/fixtures/test_grammars/unicode_classes/grammar.json
+++ b/test/fixtures/test_grammars/unicode_classes/grammar.json
@ -0,0 +1,36 @@
+{
+  "name": "unicode_classes",
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"}
+  ],
+
+  "rules": {
+    "program":  {
+      "type": "REPEAT",
+      "content": {
+        "type": "CHOICE",
+        "members": [
+          {"type": "SYMBOL", "name": "lower"},
+          {"type": "SYMBOL", "name": "upper"},
+          {"type": "SYMBOL", "name": "math_sym"}
+        ]
+      }
+    },
+
+    "lower": {
+      "type": "PATTERN",
+      "value": "\\p{Ll}\\p{L}*"
+    },
+
+    "upper": {
+      "type": "PATTERN",
+      "value": "\\p{Lu}\\p{L}*"
+    },
+
+    "math_sym": {
+      "type": "PATTERN",
+      "value": "\\p{Sm}+"
+    }
+  }
+}