Represent character sets as sets of character ranges

This commit is contained in:
Max Brunsfeld 2014-02-05 18:56:04 -08:00
parent 8cce11a52a
commit d3d25f2683
17 changed files with 551 additions and 499 deletions

View file

@ -1,7 +1,7 @@
#ifndef __tree_sitter__item_set_transitions__
#define __tree_sitter__item_set_transitions__
#include "character.h"
#include "character_set.h"
#include "symbol.h"
#include "transition_map.h"
#include "item.h"

View file

@ -8,6 +8,8 @@ using std::to_string;
using std::unordered_map;
using std::unordered_set;
using std::vector;
using std::set;
using std::pair;
namespace tree_sitter {
namespace generate_code {
@ -101,33 +103,30 @@ namespace tree_sitter {
}
}
string condition_for_character_match(const rules::CharacterRange &match) {
string condition_for_character_range(const rules::CharacterRange &range) {
string lookahead("LOOKAHEAD_CHAR()");
auto value = match.value;
switch (match.type) {
case rules::CharacterRangeTypeClass:
switch (value.character_class) {
case rules::CharClassDigit:
return string("isdigit(") + lookahead + ")";
case rules::CharClassWord:
return string("isalnum(") + lookahead + ")";
}
case rules::CharacterRangeTypeSpecific:
return lookahead + " == '" + character_code(value.character) + "'";
case rules::CharacterRangeTypeRange:
return string("'") + value.range.min_character + string("' <= ") + lookahead +
" && " + lookahead + " <= '" + value.range.max_character + "'";
if (range.min == range.max) {
return lookahead + " == '" + character_code(range.min) + "'";
} else {
return string("'") + range.min + string("' <= ") + lookahead +
" && " + lookahead + " <= '" + range.max + "'";
}
}
// Builds the condition text for a whole character set: one parenthesised
// clause per range (from condition_for_character_range), joined with " ||".
string condition_for_character_set(const rules::CharacterSet &set) {
    vector<string> clauses;
    for (auto it = set.ranges.begin(); it != set.ranges.end(); ++it) {
        string clause = "(";
        clause += condition_for_character_range(*it);
        clause += ")";
        clauses.push_back(clause);
    }
    return join(clauses, " ||\n ");
}
string condition_for_character_rule(const rules::CharacterSet &rule) {
vector<string> parts;
for (auto &match : rule.ranges) {
parts.push_back("(" + condition_for_character_match(match) + ")");
}
string result = join(parts, " ||\n ");
if (!rule.sign) result = "!(" + result + ")";
return result;
pair<rules::CharacterSet, bool> representation = rule.most_compact_representation();
if (representation.second)
return condition_for_character_set(representation.first);
else
return "!(" + condition_for_character_set(rule.complement()) + ")";
}
string collapse_flags(vector<bool> flags) {
@ -177,17 +176,16 @@ namespace tree_sitter {
}
string lex_error_call(const unordered_set<rules::CharacterSet> &expected_inputs) {
unordered_set<rules::CharacterRange> expected_matches;
rules::CharacterSet expected_set;
for (auto &rule : expected_inputs)
for (auto &match : rule.ranges)
expected_matches.insert(match);
expected_set.union_with(rule);
string result = "LEX_ERROR(" + to_string(expected_matches.size()) + ", EXPECT({";
string result = "LEX_ERROR(" + to_string(expected_set.ranges.size()) + ", EXPECT({";
bool started = false;
for (auto match : expected_matches) {
for (auto &ranges : expected_set.ranges) {
if (started) result += ", ";
started = true;
result += "\"" + escape_string(match.to_string()) + "\"";
result += "\"" + escape_string(ranges.to_string()) + "\"";
}
result += "}));";
return result;

View file

@ -6,7 +6,7 @@
#include <string>
#include <unordered_set>
#include "symbol.h"
#include "character.h"
#include "character_set.h"
namespace tree_sitter {
typedef enum {

View file

@ -1,84 +0,0 @@
#include "rules.h"
using std::string;
using std::hash;
namespace tree_sitter {
namespace rules {
// Matches exactly one character.
CharacterRange::CharacterRange(char character)
    : type(CharacterRangeTypeSpecific) {
    value.character = character;
}

// Matches a named character class.
CharacterRange::CharacterRange(CharacterClass klass)
    : type(CharacterRangeTypeClass) {
    value.character_class = klass;
}

// Matches an inclusive range; `bounds.first` is the minimum and
// `bounds.second` the maximum character.
CharacterRange::CharacterRange(const std::pair<char, char> bounds)
    : type(CharacterRangeTypeRange) {
    value.range.max_character = bounds.second;
    value.range.min_character = bounds.first;
}
// Two matchers are equal when they carry the same type tag and the union
// member selected by that tag compares equal.
bool CharacterRange::operator==(const CharacterRange &right) const {
    if (type != right.type)
        return false;
    switch (type) {
        case CharacterRangeTypeClass:
            return (value.character_class == right.value.character_class);
        case CharacterRangeTypeSpecific:
            return (value.character == right.value.character);
        case CharacterRangeTypeRange:
            return (value.range.min_character == right.value.range.min_character &&
                    value.range.max_character == right.value.range.max_character);
    }
    // An out-of-range tag matches no case above; previously control fell off
    // the end of this non-void function, which is undefined behaviour.
    return false;
}
// Human-readable description of the matcher: "<digit>" / "<word>" for
// classes, "<EOF>" or a quoted character for a specific character, and
// 'min'-'max' for a range.
string CharacterRange::to_string() const {
    switch (type) {
        case CharacterRangeTypeClass:
            switch (value.character_class) {
                case CharClassDigit:
                    return "<digit>";
                case CharClassWord:
                    return "<word>";
            }
            // Unknown class: stop here instead of falling through into the
            // Specific case, which would read the wrong union member.
            break;
        case CharacterRangeTypeSpecific:
            return (value.character == '\0') ?
                "<EOF>" :
                string("'") + value.character + "'";
        case CharacterRangeTypeRange:
            return (string("'") +
                    value.range.min_character + "'-'" +
                    value.range.max_character + "'");
    }
    // Reached only for an unrecognised tag or class; always return a value.
    return "<unknown>";
}
// Affirmative set containing one specific character.
CharacterSet::CharacterSet(char character)
    : ranges({ CharacterRange(character) }),
      sign(true) {}

// Affirmative set containing one character class.
CharacterSet::CharacterSet(CharacterClass char_class)
    : ranges({ CharacterRange(char_class) }),
      sign(true) {}

// Set built from explicit matchers; `sign` is stored as given
// (to_string() prints " (not)" when it is false).
CharacterSet::CharacterSet(const std::unordered_set<CharacterRange> &ranges, bool sign)
    : ranges(ranges),
      sign(sign) {}
// Polymorphic equality: false unless `rule` is itself a CharacterSet, in
// which case the CharacterSet overload decides.
bool CharacterSet::operator==(const Rule &rule) const {
    const CharacterSet *other = dynamic_cast<const CharacterSet *>(&rule);
    if (!other)
        return false;
    return this->operator==(*other);
}
// Sets are equal when both the sign and the collection of matchers agree.
bool CharacterSet::operator==(const CharacterSet &other) const {
    const bool sign_differs = (other.sign != sign);
    return !sign_differs && !(other.ranges != ranges);
}
// Hash derived from the textual form of the set, mixed with a type-specific
// value obtained from typeid.
size_t CharacterSet::hash_code() const {
    const size_t type_part = typeid(this).hash_code();
    const size_t text_part = hash<string>()(to_string());
    return type_part ^ text_part;
}
// Returns a newly allocated copy of this set as a rule_ptr.
rule_ptr CharacterSet::copy() const {
    rule_ptr duplicate = std::make_shared<CharacterSet>(*this);
    return duplicate;
}
// Renders the set as "#<char [ (not)] m1 m2 ...>", one entry per matcher in
// the container's iteration order.
string CharacterSet::to_string() const {
    string text = "#<char";
    if (!sign)
        text += " (not)";
    for (auto it = ranges.begin(); it != ranges.end(); ++it) {
        text += " ";
        text += it->to_string();
    }
    text += ">";
    return text;
}
// Visitor entry point: hands this set to the visitor's visit() overload.
void CharacterSet::accept(Visitor &visitor) const {
    const CharacterSet *self = this;
    visitor.visit(self);
}
}
}

View file

@ -1,88 +0,0 @@
#ifndef __tree_sitter__character__
#define __tree_sitter__character__
#include "rule.h"
#include <unordered_set>
namespace tree_sitter {
namespace rules {
// Named character classes a matcher can refer to.
enum CharacterClass {
    CharClassWord,
    CharClassDigit
};
// Tag telling which kind of matcher a CharacterRange holds.
enum CharacterRangeType {
    CharacterRangeTypeSpecific,
    CharacterRangeTypeClass,
    CharacterRangeTypeRange
};
struct CharacterRange {
CharacterRangeType type;
union {
CharacterClass character_class;
char character;
struct {
char min_character;
char max_character;
} range;
} value;
CharacterRange(char);
CharacterRange(const std::pair<char, char>);
CharacterRange(CharacterClass);
bool operator==(const CharacterRange &) const;
std::string to_string() const;
};
}
}
namespace std {
    // Hash for CharacterRange: mixes the type tag with the union member that
    // the tag selects. Equal matchers (per operator==) hash equally.
    template<>
    struct hash<tree_sitter::rules::CharacterRange> {
        size_t operator()(const tree_sitter::rules::CharacterRange &match) const {
            auto type = match.type;
            auto result = hash<short int>()(type);
            switch (type) {
                case tree_sitter::rules::CharacterRangeTypeClass:
                    result ^= hash<short int>()(match.value.character_class);
                    break;  // previously fell through and hashed inactive union members
                case tree_sitter::rules::CharacterRangeTypeRange:
                    result ^= hash<char>()(match.value.range.min_character);
                    result ^= hash<char>()(match.value.range.max_character);
                    break;  // previously fell through into the Specific case
                case tree_sitter::rules::CharacterRangeTypeSpecific:
                    result ^= hash<char>()(match.value.character);
                    break;
            }
            return result;
        }
    };
}
namespace tree_sitter {
namespace rules {
// Rule matching one character against a collection of matchers; `sign`
// false marks the set as negated (printed as " (not)" by to_string()).
class CharacterSet : public Rule {
public:
    CharacterSet(char character);
    CharacterSet(CharacterClass character_class);
    // NOTE(review): confirm a definition exists for this overload; none is
    // visible next to the other constructors.
    CharacterSet(char min_character, char max_character);
    CharacterSet(const std::unordered_set<CharacterRange> &matches, bool sign);

    // Comparison and hashing.
    bool operator==(const Rule &other) const;
    bool operator==(const CharacterSet &other) const;
    size_t hash_code() const;

    // Rule interface.
    rule_ptr copy() const;
    std::string to_string() const;
    void accept(Visitor &visitor) const;

    std::unordered_set<CharacterRange> ranges;
    bool sign;
};
}
}
namespace std {
    // CharacterSet is hashed through the hash defined for its base, Rule.
    template<>
    struct hash<tree_sitter::rules::CharacterSet>
        : public hash<tree_sitter::rules::Rule> {};
}
#endif

View file

@ -0,0 +1,128 @@
#include "rules.h"
using std::string;
using std::hash;
using std::set;
namespace tree_sitter {
namespace rules {
// Upper bound of the character domain as used in this file: complement()
// closes its last range with it, and CharacterRange::to_string() prints a
// range ending here as open-ended ("x->"). -1 converts to the all-ones char
// value. NOTE(review): where plain char is signed this value compares less
// than 0 — confirm the relational comparisons on min/max account for that.
char MAX_CHAR = -1;
// Range holding a single character: both bounds are `value`.
CharacterRange::CharacterRange(char value)
    : min(value),
      max(value) {}

// Inclusive range [min, max]; the bounds are stored as given.
CharacterRange::CharacterRange(char min, char max)
    : min(min),
      max(max) {}
// Ranges are equal when both bounds match.
bool CharacterRange::operator==(const CharacterRange &other) const {
    if (min != other.min)
        return false;
    return max == other.max;
}
// Strict ordering used by std::set<CharacterRange>: by lower bound, then by
// upper bound.
//
// Bounds are compared as unsigned char so that the order does not depend on
// whether plain `char` is signed. This file uses 0 as the bottom of the
// domain and MAX_CHAR (-1, i.e. all bits set) as the top; a signed
// comparison would sort ranges starting at or above 0x80 before the range
// starting at 0.
bool CharacterRange::operator<(const CharacterRange &other) const {
    const unsigned char lhs_min = static_cast<unsigned char>(min);
    const unsigned char rhs_min = static_cast<unsigned char>(other.min);
    if (lhs_min != rhs_min)
        return lhs_min < rhs_min;
    return static_cast<unsigned char>(max) < static_cast<unsigned char>(other.max);
}
// Returns a printable form of `input`: the NUL character becomes the two
// characters backslash and '0'; every other character is returned as a
// one-character string.
string escape_character(char input) {
    if (input == '\0')
        return "\\0";
    return string(1, input);
}
// True when this range and `other` overlap or touch, i.e. when merging them
// yields one contiguous range.
//
// Two inclusive ranges [a, b] and [c, d] overlap or touch exactly when
// a <= d + 1 and c <= b + 1. The earlier two-clause test missed the case
// where `other` strictly contains this range (e.g. this = b-c, other = a-z),
// which left overlapping ranges behind after a union.
//
// The +1 is evaluated in int (after integral promotion), so it cannot wrap.
// Bounds are compared as plain char values, the same way add_range()
// compares them.
bool CharacterRange::is_adjacent(const CharacterRange &other) const {
    const int this_min = min;
    const int this_max = max;
    const int other_min = other.min;
    const int other_max = other.max;
    return this_min <= other_max + 1 && other_min <= this_max + 1;
}
// Grows this range in place so that it also covers `other`: the lower bound
// moves down and the upper bound moves up where `other` extends further.
void CharacterRange::add_range(const CharacterRange &other) {
    min = (other.min < min) ? other.min : min;
    max = (other.max > max) ? other.max : max;
}
// Textual form of the range:
//   single character  -> escape_character(min)
//   starts at 0       -> "<-" followed by max
//   ends at MAX_CHAR  -> min followed by "->"
//   otherwise         -> min "-" max
string CharacterRange::to_string() const {
    if (min == max)
        return escape_character(min);
    if (min == 0)
        return string("<-") + max;
    if (max == MAX_CHAR)
        return string() + min + "->";
    return string() + min + "-" + max;
}
// Empty set: matches no character.
CharacterSet::CharacterSet() : ranges() {}

// Set made of exactly the given ranges.
CharacterSet::CharacterSet(const set<CharacterRange> &ranges)
    : ranges(ranges) {}

// With `sign` true this stores the given ranges; with `sign` false it stores
// the complement of the set they describe.
CharacterSet::CharacterSet(const set<CharacterRange> &ranges, bool sign)
    : ranges(ranges) {
    if (!sign)
        this->ranges = CharacterSet(ranges).complement().ranges;
}
// Equal only to another CharacterSet holding the same ranges.
bool CharacterSet::operator==(const Rule &rule) const {
    const CharacterSet *other = dynamic_cast<const CharacterSet *>(&rule);
    if (!other)
        return false;
    return ranges == other->ranges;
}
// Hash derived from the textual form of the set, mixed with a type-specific
// value obtained from typeid.
size_t CharacterSet::hash_code() const {
    const size_t type_part = typeid(this).hash_code();
    const size_t text_part = hash<string>()(to_string());
    return type_part ^ text_part;
}
// Returns a newly allocated copy of this set as a rule_ptr.
rule_ptr CharacterSet::copy() const {
    rule_ptr duplicate = std::make_shared<CharacterSet>(*this);
    return duplicate;
}
// Renders the set as "#<char { r1 r2 ... }>", listing each range's own
// to_string() in the set's iteration order.
string CharacterSet::to_string() const {
    string text = "#<char {";
    for (auto it = ranges.begin(); it != ranges.end(); ++it) {
        text += " ";
        text += it->to_string();
    }
    text += " }>";
    return text;
}
// Returns the set of all characters in [0, MAX_CHAR] that this set does not
// contain.
//
// Assumes `ranges` iterates in ascending order of character value. The walk
// is done in int over the unsigned character values so that:
//   * the empty set complements to the full range [0, MAX_CHAR] (previously
//     it complemented to the empty set, because "nothing seen yet" and
//     "wrapped past MAX_CHAR" were both encoded as current_char == 0);
//   * "one past the top" is an ordinary value instead of a char wrap-around;
//   * touching or overlapping input ranges never produce an inverted gap.
CharacterSet CharacterSet::complement() const {
    set<CharacterRange> result;
    const int top = static_cast<unsigned char>(MAX_CHAR);
    int next_uncovered = 0;  // lowest character not yet known to be in this set
    for (auto &range : ranges) {
        const int range_min = static_cast<unsigned char>(range.min);
        const int range_max = static_cast<unsigned char>(range.max);
        if (range_min > next_uncovered)
            result.insert(CharacterRange(static_cast<char>(next_uncovered),
                                         static_cast<char>(range_min - 1)));
        if (range_max + 1 > next_uncovered)
            next_uncovered = range_max + 1;
    }
    // Anything left above the last range, up to and including MAX_CHAR.
    if (next_uncovered <= top)
        result.insert(CharacterRange(static_cast<char>(next_uncovered), MAX_CHAR));
    return CharacterSet(result);
}
std::pair<CharacterSet, bool> CharacterSet::most_compact_representation() const {
auto first_range = *ranges.begin();
if (first_range.min == 0 && first_range.max > 0) {
return { this->complement(), false };
} else {
return { *this, true };
}
}
// Inserts `new_range` into `self`, absorbing every existing range that
// is_adjacent() reports as mergeable with it; all other ranges are kept
// unchanged.
void add_range(CharacterSet *self, CharacterRange new_range) {
    set<CharacterRange> merged;
    for (const auto &existing : self->ranges) {
        if (!existing.is_adjacent(new_range)) {
            merged.insert(existing);
            continue;
        }
        // Fold the existing range into the one being inserted.
        new_range.add_range(existing);
    }
    merged.insert(new_range);
    self->ranges = merged;
}
// Adds every range of `other` to this set, one at a time, through
// add_range().
void CharacterSet::union_with(const CharacterSet &other) {
    for (auto it = other.ranges.begin(); it != other.ranges.end(); ++it)
        add_range(this, *it);
}
// Visitor entry point: hands this set to the visitor's visit() overload.
void CharacterSet::accept(Visitor &visitor) const {
    const CharacterSet *self = this;
    visitor.visit(self);
}
}
}

View file

@ -0,0 +1,64 @@
#ifndef __tree_sitter__character_set__
#define __tree_sitter__character_set__
#include "rule.h"
#include <set>
namespace tree_sitter {
namespace rules {
// An inclusive range of characters, [min, max]. A single character is
// represented with min == max.
struct CharacterRange {
    char min;
    char max;

    // Implicit conversion from a single char is part of the interface.
    CharacterRange(char value);
    CharacterRange(char min, char max);

    // Comparison; operator< lets ranges be stored in a std::set.
    bool operator==(const CharacterRange &other) const;
    bool operator<(const CharacterRange &other) const;

    // Merging helpers: test whether two ranges can be merged, and widen this
    // range to cover another.
    bool is_adjacent(const CharacterRange &other) const;
    void add_range(const CharacterRange &other);

    std::string to_string() const;
};
}
}
namespace std {
    // Hash for CharacterRange.
    //
    // The two bounds are packed into one integer (min in the higher bits,
    // max in the low 8 bits) before hashing. XOR-ing the two per-bound
    // hashes, as before, produced 0 for every single-character range
    // (min == max), so all of them collided.
    template<>
    struct hash<tree_sitter::rules::CharacterRange> {
        size_t operator()(const tree_sitter::rules::CharacterRange &range) const {
            const size_t low = static_cast<unsigned char>(range.min);
            const size_t high = static_cast<unsigned char>(range.max);
            return hash<size_t>()((low << 8) | high);
        }
    };
}
namespace tree_sitter {
namespace rules {
// Rule matching one character against a set of inclusive character ranges.
class CharacterSet : public Rule {
public:
    CharacterSet();
    CharacterSet(const std::set<CharacterRange> &ranges);
    // `sign` false stores the complement of the given ranges.
    CharacterSet(const std::set<CharacterRange> &ranges, bool sign);

    // Set operations.
    CharacterSet complement() const;
    void union_with(const CharacterSet &other);
    // Returns either {this set, true} or {its complement, false}.
    std::pair<CharacterSet, bool> most_compact_representation() const;

    // Rule interface.
    bool operator==(const Rule &other) const;
    size_t hash_code() const;
    rule_ptr copy() const;
    std::string to_string() const;
    void accept(Visitor &visitor) const;

    std::set<CharacterRange> ranges;
};
}
}
namespace std {
    // CharacterSet is hashed through the hash defined for its base, Rule.
    template<>
    struct hash<tree_sitter::rules::CharacterSet>
        : public hash<tree_sitter::rules::Rule> {};
}
#endif

View file

@ -2,6 +2,7 @@
using std::string;
using std::hash;
using std::set;
namespace tree_sitter {
namespace rules {
@ -38,18 +39,6 @@ namespace tree_sitter {
return result;
}
// Parses the inside of a bracket expression: an optional leading '^'
// (negation) followed by matchers read with single_char() until ']' or the
// end of input. The closing ']' is not consumed here.
rule_ptr char_set() {
    const bool negated = (peek() == '^');
    if (negated)
        next();

    std::unordered_set<CharacterRange> matches;
    while (has_more_input() && (peek() != ']')) {
        matches.insert(single_char());
    }
    return character(matches, !negated);
}
rule_ptr atom() {
rule_ptr result;
switch (peek()) {
@ -63,7 +52,7 @@ namespace tree_sitter {
break;
case '[':
next();
result = char_set();
result = char_set().copy();
if (peek() != ']')
error("mismatched square brackets");
else
@ -73,13 +62,25 @@ namespace tree_sitter {
error("mismatched parens");
break;
default:
result = character({ single_char() }, true);
result = single_char().copy();
}
return result;
}
CharacterRange single_char() {
CharacterRange value('\0');
// Parses the inside of a bracket expression: an optional leading '^'
// (negation) followed by entries read with single_char() until ']' or the
// end of input. Entries are unioned together; a negated expression returns
// the complement of that union. The closing ']' is not consumed here.
CharacterSet char_set() {
    const bool negated = (peek() == '^');
    if (negated)
        next();

    CharacterSet result;
    while (has_more_input() && (peek() != ']')) {
        result.union_with(single_char());
    }
    if (negated)
        return result.complement();
    return result;
}
CharacterSet single_char() {
CharacterSet value({ '\0' });
switch (peek()) {
case '\\':
next();
@ -91,28 +92,28 @@ namespace tree_sitter {
next();
if (peek() == '-') {
next();
value = CharacterRange({ first_char, peek() });
value = CharacterSet({ {first_char, peek()} }, true);
next();
} else {
value = first_char;
value = CharacterSet({ first_char });
}
}
return value;
}
CharacterRange escaped_char(char value) {
CharacterSet escaped_char(char value) {
switch (value) {
case '\\':
case '(':
case ')':
return value;
return CharacterSet({ value });
case 'w':
return CharClassWord;
return CharacterSet({{'a', 'z'}, {'A', 'Z'}}, true);
case 'd':
return CharClassDigit;
return CharacterSet({{'0', '9'}}, true);
default:
error("unrecognized escape sequence");
return '\0';
return CharacterSet();
}
}

View file

@ -3,6 +3,7 @@
using std::make_shared;
using std::string;
using std::initializer_list;
using std::set;
namespace tree_sitter {
namespace rules {
@ -11,15 +12,16 @@ namespace tree_sitter {
}
rule_ptr character(char value) {
return make_shared<CharacterSet>(value);
set<CharacterRange> ranges = { value };
return make_shared<CharacterSet>(ranges);
}
rule_ptr character(CharacterClass value) {
return make_shared<CharacterSet>(value);
rule_ptr character(const set<CharacterRange> &ranges) {
return make_shared<CharacterSet>(ranges);
}
rule_ptr character(const std::unordered_set<CharacterRange> &matches, bool is_affirmative) {
return make_shared<CharacterSet>(matches, is_affirmative);
rule_ptr character(const set<CharacterRange> &ranges, bool sign) {
return make_shared<CharacterSet>(ranges, sign);
}
rule_ptr choice(const initializer_list<rule_ptr> &rules) {

View file

@ -8,7 +8,7 @@
#include "seq.h"
#include "string.h"
#include "pattern.h"
#include "character.h"
#include "character_set.h"
#include "repeat.h"
#include "visitor.h"
@ -16,9 +16,8 @@ namespace tree_sitter {
namespace rules {
rule_ptr blank();
rule_ptr character(char value);
rule_ptr character(CharacterClass value);
rule_ptr character(const std::unordered_set<CharacterRange> &matches);
rule_ptr character(const std::unordered_set<CharacterRange> &matches, bool);
rule_ptr character(const std::set<CharacterRange> &matches);
rule_ptr character(const std::set<CharacterRange> &matches, bool);
rule_ptr choice(const std::initializer_list<rule_ptr> &rules);
rule_ptr pattern(const std::string &value);