Allow Character rules to handle arbitrary character sets

This commit is contained in:
Max Brunsfeld 2014-01-29 19:18:21 -08:00
parent bc1d115ee2
commit 7f62e752be
16 changed files with 322 additions and 309 deletions

View file

@ -49,7 +49,7 @@ namespace tree_sitter {
rules::Character rule = *transition.first;
LexItemSet item_set = *transition.second;
size_t new_state_index = add_lex_state(item_set);
lex_table.add_action(state_index, rule.value, LexAction::Advance(new_state_index));
lex_table.add_action(state_index, rule, LexAction::Advance(new_state_index));
}
}

View file

@ -1,62 +0,0 @@
#include "char_match.h"
using std::string;
namespace tree_sitter {
// Builds a CharMatch that matches exactly one character.
CharMatch CharMatchSpecific(char value) {
    CharMatch match = CharMatch();  // zero-initialise before filling in the active member
    match.type = CharMatchTypeSpecific;
    match.value.character = value;
    return match;
}
// Builds a CharMatch that matches any character belonging to the given class.
CharMatch CharMatchClass(CharClass value) {
    CharMatch result = { .type = CharMatchTypeClass };
    // Store into the member that every reader of a Class match uses
    // (character_class), not the one-byte `character` member.
    result.value.character_class = value;
    return result;
}
// Builds a CharMatch describing the character range from `min` to `max`.
CharMatch CharMatchRange(char min, char max) {
    CharMatch match = CharMatch();  // zero-initialise before filling in the active member
    match.type = CharMatchTypeRange;
    match.value.range.min_character = min;
    match.value.range.max_character = max;
    return match;
}
// Returns a human-readable description of a CharMatch:
//   class    -> "<digit>" / "<word>"
//   specific -> 'c'
//   range    -> 'a'-'z'
// Anything unrecognised yields "<unknown>".
string CharMatchToString(CharMatch match) {
    switch (match.type) {
        case CharMatchTypeClass:
            switch (match.value.character_class) {
                case CharClassDigit:
                    return "<digit>";
                case CharClassWord:
                    return "<word>";
            }
            // Unrecognised class: leave the switch instead of falling
            // through into the Specific case.
            break;
        case CharMatchTypeSpecific:
            // Append the char itself; constructing a string from a pointer to
            // a single char would read past it looking for a terminator.
            return string("'") + match.value.character + "'";
        case CharMatchTypeRange:
            return (
                string("'") +
                match.value.range.min_character + "'-'" +
                match.value.range.max_character + "'");
    }
    return "<unknown>";
}
// Two CharMatches are equal when they have the same type and the member
// selected by that type holds the same value.
bool operator==(const CharMatch &left, const CharMatch &right) {
    if (left.type != right.type)
        return false;
    switch (left.type) {
        case CharMatchTypeClass:
            return (left.value.character_class == right.value.character_class);
        case CharMatchTypeSpecific:
            return (left.value.character == right.value.character);
        case CharMatchTypeRange:
            return (
                left.value.range.min_character == right.value.range.min_character &&
                left.value.range.max_character == right.value.range.max_character);
    }
    // Unrecognised type: return a definite answer rather than flowing off the
    // end of a non-void function.
    return false;
}
std::ostream& operator<<(std::ostream& stream, const CharMatch &match) {
return stream << CharMatchToString(match);
}
}

View file

@ -1,52 +0,0 @@
#ifndef __TreeSitter__char_match__
#define __TreeSitter__char_match__
#include <unordered_map>
#include <string>
namespace tree_sitter {
    // Tag recording which member of CharMatch::value is in use.
    enum CharMatchType {
        CharMatchTypeSpecific,
        CharMatchTypeClass,
        CharMatchTypeRange,
    };

    // Named character classes a CharMatch can refer to.
    enum CharClass {
        CharClassWord,
        CharClassDigit
    };

    // Tagged union describing one character-matching condition.
    struct CharMatch {
        CharMatchType type;
        union {
            CharClass character_class;  // used with CharMatchTypeClass
            char character;             // used with CharMatchTypeSpecific
            struct {
                char min_character;
                char max_character;
            } range;                    // used with CharMatchTypeRange
        } value;
    };

    // Factory functions, one per CharMatchType.
    CharMatch CharMatchSpecific(char);
    CharMatch CharMatchClass(CharClass);
    CharMatch CharMatchRange(char, char);

    // Description, equality and stream output for CharMatch values.
    std::string CharMatchToString(CharMatch);
    bool operator==(const CharMatch &, const CharMatch &);
    std::ostream& operator<<(std::ostream& stream, const CharMatch &rule);
}
namespace std {
    // Hash for tree_sitter::CharMatch. Only the union member selected by
    // `type` is read, mirroring the fields operator== compares, so equal
    // values always hash equally.
    template<>
    struct hash<tree_sitter::CharMatch> {
        size_t operator()(const tree_sitter::CharMatch &match) const {
            size_t result = hash<int>()(match.type);
            switch (match.type) {
                case tree_sitter::CharMatchTypeClass:
                    result ^= hash<int>()(match.value.character_class);
                    break;
                case tree_sitter::CharMatchTypeSpecific:
                    result ^= hash<char>()(match.value.character);
                    break;
                case tree_sitter::CharMatchTypeRange:
                    result ^= hash<char>()(match.value.range.min_character);
                    // Shift the upper bound so that min == max does not cancel out.
                    result ^= hash<char>()(match.value.range.max_character) << 1;
                    break;
            }
            return result;
        }
    };
}
#endif

View file

@ -101,23 +101,33 @@ namespace tree_sitter {
}
}
string condition_for_char_match(const CharMatch &char_match) {
string condition_for_character_match(const rules::CharacterMatch &match) {
auto value = "LOOKAHEAD_CHAR()";
switch (char_match.type) {
case CharMatchTypeClass:
switch (char_match.value.character_class) {
case CharClassDigit:
switch (match.type) {
case rules::CharacterMatchTypeClass:
switch (match.value.character_class) {
case rules::CharClassDigit:
return string("isdigit(") + value + ")";
case CharClassWord:
case rules::CharClassWord:
return string("isalnum(") + value + ")";
}
case CharMatchTypeSpecific:
return string(value) + " == '" + character_code(char_match.value.character) + "'";
case rules::CharacterMatchTypeSpecific:
return string(value) + " == '" + character_code(match.value.character) + "'";
default:
return "";
}
}
// Builds the boolean expression that tests a Character rule: the conditions
// of its individual matches joined with " || ", wrapped in "!( ... )" when
// the rule's sign is false.
string condition_for_character_rule(const rules::Character &rule) {
    vector<string> conditions;
    for (auto &character_match : rule.matches)
        conditions.push_back(condition_for_character_match(character_match));

    string disjunction = join(conditions, " || ");
    if (rule.sign)
        return disjunction;
    return "!(" + disjunction + ")";
}
string collapse_flags(vector<bool> flags) {
string result;
bool started = false;
@ -164,19 +174,24 @@ namespace tree_sitter {
return input;
}
string lex_error_call(const unordered_set<CharMatch> &expected_inputs) {
string result = "LEX_ERROR(" + to_string(expected_inputs.size()) + ", EXPECT({";
string lex_error_call(const unordered_set<rules::Character> &expected_inputs) {
unordered_set<rules::CharacterMatch> expected_matches;
for (auto &rule : expected_inputs)
for (auto &match : rule.matches)
expected_matches.insert(match);
string result = "LEX_ERROR(" + to_string(expected_matches.size()) + ", EXPECT({";
bool started = false;
for (auto match : expected_inputs) {
for (auto match : expected_matches) {
if (started) result += ", ";
started = true;
result += "\"" + escape_string(CharMatchToString(match)) + "\"";
result += "\"" + escape_string(match.to_string()) + "\"";
}
result += "}));";
return result;
}
string code_for_lex_actions(const unordered_set<LexAction> &actions, const unordered_set<CharMatch> &expected_inputs) {
string code_for_lex_actions(const unordered_set<LexAction> &actions, const unordered_set<rules::Character> &expected_inputs) {
auto action = actions.begin();
if (action == actions.end()) {
return lex_error_call(expected_inputs);
@ -206,7 +221,7 @@ namespace tree_sitter {
string result = "";
auto expected_inputs = parse_state.expected_inputs();
for (auto pair : parse_state.actions)
result += _if(condition_for_char_match(pair.first), code_for_lex_actions(pair.second, expected_inputs));
result += _if(condition_for_character_rule(pair.first), code_for_lex_actions(pair.second, expected_inputs));
result += code_for_lex_actions(parse_state.default_actions, expected_inputs);
return result;
}

View file

@ -45,8 +45,8 @@ namespace tree_sitter {
}
// State
unordered_set<CharMatch> LexState::expected_inputs() const {
unordered_set<CharMatch> result;
unordered_set<rules::Character> LexState::expected_inputs() const {
unordered_set<rules::Character> result;
for (auto pair : actions)
result.insert(pair.first);
return result;
@ -58,7 +58,7 @@ namespace tree_sitter {
return states.size() - 1;
}
void LexTable::add_action(size_t state_index, CharMatch match, LexAction action) {
void LexTable::add_action(size_t state_index, rules::Character match, LexAction action) {
states[state_index].actions[match].insert(action);
}

View file

@ -5,8 +5,8 @@
#include <vector>
#include <string>
#include <unordered_set>
#include "char_match.h"
#include "symbol.h"
#include "character.h"
namespace tree_sitter {
typedef enum {
@ -45,15 +45,15 @@ namespace std {
namespace tree_sitter {
class LexState {
public:
std::unordered_map<CharMatch, std::unordered_set<LexAction>> actions;
std::unordered_map<rules::Character, std::unordered_set<LexAction>> actions;
std::unordered_set<LexAction> default_actions;
std::unordered_set<CharMatch> expected_inputs() const;
std::unordered_set<rules::Character> expected_inputs() const;
};
class LexTable {
public:
size_t add_state();
void add_action(size_t state_index, CharMatch match, LexAction action);
void add_action(size_t state_index, rules::Character rule, LexAction action);
void add_default_action(size_t state_index, LexAction action);
std::vector<LexState> states;

View file

@ -5,17 +5,64 @@ using std::hash;
namespace tree_sitter {
namespace rules {
Character::Character(char value) : value(CharMatchSpecific(value)) {};
Character::Character(CharClass value) : value(CharMatchClass(value)) {};
Character::Character(char min, char max) : value(CharMatchRange(min, max)) {};
// Match exactly one character.
CharacterMatch::CharacterMatch(char character) :
    type(CharacterMatchTypeSpecific)
{
    value.character = character;
}

// Match any character of the given class.
CharacterMatch::CharacterMatch(CharacterClass klass) :
    type(CharacterMatchTypeClass)
{
    value.character_class = klass;
}

// Match a character range; `bounds` holds (min, max).
CharacterMatch::CharacterMatch(std::pair<char, char> bounds) :
    type(CharacterMatchTypeRange)
{
    value.range.min_character = bounds.first;
    value.range.max_character = bounds.second;
}
Character::Character(char character) : matches({ CharacterMatch(character) }), sign(true) {}
Character::Character(CharacterClass char_class) : matches({ CharacterMatch(char_class) }), sign(true) {}
Character::Character(const std::vector<CharacterMatch> &matches, bool sign) : matches(matches), sign(sign) {}
// Two matches are equal when they have the same type and the union member
// selected by that type holds the same value.
bool CharacterMatch::operator==(const CharacterMatch &right) const {
    if (type != right.type)
        return false;
    switch (type) {
        case CharacterMatchTypeClass:
            return (value.character_class == right.value.character_class);
        case CharacterMatchTypeSpecific:
            return (value.character == right.value.character);
        case CharacterMatchTypeRange:
            return (value.range.min_character == right.value.range.min_character &&
                    value.range.max_character == right.value.range.max_character);
    }
    // Unrecognised type: return a definite answer rather than flowing off the
    // end of a non-void function.
    return false;
}
// Returns a human-readable description of this match:
//   class    -> "<digit>" / "<word>"
//   specific -> 'c', or "<EOF>" for the NUL character
//   range    -> 'a'-'z'
// Anything unrecognised yields "<unknown>".
string CharacterMatch::to_string() const {
    switch (type) {
        case CharacterMatchTypeClass:
            switch (value.character_class) {
                case CharClassDigit:
                    return "<digit>";
                case CharClassWord:
                    return "<word>";
            }
            // Unrecognised class: leave the switch instead of falling
            // through into the Specific case.
            break;
        case CharacterMatchTypeSpecific:
            return (value.character == '\0') ?
                "<EOF>" :
                string("'") + value.character + "'";
        case CharacterMatchTypeRange:
            return (string("'") +
                    value.range.min_character + "'-'" +
                    value.range.max_character + "'");
    }
    return "<unknown>";
}
// A Character rule equals another rule only if that rule is also a Character
// with the same sign and the same matches in the same order.
bool Character::operator==(const Rule &rule) const {
    const Character *other = dynamic_cast<const Character *>(&rule);
    if (!other) return false;
    // The sign must take part: a negated set and its positive counterpart
    // match different input even though their match lists are identical.
    return other->sign == sign && other->matches == matches;
}
// Hash derived from the rule's string form combined with a per-type value.
// The string form lists only the matches, so rules that differ only in sign
// share a hash code.
size_t Character::hash_code() const {
    return typeid(this).hash_code() ^ hash<string>()(to_string());
}
rule_ptr Character::copy() const {
@ -23,7 +70,10 @@ namespace tree_sitter {
}
// Renders the rule as "#<char m1 m2 ...>", where each m is the to_string()
// of one match, in order.
string Character::to_string() const {
    string result("#<char");
    for (auto &match : matches)
        result += " " + match.to_string();
    return result + ">";
}
void Character::accept(Visitor &visitor) const {

View file

@ -2,15 +2,46 @@
#define __tree_sitter__char__
#include "rule.h"
#include "char_match.h"
#include <vector>
#include <unordered_set>
namespace tree_sitter {
namespace rules {
// Named character classes a CharacterMatch can refer to.
enum CharacterClass {
    CharClassWord,
    CharClassDigit
};
// Tag recording which member of CharacterMatch::value is in use.
enum CharacterMatchType {
    CharacterMatchTypeSpecific,
    CharacterMatchTypeClass,
    CharacterMatchTypeRange,
};
// One character-matching condition, stored as a tagged union: `type` says
// which member of `value` is meaningful.
struct CharacterMatch {
// Discriminator for `value`.
CharacterMatchType type;
union {
// Used when type is CharacterMatchTypeClass.
CharacterClass character_class;
// Used when type is CharacterMatchTypeSpecific.
char character;
// Used when type is CharacterMatchTypeRange.
struct {
char min_character;
char max_character;
} range;
} value;
// Non-explicit constructors, one per match type: a specific character,
// a (min, max) pair for a range, and a character class.
CharacterMatch(char);
CharacterMatch(std::pair<char, char>);
CharacterMatch(CharacterClass);
// Equality comparison against another match.
bool operator==(const CharacterMatch &) const;
// String description of this match.
std::string to_string() const;
};
class Character : public Rule {
public:
Character(char character);
Character(CharClass character_class);
Character(CharacterClass character_class);
Character(char min_character, char max_character);
Character(const std::vector<CharacterMatch> &matches, bool sign);
bool operator==(const Rule& other) const;
size_t hash_code() const;
@ -18,9 +49,35 @@ namespace tree_sitter {
std::string to_string() const;
void accept(Visitor &visitor) const;
const CharMatch value;
std::vector<CharacterMatch> matches;
bool sign;
};
}
}
namespace std {
    // Hash for CharacterMatch. Each case reads only the union member selected
    // by `type` and then breaks, mirroring the fields that
    // CharacterMatch::operator== compares.
    template<>
    struct hash<tree_sitter::rules::CharacterMatch> {
        size_t operator()(const tree_sitter::rules::CharacterMatch &match) const {
            auto type = match.type;
            auto result = hash<short int>()(type);
            switch (type) {
                case tree_sitter::rules::CharacterMatchTypeClass:
                    result ^= hash<short int>()(match.value.character_class);
                    break;
                case tree_sitter::rules::CharacterMatchTypeRange:
                    result ^= hash<char>()(match.value.range.min_character);
                    // Shift the upper bound so that min == max does not cancel out.
                    result ^= hash<char>()(match.value.range.max_character) << 1;
                    break;
                case tree_sitter::rules::CharacterMatchTypeSpecific:
                    result ^= hash<char>()(match.value.character);
                    break;
            }
            return result;
        }
    };
}
namespace std {
// Lets Character be used as a key in unordered containers by inheriting the
// hash specialization for the Rule base class (defined outside this view).
template<>
struct hash<tree_sitter::rules::Character> : hash<tree_sitter::rules::Rule> {};
}
#endif

View file

@ -14,9 +14,13 @@ namespace tree_sitter {
return make_shared<Character>(value);
}
rule_ptr character(CharClass value) {
rule_ptr character(CharacterClass value) {
return make_shared<Character>(value);
}
// Creates a Character rule from a list of matches; `is_affirmative` is
// passed through as the rule's sign.
rule_ptr character(const std::vector<CharacterMatch> &matches, bool is_affirmative) {
    auto rule = make_shared<Character>(matches, is_affirmative);
    return rule;
}
rule_ptr choice(const initializer_list<rule_ptr> &rules) {
rule_ptr result;

View file

@ -16,8 +16,10 @@ namespace tree_sitter {
namespace rules {
rule_ptr blank();
rule_ptr character(char value);
rule_ptr character(char min, char max);
rule_ptr character(CharClass value);
rule_ptr character(CharacterClass value);
rule_ptr character(const std::vector<CharacterMatch> &matches);
rule_ptr character(const std::vector<CharacterMatch> &matches, bool);
rule_ptr choice(const std::initializer_list<rule_ptr> &rules);
rule_ptr pattern(const std::string &value);
rule_ptr repeat(const rule_ptr content);

View file

@ -1,4 +1,5 @@
#include "rules.h"
#include <unordered_map>
using std::string;
using std::hash;