From ecd317ccd9d2ed86504432de95b5a8e03eaebbd6 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 14 Nov 2013 21:25:58 -0800 Subject: [PATCH] Compute transitions for regex pattern rules --- TreeSitter.xcodeproj/project.pbxproj | 17 +++++- spec/lr/parse_table_spec.cpp | 7 ++- spec/main.cpp | 4 +- spec/rules/pattern_spec.cpp | 63 ++++++++++++++++++++++ spec/{ => rules}/rules_spec.cpp | 22 ++++++-- src/rules/Pattern.cpp | 80 +++++++++++++++++++++++++++- src/rules/Pattern.h | 1 + src/rules/char.cpp | 4 +- src/rules/choice.cpp | 2 +- src/rules/rule.cpp | 13 +++++ src/rules/rule.h | 12 +++-- src/rules/seq.cpp | 2 +- src/rules/symbol.cpp | 2 +- 13 files changed, 206 insertions(+), 23 deletions(-) create mode 100644 spec/rules/pattern_spec.cpp rename spec/{ => rules}/rules_spec.cpp (82%) diff --git a/TreeSitter.xcodeproj/project.pbxproj b/TreeSitter.xcodeproj/project.pbxproj index ae91c5a9..1890ded5 100644 --- a/TreeSitter.xcodeproj/project.pbxproj +++ b/TreeSitter.xcodeproj/project.pbxproj @@ -23,6 +23,7 @@ 125120A018307DEC00C9B56A /* parse_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1251209E18307DEC00C9B56A /* parse_table.cpp */; }; 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 125120A3183083BD00C9B56A /* arithmetic.cpp */; }; 12D1369D18328C5A005F3369 /* item_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D1369C18328C5A005F3369 /* item_spec.cpp */; }; + 12D136A1183570F5005F3369 /* pattern_spec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12D136A0183570F5005F3369 /* pattern_spec.cpp */; }; 12F9A64E182DD5FD00FAF50C /* spec_helper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */; }; 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12F9A64F182DD6BC00FAF50C /* grammar.cpp */; }; 27A343CA69E17E0F9EBEDF1C /* Pattern.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 27A340F3EEB184C040521323 /* Pattern.cpp */; }; @@ -131,7 +132,7 @@ 121492C5181E200B008E9BDA /* igloo_alt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = igloo_alt.h; sourceTree = ""; }; 121492C6181E200B008E9BDA /* igloo_framework.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = igloo_framework.h; sourceTree = ""; }; 121492E9181E200B008E9BDA /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = spec/main.cpp; sourceTree = SOURCE_ROOT; }; - 121492EA181E200B008E9BDA /* rules_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rules_spec.cpp; path = spec/rules_spec.cpp; sourceTree = SOURCE_ROOT; }; + 121492EA181E200B008E9BDA /* rules_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = rules_spec.cpp; path = spec/rules/rules_spec.cpp; sourceTree = SOURCE_ROOT; }; 12512092182F307C00C9B56A /* parse_table_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = parse_table_spec.cpp; path = spec/lr/parse_table_spec.cpp; sourceTree = SOURCE_ROOT; }; 1251209A1830145300C9B56A /* rule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rule.cpp; sourceTree = ""; }; 1251209E18307DEC00C9B56A /* parse_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table.cpp; sourceTree = ""; }; @@ -141,6 +142,7 @@ 12C344421822F27700B07BE3 /* transition_map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transition_map.h; sourceTree = ""; }; 12D1369C18328C5A005F3369 /* item_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = item_spec.cpp; path = spec/lr/item_spec.cpp; sourceTree = SOURCE_ROOT; }; 12D1369E18342088005F3369 /* todo.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = todo.md; sourceTree = ""; }; + 12D136A0183570F5005F3369 /* pattern_spec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = pattern_spec.cpp; path = spec/rules/pattern_spec.cpp; sourceTree = SOURCE_ROOT; }; 12E71794181D02A80051A649 /* specs */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = specs; sourceTree = BUILT_PRODUCTS_DIR; }; 12E71852181D081C0051A649 /* rules.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rules.h; sourceTree = ""; }; 12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = spec_helper.cpp; path = spec/spec_helper.cpp; sourceTree = SOURCE_ROOT; }; @@ -418,6 +420,16 @@ path = spec/test_grammars; sourceTree = ""; }; + 12D1369F18357066005F3369 /* rules */ = { + isa = PBXGroup; + children = ( + 121492EA181E200B008E9BDA /* rules_spec.cpp */, + 12D136A0183570F5005F3369 /* pattern_spec.cpp */, + ); + name = rules; + path = spec/rules; + sourceTree = ""; + }; 12E716F9181D010E0051A649 = { isa = PBXGroup; children = ( @@ -452,11 +464,11 @@ 12E71796181D02A80051A649 /* spec */ = { isa = PBXGroup; children = ( + 12D1369F18357066005F3369 /* rules */, 125120A118307FCA00C9B56A /* test_grammars */, 1214925C181E200B008E9BDA /* externals */, 1213061C182C854F00FCF928 /* lr */, 121492E9181E200B008E9BDA /* main.cpp */, - 121492EA181E200B008E9BDA /* rules_spec.cpp */, 12F9A64C182DD5FD00FAF50C /* spec_helper.cpp */, 12F9A64D182DD5FD00FAF50C /* spec_helper.h */, ); @@ -515,6 +527,7 @@ buildActionMask = 2147483647; files = ( 12130614182C3A1700FCF928 /* seq.cpp in Sources */, + 12D136A1183570F5005F3369 /* pattern_spec.cpp in Sources */, 125120A4183083BD00C9B56A /* arithmetic.cpp in Sources */, 1214930F181E200B008E9BDA /* rules_spec.cpp in Sources */, 1213061B182C84DF00FCF928 /* item.cpp in Sources */, diff --git a/spec/lr/parse_table_spec.cpp b/spec/lr/parse_table_spec.cpp index 4d022806..f7749826 100644 --- a/spec/lr/parse_table_spec.cpp +++ b/spec/lr/parse_table_spec.cpp @@ -1,7 +1,10 @@ #include "spec_helper.h" +#include "../test_grammars/arithmetic.h" +using namespace tree_sitter::lr; -Describe(parse_table_construction) { - Describe(the_starting_state) { +Describe(build_parse_tables) { + Describe(lexing_tables) { + }; }; diff --git a/spec/main.cpp b/spec/main.cpp index 5134b938..fa1b4f29 100644 --- a/spec/main.cpp +++ b/spec/main.cpp @@ -1,8 +1,6 @@ #include -using namespace igloo; - int main(int argc, char *argv[]) { - return TestRunner::RunAllTests(argc, argv); + return igloo::TestRunner::RunAllTests(argc, argv); } \ No newline at end of file diff --git a/spec/rules/pattern_spec.cpp b/spec/rules/pattern_spec.cpp new file mode 100644 index 00000000..2be10058 --- /dev/null +++ b/spec/rules/pattern_spec.cpp @@ -0,0 +1,63 @@ +#include "spec_helper.h" +#include "rules.h" +#include "transition_map.h" + +using namespace tree_sitter::rules; + +Describe(pattern_rules) { + It(parses_simple_strings) { + pattern_ptr rule = pattern("abc"); + AssertThat( + rule->to_rule_tree()->to_string(), + Equals(seq({ + character('a'), + character('b'), + character('c') + })->to_string())); + }; + + It(parses_choices) { + pattern_ptr rule = pattern("ab|cd|ef"); + AssertThat( + rule->to_rule_tree()->to_string(), + Equals(choice({ + seq({ + character('a'), + character('b'), + }), + seq({ + character('c'), + character('d') + }), + seq({ + character('e'), + character('f') + }) + })->to_string())); + }; + + It(parses_choices_in_sequences) { + pattern_ptr rule = pattern("(a|b)cd"); + AssertThat( + rule->to_rule_tree()->to_string(), + Equals(seq({ + choice({ + character('a'), + character('b'), + }), + character('c'), + character('d') + })->to_string())); + }; + + It(parses_special_characters_when_they_are_escaped) { + pattern_ptr rule = pattern("a\\(b"); + AssertThat( + rule->to_rule_tree()->to_string(), + Equals(seq({ + character('a'), + character('('), + character('b') + })->to_string())); + } +}; \ No newline at end of file diff --git a/spec/rules_spec.cpp b/spec/rules/rules_spec.cpp similarity index 82% rename from spec/rules_spec.cpp rename to spec/rules/rules_spec.cpp index ddf36c82..0aa2ddbc 100644 --- a/spec/rules_spec.cpp +++ b/spec/rules/rules_spec.cpp @@ -11,11 +11,11 @@ Describe(Rules) { It(constructs_binary_trees) { AssertThat( rules::seq({ symbol1, symbol2, symbol3 })->to_string(), - Equals(std::string("(seq (sym '1') (seq (sym '2') (sym '3')))"))); + Equals(std::string("(seq (seq (sym '1') (sym '2')) (sym '3'))"))); AssertThat( rules::choice({ symbol1, symbol2, symbol3 })->to_string(), - Equals(std::string("(choice (sym '1') (choice (sym '2') (sym '3')))"))); + Equals(std::string("(choice (choice (sym '1') (sym '2')) (sym '3'))"))); } }; @@ -65,8 +65,11 @@ Describe(Rules) { It(handles_long_sequences) { AssertThat( rules::seq({ - rules::seq({ symbol1, symbol2 }), - rules::seq({ symbol3, symbol4 }) })->transitions(), + symbol1, + symbol2, + symbol3, + symbol4 + })->transitions(), EqualsTransitionMap(TransitionMap( { symbol1 }, { rules::seq({ symbol2, symbol3, symbol4 }) } @@ -92,5 +95,14 @@ Describe(Rules) { { rules::seq({ rules::character('a'), rules::character('d') }) } ))); } - }; + + It(handles_patterns) { + AssertThat( + rules::pattern("a|b")->transitions(), + EqualsTransitionMap(TransitionMap( + { rules::character('a'), rules::character('b') }, + { rules::blank(), rules::blank() } + ))); + } +}; }; diff --git a/src/rules/Pattern.cpp b/src/rules/Pattern.cpp index 3bdd97f5..9e26db1a 100644 --- a/src/rules/Pattern.cpp +++ b/src/rules/Pattern.cpp @@ -1,8 +1,80 @@ +#include "choice.h" +#include "seq.h" #include "Pattern.h" #include "transition_map.h" +using namespace std; + namespace tree_sitter { namespace rules { + class PatternParser { + public: + PatternParser(const string &input) : + input(input), + position(0), + length(input.length()) {} + + rule_ptr rule() { + auto result = term(); + while (has_more_input() && peek() == '|') { + next(); + result = choice({ result, term() }); + } + return result; + } + + private: + rule_ptr term() { + rule_ptr result = factor(); + while (has_more_input() && (peek() != '|') && (peek() != ')')) + result = seq({ result, factor() }); + return result; + } + + rule_ptr factor() { + return atom(); + } + + rule_ptr atom() { + rule_ptr result; + switch (peek()) { + case '(': + next(); + result = rule(); + if (peek() == ')') { + next(); + return result; + } else { + throw std::string("Invalid regex pattern: ") + input; + } + break; + case '\\': + next(); + default: + result = character(peek()); + next(); + return result; + break; + } + } + + void next() { + position++; + } + + char peek() { + return input[position]; + } + + bool has_more_input() { + return position < length; + } + + const std::string input; + const size_t length; + int position; + }; + Pattern::Pattern(const std::string &string) : value(string) {}; pattern_ptr pattern(const std::string &value) { @@ -10,9 +82,13 @@ namespace tree_sitter { } TransitionMap Pattern::transitions() const { - return tree_sitter::TransitionMap(); + return to_rule_tree()->transitions(); } - + + rule_ptr Pattern::to_rule_tree() const { + return PatternParser(value).rule(); + } + bool Pattern::operator ==(tree_sitter::rules::Rule const &other) const { return false; } diff --git a/src/rules/Pattern.h b/src/rules/Pattern.h index d92254ca..7202d347 100644 --- a/src/rules/Pattern.h +++ b/src/rules/Pattern.h @@ -11,6 +11,7 @@ namespace tree_sitter { TransitionMap transitions() const; bool operator==(const Rule& other) const; std::string to_string() const; + rule_ptr to_rule_tree() const; private: const std::string value; }; diff --git a/src/rules/char.cpp b/src/rules/char.cpp index c8d86f3c..cbe06a1f 100644 --- a/src/rules/char.cpp +++ b/src/rules/char.cpp @@ -18,11 +18,11 @@ namespace tree_sitter { bool Char::operator==(const Rule &rule) const { const Char *other = dynamic_cast(&rule); - return (other != nullptr) && (other->value == value); + return other && (other->value == value); } string Char::to_string() const { - return std::string("'") + &value + "'"; + return std::string("'") + value + "'"; } } } diff --git a/src/rules/choice.cpp b/src/rules/choice.cpp index 91fda915..7cf57b07 100644 --- a/src/rules/choice.cpp +++ b/src/rules/choice.cpp @@ -19,7 +19,7 @@ namespace tree_sitter { bool Choice::operator==(const Rule &rule) const { const Choice *other = dynamic_cast(&rule); - return (other != NULL) && (*other->left == *left) && (*other->right == *right); + return other && (*other->left == *left) && (*other->right == *right); } std::string Choice::to_string() const { diff --git a/src/rules/rule.cpp b/src/rules/rule.cpp index 97451c4e..5599bef5 100644 --- a/src/rules/rule.cpp +++ b/src/rules/rule.cpp @@ -2,10 +2,23 @@ namespace tree_sitter { namespace rules { + bool Rule::operator==(const rule_ptr other) const { + return true; + } + std::ostream& operator<<(std::ostream& stream, const Rule &rule) { stream << rule.to_string(); return stream; } + + std::ostream& operator<<(std::ostream& stream, const rule_ptr &rule) + { + if (rule.get() == nullptr) + stream << std::string(""); + else + stream << rule->to_string(); + return stream; + } } } diff --git a/src/rules/rule.h b/src/rules/rule.h index 2628e6d1..6a5cb1b9 100644 --- a/src/rules/rule.h +++ b/src/rules/rule.h @@ -7,22 +7,26 @@ namespace tree_sitter { template class TransitionMap; namespace rules { + class Rule; + typedef std::shared_ptr rule_ptr; + class Rule { public: virtual TransitionMap transitions() const = 0; virtual bool operator==(const Rule& other) const = 0; virtual std::string to_string() const = 0; + bool operator==(const rule_ptr other) const; }; - typedef std::shared_ptr rule_ptr; std::ostream& operator<<(std::ostream& stream, const Rule &rule); + std::ostream& operator<<(std::ostream& stream, const rule_ptr &rule); template rule_ptr build_binary_rule_tree(const std::initializer_list &rules) { - rule_ptr result(nullptr); - for (auto it = rules.end() - 1; it >= rules.begin(); --it) - result = result.get() ? std::make_shared(*it, result) : *it; + rule_ptr result; + for (auto rule : rules) + result = result.get() ? std::make_shared(result, rule) : rule; return result; } } diff --git a/src/rules/seq.cpp b/src/rules/seq.cpp index 70a8fa4a..173d1fc8 100644 --- a/src/rules/seq.cpp +++ b/src/rules/seq.cpp @@ -21,7 +21,7 @@ namespace tree_sitter { bool Seq::operator==(const Rule &rule) const { const Seq *other = dynamic_cast(&rule); - return (other != NULL) && (*other->left == *left) && (*other->right == *right); + return other && (*other->left == *left) && (*other->right == *right); } std::string Seq::to_string() const { diff --git a/src/rules/symbol.cpp b/src/rules/symbol.cpp index f9e59046..8b3f8c6c 100644 --- a/src/rules/symbol.cpp +++ b/src/rules/symbol.cpp @@ -16,7 +16,7 @@ namespace tree_sitter { bool Symbol::operator==(const Rule &rule) const { const Symbol *other = dynamic_cast(&rule); - return (other != NULL) && (other->name == name); + return other && (other->name == name); } std::string Symbol::to_string() const {