diff --git a/examples/grammars/json.hpp b/examples/grammars/json.hpp index 9bd1364d..cb194e7d 100644 --- a/examples/grammars/json.hpp +++ b/examples/grammars/json.hpp @@ -35,13 +35,7 @@ namespace test_grammars { _sym("left_bracket"), comma_sep(sym("value")), _sym("right_bracket"), }) }, - { "string", seq({ - str("\""), - repeat(choice({ - pattern("[^\"]"), - str("\\\""), - })), - str("\"") }) }, + { "string", pattern("\"([^\"]|\\\\\")+\"") }, { "number", pattern("\\d+") }, { "comma", str(",") }, { "colon", str(":") }, diff --git a/spec/compiler/prepare_grammar_spec.cpp b/spec/compiler/prepare_grammar_spec.cpp index b9e8f36a..3af1fca7 100644 --- a/spec/compiler/prepare_grammar_spec.cpp +++ b/spec/compiler/prepare_grammar_spec.cpp @@ -10,17 +10,14 @@ using prepare_grammar::perform; describe("preparing a grammar", []() { describe("extracting tokens", []() { - it("moves sub-rules that don't contain symbols into a separate 'lexical' grammar", [&]() { + it("moves strings and patterns into a separate 'lexical' grammar", [&]() { pair result = perform(Grammar("rule1", { { "rule1", seq({ - character({ 'a' }), - character({ 'b' }), + str("ab"), seq({ sym("rule2"), sym("rule3") }), - seq({ - character({ 'a' }), - character({ 'b' }) }) }) } + str("ab") }) } })); AssertThat(result.first, Equals(PreparedGrammar("rule1", { @@ -33,18 +30,14 @@ describe("preparing a grammar", []() { }, {}))); AssertThat(result.second, Equals(PreparedGrammar("", {}, { - { "token1", rules::seq({ - rules::character({ 'a' }), - rules::character({ 'b' }) }) }, + { "token1", str("ab") }, }))); }); it("moves entire rules into the lexical grammar when possible, preserving their names", [&]() { auto result = perform(Grammar("rule1", { { "rule1", sym("rule2") }, - { "rule2", seq({ - character({ 'a' }), - character({ 'b' }) }) } + { "rule2", pattern("a|b") } })); AssertThat(result.first, Equals(PreparedGrammar("rule1", { @@ -52,9 +45,7 @@ describe("preparing a grammar", []() { }, {}))); AssertThat(result.second, Equals(PreparedGrammar("", { - { "rule2", seq({ - character({ 'a' }), - character({ 'b' }) }) }, + { "rule2", pattern("a|b") }, }, {}))); }); @@ -97,28 +88,6 @@ describe("preparing a grammar", []() { }) } }))); }); - - it("does not replace repeat rules that can be moved into the lexical grammar", [&]() { - pair result = perform(Grammar("rule1", { - { "rule1", seq({ - sym("x"), - repeat(seq({ str("a"), str("b") })), - sym("y") - }) }, - })); - - AssertThat(result.first, Equals(PreparedGrammar("rule1", { - { "rule1", seq({ - sym("x"), - make_shared("token1", SymbolTypeAuxiliary), - sym("y") - }) }, - }, {}))); - - AssertThat(result.second, Equals(PreparedGrammar("", {}, { - { "token1", repeat(seq({ str("a"), str("b") })) }, - }))); - }); }); }); diff --git a/src/compiler/prepare_grammar/extract_tokens.cpp b/src/compiler/prepare_grammar/extract_tokens.cpp index 74364b93..bc78f9b3 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cpp +++ b/src/compiler/prepare_grammar/extract_tokens.cpp @@ -1,5 +1,4 @@ #include "extract_tokens.h" -#include "search_for_symbols.h" #include "tree_sitter/compiler.h" #include "prepared_grammar.h" #include "rules/visitor.h" @@ -17,15 +16,38 @@ namespace tree_sitter { using std::map; using std::make_shared; using namespace rules; - + namespace prepare_grammar { + class TokenChecker : public Visitor { + public: + bool value; + + void default_visit(const Rule *rule) { + value = false; + } + + void visit(const String *rule) { + value = true; + } + + void visit(const Pattern *rule) { + value = true; + } + }; + + bool is_token(const rule_ptr &rule) { + TokenChecker checker; + rule->accept(checker); + return checker.value; + } + class TokenExtractor : Visitor { public: rule_ptr value; map tokens; rule_ptr initial_apply(const rule_ptr rule) { - if (!search_for_symbols(rule)) { + if (is_token(rule)) { return rule_ptr(); } else { return apply(rule); @@ -33,7 +55,7 @@ namespace tree_sitter { } rule_ptr apply(const rule_ptr rule) { - if (search_for_symbols(rule) || rule->operator==(Blank())) { + if (!is_token(rule) || rule->operator==(Blank())) { rule->accept(*this); return value; } else { diff --git a/src/compiler/prepare_grammar/search_for_symbols.cpp b/src/compiler/prepare_grammar/search_for_symbols.cpp deleted file mode 100644 index 66869afa..00000000 --- a/src/compiler/prepare_grammar/search_for_symbols.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "search_for_symbols.h" -#include "rules/visitor.h" -#include "rules/choice.h" -#include "rules/seq.h" -#include "rules/repeat.h" - -namespace tree_sitter { - using namespace rules; - - namespace prepare_grammar { - class SymbolSearcher : rules::Visitor { - public: - bool value; - - bool apply(const rule_ptr rule) { - rule->accept(*this); - return value; - } - - void default_visit(const Rule *rule) { - value = false; - } - - void visit(const Symbol *symbol) { - value = true; - } - - void visit(const Choice *choice) { - value = apply(choice->left) || apply(choice->right); - } - - void visit(const Seq *seq) { - value = apply(seq->left) || apply(seq->right); - } - - void visit(const Repeat *rule) { - value = apply(rule->content); - } - }; - - bool search_for_symbols(const rule_ptr &rule) { - return SymbolSearcher().apply(rule); - } - } -} \ No newline at end of file diff --git a/src/compiler/prepare_grammar/search_for_symbols.h b/src/compiler/prepare_grammar/search_for_symbols.h deleted file mode 100644 index 6023e556..00000000 --- a/src/compiler/prepare_grammar/search_for_symbols.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __tree_sitter__search_for_symbols__ -#define __tree_sitter__search_for_symbols__ - -#include "tree_sitter/compiler.h" - -namespace tree_sitter { - namespace prepare_grammar { - bool search_for_symbols(const rules::rule_ptr &); - } -} - -#endif diff --git a/tree_sitter.xcodeproj/project.pbxproj b/tree_sitter.xcodeproj/project.pbxproj index a1ba2c9f..c6f5d1a2 100644 --- a/tree_sitter.xcodeproj/project.pbxproj +++ b/tree_sitter.xcodeproj/project.pbxproj @@ -33,7 +33,6 @@ 12EDCF8D187C6282005A7A07 /* document.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8C187C6282005A7A07 /* document.cpp */; }; 12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8F1881FCCA005A7A07 /* extract_tokens.cpp */; }; 12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF911881FCCA005A7A07 /* perform.cpp */; }; - 12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */; }; 12EDCFAF18820387005A7A07 /* parse_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF9D18820116005A7A07 /* parse_table.cpp */; }; 12EDCFB018820392005A7A07 /* item.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA218820137005A7A07 /* item.cpp */; }; 12EDCFB21882039A005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA418820137005A7A07 /* perform.cpp */; }; @@ -137,8 +136,6 @@ 12EDCF901881FCCA005A7A07 /* extract_tokens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = extract_tokens.h; path = src/compiler/prepare_grammar/extract_tokens.h; sourceTree = SOURCE_ROOT; }; 12EDCF911881FCCA005A7A07 /* perform.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = perform.cpp; path = src/compiler/prepare_grammar/perform.cpp; sourceTree = SOURCE_ROOT; }; 12EDCF921881FCCA005A7A07 /* perform.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = perform.h; path = src/compiler/prepare_grammar/perform.h; sourceTree = SOURCE_ROOT; }; - 12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = search_for_symbols.cpp; path = src/compiler/prepare_grammar/search_for_symbols.cpp; sourceTree = SOURCE_ROOT; }; - 12EDCF941881FCCA005A7A07 /* search_for_symbols.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = search_for_symbols.h; path = src/compiler/prepare_grammar/search_for_symbols.h; sourceTree = SOURCE_ROOT; }; 12EDCF9C18820116005A7A07 /* lex_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lex_table.h; sourceTree = ""; }; 12EDCF9D18820116005A7A07 /* parse_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table.cpp; sourceTree = ""; }; 12EDCF9E18820116005A7A07 /* parse_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_table.h; sourceTree = ""; }; @@ -352,8 +349,6 @@ 12EDCF901881FCCA005A7A07 /* extract_tokens.h */, 12EDCF911881FCCA005A7A07 /* perform.cpp */, 12EDCF921881FCCA005A7A07 /* perform.h */, - 12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */, - 12EDCF941881FCCA005A7A07 /* search_for_symbols.h */, ); name = prepare_grammar; path = grammar; @@ -540,7 +535,6 @@ 12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */, 12D136A4183678A2005F3369 /* repeat.cpp in Sources */, 1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */, - 12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */, 12EDCFB21882039A005A7A07 /* perform.cpp in Sources */, 1236A7D218B554C800593ABB /* prepared_grammar.cpp in Sources */, 12FD40E718639B910041A84E /* visitor.cpp in Sources */,