Simplify logic for extracting tokens from grammar
This commit is contained in:
parent
946088bccc
commit
713b3899c5
6 changed files with 33 additions and 111 deletions
|
|
@ -35,13 +35,7 @@ namespace test_grammars {
|
|||
_sym("left_bracket"),
|
||||
comma_sep(sym("value")),
|
||||
_sym("right_bracket"), }) },
|
||||
{ "string", seq({
|
||||
str("\""),
|
||||
repeat(choice({
|
||||
pattern("[^\"]"),
|
||||
str("\\\""),
|
||||
})),
|
||||
str("\"") }) },
|
||||
{ "string", pattern("\"([^\"]|\\\\\")+\"") },
|
||||
{ "number", pattern("\\d+") },
|
||||
{ "comma", str(",") },
|
||||
{ "colon", str(":") },
|
||||
|
|
|
|||
|
|
@ -10,17 +10,14 @@ using prepare_grammar::perform;
|
|||
|
||||
describe("preparing a grammar", []() {
|
||||
describe("extracting tokens", []() {
|
||||
it("moves sub-rules that don't contain symbols into a separate 'lexical' grammar", [&]() {
|
||||
it("moves strings and patterns into a separate 'lexical' grammar", [&]() {
|
||||
pair<PreparedGrammar, PreparedGrammar> result = perform(Grammar("rule1", {
|
||||
{ "rule1", seq({
|
||||
character({ 'a' }),
|
||||
character({ 'b' }),
|
||||
str("ab"),
|
||||
seq({
|
||||
sym("rule2"),
|
||||
sym("rule3") }),
|
||||
seq({
|
||||
character({ 'a' }),
|
||||
character({ 'b' }) }) }) }
|
||||
str("ab") }) }
|
||||
}));
|
||||
|
||||
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
|
||||
|
|
@ -33,18 +30,14 @@ describe("preparing a grammar", []() {
|
|||
}, {})));
|
||||
|
||||
AssertThat(result.second, Equals(PreparedGrammar("", {}, {
|
||||
{ "token1", rules::seq({
|
||||
rules::character({ 'a' }),
|
||||
rules::character({ 'b' }) }) },
|
||||
{ "token1", str("ab") },
|
||||
})));
|
||||
});
|
||||
|
||||
it("moves entire rules into the lexical grammar when possible, preserving their names", [&]() {
|
||||
auto result = perform(Grammar("rule1", {
|
||||
{ "rule1", sym("rule2") },
|
||||
{ "rule2", seq({
|
||||
character({ 'a' }),
|
||||
character({ 'b' }) }) }
|
||||
{ "rule2", pattern("a|b") }
|
||||
}));
|
||||
|
||||
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
|
||||
|
|
@ -52,9 +45,7 @@ describe("preparing a grammar", []() {
|
|||
}, {})));
|
||||
|
||||
AssertThat(result.second, Equals(PreparedGrammar("", {
|
||||
{ "rule2", seq({
|
||||
character({ 'a' }),
|
||||
character({ 'b' }) }) },
|
||||
{ "rule2", pattern("a|b") },
|
||||
}, {})));
|
||||
});
|
||||
|
||||
|
|
@ -97,28 +88,6 @@ describe("preparing a grammar", []() {
|
|||
}) }
|
||||
})));
|
||||
});
|
||||
|
||||
it("does not replace repeat rules that can be moved into the lexical grammar", [&]() {
|
||||
pair<PreparedGrammar, PreparedGrammar> result = perform(Grammar("rule1", {
|
||||
{ "rule1", seq({
|
||||
sym("x"),
|
||||
repeat(seq({ str("a"), str("b") })),
|
||||
sym("y")
|
||||
}) },
|
||||
}));
|
||||
|
||||
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
|
||||
{ "rule1", seq({
|
||||
sym("x"),
|
||||
make_shared<Symbol>("token1", SymbolTypeAuxiliary),
|
||||
sym("y")
|
||||
}) },
|
||||
}, {})));
|
||||
|
||||
AssertThat(result.second, Equals(PreparedGrammar("", {}, {
|
||||
{ "token1", repeat(seq({ str("a"), str("b") })) },
|
||||
})));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
#include "extract_tokens.h"
|
||||
#include "search_for_symbols.h"
|
||||
#include "tree_sitter/compiler.h"
|
||||
#include "prepared_grammar.h"
|
||||
#include "rules/visitor.h"
|
||||
|
|
@ -17,15 +16,38 @@ namespace tree_sitter {
|
|||
using std::map;
|
||||
using std::make_shared;
|
||||
using namespace rules;
|
||||
|
||||
|
||||
namespace prepare_grammar {
|
||||
class TokenChecker : public Visitor {
|
||||
public:
|
||||
bool value;
|
||||
|
||||
void default_visit(const Rule *rule) {
|
||||
value = false;
|
||||
}
|
||||
|
||||
void visit(const String *rule) {
|
||||
value = true;
|
||||
}
|
||||
|
||||
void visit(const Pattern *rule) {
|
||||
value = true;
|
||||
}
|
||||
};
|
||||
|
||||
bool is_token(const rule_ptr &rule) {
|
||||
TokenChecker checker;
|
||||
rule->accept(checker);
|
||||
return checker.value;
|
||||
}
|
||||
|
||||
class TokenExtractor : Visitor {
|
||||
public:
|
||||
rule_ptr value;
|
||||
map<const string, const rule_ptr> tokens;
|
||||
|
||||
rule_ptr initial_apply(const rule_ptr rule) {
|
||||
if (!search_for_symbols(rule)) {
|
||||
if (is_token(rule)) {
|
||||
return rule_ptr();
|
||||
} else {
|
||||
return apply(rule);
|
||||
|
|
@ -33,7 +55,7 @@ namespace tree_sitter {
|
|||
}
|
||||
|
||||
rule_ptr apply(const rule_ptr rule) {
|
||||
if (search_for_symbols(rule) || rule->operator==(Blank())) {
|
||||
if (!is_token(rule) || rule->operator==(Blank())) {
|
||||
rule->accept(*this);
|
||||
return value;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -1,45 +0,0 @@
|
|||
#include "search_for_symbols.h"
|
||||
#include "rules/visitor.h"
|
||||
#include "rules/choice.h"
|
||||
#include "rules/seq.h"
|
||||
#include "rules/repeat.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
using namespace rules;
|
||||
|
||||
namespace prepare_grammar {
|
||||
class SymbolSearcher : rules::Visitor {
|
||||
public:
|
||||
bool value;
|
||||
|
||||
bool apply(const rule_ptr rule) {
|
||||
rule->accept(*this);
|
||||
return value;
|
||||
}
|
||||
|
||||
void default_visit(const Rule *rule) {
|
||||
value = false;
|
||||
}
|
||||
|
||||
void visit(const Symbol *symbol) {
|
||||
value = true;
|
||||
}
|
||||
|
||||
void visit(const Choice *choice) {
|
||||
value = apply(choice->left) || apply(choice->right);
|
||||
}
|
||||
|
||||
void visit(const Seq *seq) {
|
||||
value = apply(seq->left) || apply(seq->right);
|
||||
}
|
||||
|
||||
void visit(const Repeat *rule) {
|
||||
value = apply(rule->content);
|
||||
}
|
||||
};
|
||||
|
||||
bool search_for_symbols(const rule_ptr &rule) {
|
||||
return SymbolSearcher().apply(rule);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
#ifndef __tree_sitter__search_for_symbols__
|
||||
#define __tree_sitter__search_for_symbols__
|
||||
|
||||
#include "tree_sitter/compiler.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace prepare_grammar {
|
||||
bool search_for_symbols(const rules::rule_ptr &);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -33,7 +33,6 @@
|
|||
12EDCF8D187C6282005A7A07 /* document.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8C187C6282005A7A07 /* document.cpp */; };
|
||||
12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8F1881FCCA005A7A07 /* extract_tokens.cpp */; };
|
||||
12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF911881FCCA005A7A07 /* perform.cpp */; };
|
||||
12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */; };
|
||||
12EDCFAF18820387005A7A07 /* parse_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF9D18820116005A7A07 /* parse_table.cpp */; };
|
||||
12EDCFB018820392005A7A07 /* item.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA218820137005A7A07 /* item.cpp */; };
|
||||
12EDCFB21882039A005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA418820137005A7A07 /* perform.cpp */; };
|
||||
|
|
@ -137,8 +136,6 @@
|
|||
12EDCF901881FCCA005A7A07 /* extract_tokens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = extract_tokens.h; path = src/compiler/prepare_grammar/extract_tokens.h; sourceTree = SOURCE_ROOT; };
|
||||
12EDCF911881FCCA005A7A07 /* perform.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = perform.cpp; path = src/compiler/prepare_grammar/perform.cpp; sourceTree = SOURCE_ROOT; };
|
||||
12EDCF921881FCCA005A7A07 /* perform.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = perform.h; path = src/compiler/prepare_grammar/perform.h; sourceTree = SOURCE_ROOT; };
|
||||
12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = search_for_symbols.cpp; path = src/compiler/prepare_grammar/search_for_symbols.cpp; sourceTree = SOURCE_ROOT; };
|
||||
12EDCF941881FCCA005A7A07 /* search_for_symbols.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = search_for_symbols.h; path = src/compiler/prepare_grammar/search_for_symbols.h; sourceTree = SOURCE_ROOT; };
|
||||
12EDCF9C18820116005A7A07 /* lex_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lex_table.h; sourceTree = "<group>"; };
|
||||
12EDCF9D18820116005A7A07 /* parse_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table.cpp; sourceTree = "<group>"; };
|
||||
12EDCF9E18820116005A7A07 /* parse_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_table.h; sourceTree = "<group>"; };
|
||||
|
|
@ -352,8 +349,6 @@
|
|||
12EDCF901881FCCA005A7A07 /* extract_tokens.h */,
|
||||
12EDCF911881FCCA005A7A07 /* perform.cpp */,
|
||||
12EDCF921881FCCA005A7A07 /* perform.h */,
|
||||
12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */,
|
||||
12EDCF941881FCCA005A7A07 /* search_for_symbols.h */,
|
||||
);
|
||||
name = prepare_grammar;
|
||||
path = grammar;
|
||||
|
|
@ -540,7 +535,6 @@
|
|||
12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */,
|
||||
12D136A4183678A2005F3369 /* repeat.cpp in Sources */,
|
||||
1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */,
|
||||
12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */,
|
||||
12EDCFB21882039A005A7A07 /* perform.cpp in Sources */,
|
||||
1236A7D218B554C800593ABB /* prepared_grammar.cpp in Sources */,
|
||||
12FD40E718639B910041A84E /* visitor.cpp in Sources */,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue