Simplify logic for extracting tokens from grammar

This commit is contained in:
Max Brunsfeld 2014-02-23 10:00:49 -08:00
parent 946088bccc
commit 713b3899c5
6 changed files with 33 additions and 111 deletions

View file

@ -35,13 +35,7 @@ namespace test_grammars {
_sym("left_bracket"),
comma_sep(sym("value")),
_sym("right_bracket"), }) },
{ "string", seq({
str("\""),
repeat(choice({
pattern("[^\"]"),
str("\\\""),
})),
str("\"") }) },
{ "string", pattern("\"([^\"]|\\\\\")+\"") },
{ "number", pattern("\\d+") },
{ "comma", str(",") },
{ "colon", str(":") },

View file

@ -10,17 +10,14 @@ using prepare_grammar::perform;
describe("preparing a grammar", []() {
describe("extracting tokens", []() {
it("moves sub-rules that don't contain symbols into a separate 'lexical' grammar", [&]() {
it("moves strings and patterns into a separate 'lexical' grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = perform(Grammar("rule1", {
{ "rule1", seq({
character({ 'a' }),
character({ 'b' }),
str("ab"),
seq({
sym("rule2"),
sym("rule3") }),
seq({
character({ 'a' }),
character({ 'b' }) }) }) }
str("ab") }) }
}));
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
@ -33,18 +30,14 @@ describe("preparing a grammar", []() {
}, {})));
AssertThat(result.second, Equals(PreparedGrammar("", {}, {
{ "token1", rules::seq({
rules::character({ 'a' }),
rules::character({ 'b' }) }) },
{ "token1", str("ab") },
})));
});
it("moves entire rules into the lexical grammar when possible, preserving their names", [&]() {
auto result = perform(Grammar("rule1", {
{ "rule1", sym("rule2") },
{ "rule2", seq({
character({ 'a' }),
character({ 'b' }) }) }
{ "rule2", pattern("a|b") }
}));
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
@ -52,9 +45,7 @@ describe("preparing a grammar", []() {
}, {})));
AssertThat(result.second, Equals(PreparedGrammar("", {
{ "rule2", seq({
character({ 'a' }),
character({ 'b' }) }) },
{ "rule2", pattern("a|b") },
}, {})));
});
@ -97,28 +88,6 @@ describe("preparing a grammar", []() {
}) }
})));
});
it("does not replace repeat rules that can be moved into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = perform(Grammar("rule1", {
{ "rule1", seq({
sym("x"),
repeat(seq({ str("a"), str("b") })),
sym("y")
}) },
}));
AssertThat(result.first, Equals(PreparedGrammar("rule1", {
{ "rule1", seq({
sym("x"),
make_shared<Symbol>("token1", SymbolTypeAuxiliary),
sym("y")
}) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar("", {}, {
{ "token1", repeat(seq({ str("a"), str("b") })) },
})));
});
});
});

View file

@ -1,5 +1,4 @@
#include "extract_tokens.h"
#include "search_for_symbols.h"
#include "tree_sitter/compiler.h"
#include "prepared_grammar.h"
#include "rules/visitor.h"
@ -17,15 +16,38 @@ namespace tree_sitter {
using std::map;
using std::make_shared;
using namespace rules;
namespace prepare_grammar {
/// Visitor that classifies a single rule node as a token or a non-token.
///
/// After a rule accepts this visitor, `value` is true when the visited rule
/// is a String or a Pattern, and false for any rule that is dispatched to
/// default_visit. Only the visited node is inspected; this class does not
/// descend into child rules.
class TokenChecker : public Visitor {
public:
    /// Result of the most recent visit. Initialised so that reading it is
    /// well-defined even if accept() never reaches one of the handlers below.
    bool value = false;

    /// Fallback for rule types without a dedicated overload here: not a token.
    void default_visit(const Rule *rule) {
        value = false;
    }

    /// String rules count as tokens.
    void visit(const String *rule) {
        value = true;
    }

    /// Pattern rules count as tokens.
    void visit(const Pattern *rule) {
        value = true;
    }
};
bool is_token(const rule_ptr &rule) {
TokenChecker checker;
rule->accept(checker);
return checker.value;
}
class TokenExtractor : Visitor {
public:
rule_ptr value;
map<const string, const rule_ptr> tokens;
rule_ptr initial_apply(const rule_ptr rule) {
if (!search_for_symbols(rule)) {
if (is_token(rule)) {
return rule_ptr();
} else {
return apply(rule);
@ -33,7 +55,7 @@ namespace tree_sitter {
}
rule_ptr apply(const rule_ptr rule) {
if (search_for_symbols(rule) || rule->operator==(Blank())) {
if (!is_token(rule) || rule->operator==(Blank())) {
rule->accept(*this);
return value;
} else {

View file

@ -1,45 +0,0 @@
#include "search_for_symbols.h"
#include "rules/visitor.h"
#include "rules/choice.h"
#include "rules/seq.h"
#include "rules/repeat.h"
namespace tree_sitter {
using namespace rules;
namespace prepare_grammar {
/// Visitor that answers "does this rule contain a Symbol?".
///
/// A Symbol yields true. Choice and Seq yield true when either side does,
/// Repeat yields whatever its content yields, and every other rule type
/// falls through to default_visit and yields false.
class SymbolSearcher : rules::Visitor {
public:
    /// Result of the most recent visit. Initialised so that apply() never
    /// returns an indeterminate value if accept() dispatches to no handler.
    bool value = false;

    /// Visits `rule` and returns the outcome that the visit stored in `value`.
    /// Taken by const reference, matching search_for_symbols().
    bool apply(const rule_ptr &rule) {
        rule->accept(*this);
        return value;
    }

    /// Any rule type not handled below contains no symbol.
    void default_visit(const Rule *rule) {
        value = false;
    }

    void visit(const Symbol *symbol) {
        value = true;
    }

    /// The right side is only searched when the left side has no symbol.
    void visit(const Choice *choice) {
        value = apply(choice->left) || apply(choice->right);
    }

    /// The right side is only searched when the left side has no symbol.
    void visit(const Seq *seq) {
        value = apply(seq->left) || apply(seq->right);
    }

    void visit(const Repeat *rule) {
        value = apply(rule->content);
    }
};
bool search_for_symbols(const rule_ptr &rule) {
return SymbolSearcher().apply(rule);
}
}
}

View file

@ -1,12 +0,0 @@
// Include guard renamed: identifiers containing a double underscore are
// reserved for the implementation, so the previous guard name was not a
// legal user-defined macro name.
#ifndef TREE_SITTER_COMPILER_PREPARE_GRAMMAR_SEARCH_FOR_SYMBOLS_H_
#define TREE_SITTER_COMPILER_PREPARE_GRAMMAR_SEARCH_FOR_SYMBOLS_H_

#include "tree_sitter/compiler.h"

namespace tree_sitter {
    namespace prepare_grammar {
        /// Searches the given rule for symbols and reports whether one was found.
        bool search_for_symbols(const rules::rule_ptr &);
    }
}

#endif  // TREE_SITTER_COMPILER_PREPARE_GRAMMAR_SEARCH_FOR_SYMBOLS_H_

View file

@ -33,7 +33,6 @@
12EDCF8D187C6282005A7A07 /* document.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8C187C6282005A7A07 /* document.cpp */; };
12EDCF981881FCD5005A7A07 /* extract_tokens.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF8F1881FCCA005A7A07 /* extract_tokens.cpp */; };
12EDCF991881FCD9005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF911881FCCA005A7A07 /* perform.cpp */; };
12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */; };
12EDCFAF18820387005A7A07 /* parse_table.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCF9D18820116005A7A07 /* parse_table.cpp */; };
12EDCFB018820392005A7A07 /* item.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA218820137005A7A07 /* item.cpp */; };
12EDCFB21882039A005A7A07 /* perform.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 12EDCFA418820137005A7A07 /* perform.cpp */; };
@ -137,8 +136,6 @@
12EDCF901881FCCA005A7A07 /* extract_tokens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = extract_tokens.h; path = src/compiler/prepare_grammar/extract_tokens.h; sourceTree = SOURCE_ROOT; };
12EDCF911881FCCA005A7A07 /* perform.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = perform.cpp; path = src/compiler/prepare_grammar/perform.cpp; sourceTree = SOURCE_ROOT; };
12EDCF921881FCCA005A7A07 /* perform.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = perform.h; path = src/compiler/prepare_grammar/perform.h; sourceTree = SOURCE_ROOT; };
12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = search_for_symbols.cpp; path = src/compiler/prepare_grammar/search_for_symbols.cpp; sourceTree = SOURCE_ROOT; };
12EDCF941881FCCA005A7A07 /* search_for_symbols.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = search_for_symbols.h; path = src/compiler/prepare_grammar/search_for_symbols.h; sourceTree = SOURCE_ROOT; };
12EDCF9C18820116005A7A07 /* lex_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lex_table.h; sourceTree = "<group>"; };
12EDCF9D18820116005A7A07 /* parse_table.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parse_table.cpp; sourceTree = "<group>"; };
12EDCF9E18820116005A7A07 /* parse_table.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_table.h; sourceTree = "<group>"; };
@ -352,8 +349,6 @@
12EDCF901881FCCA005A7A07 /* extract_tokens.h */,
12EDCF911881FCCA005A7A07 /* perform.cpp */,
12EDCF921881FCCA005A7A07 /* perform.h */,
12EDCF931881FCCA005A7A07 /* search_for_symbols.cpp */,
12EDCF941881FCCA005A7A07 /* search_for_symbols.h */,
);
name = prepare_grammar;
path = grammar;
@ -540,7 +535,6 @@
12F9A651182DD6BC00FAF50C /* grammar.cpp in Sources */,
12D136A4183678A2005F3369 /* repeat.cpp in Sources */,
1225CC6418765693000D4723 /* prepare_grammar_spec.cpp in Sources */,
12EDCF9A1881FCD9005A7A07 /* search_for_symbols.cpp in Sources */,
12EDCFB21882039A005A7A07 /* perform.cpp in Sources */,
1236A7D218B554C800593ABB /* prepared_grammar.cpp in Sources */,
12FD40E718639B910041A84E /* visitor.cpp in Sources */,