Backfill tests for token extraction in auxiliary rules

This commit is contained in:
Max Brunsfeld 2014-05-12 13:21:14 -07:00
parent 608b5ce02b
commit 5245bc01fe
3 changed files with 19242 additions and 14280 deletions

File diff suppressed because it is too large Load diff

View file

@@ -8,13 +8,13 @@ using namespace rules;
using prepare_grammar::extract_tokens;
describe("extracting tokens from a grammar", []() {
it("moves strings into the lexical grammar", [&]() {
it("moves string rules into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ str("ab"), i_sym(0) }) }
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -22,13 +22,13 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("moves patterns into the lexical grammar", [&]() {
it("moves pattern rules into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ pattern("a+"), i_sym(0) }) }
{ "rule_A", seq({ pattern("a+"), i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -38,13 +38,13 @@ describe("extracting tokens from a grammar", []() {
it("moves other rules marked as tokens into the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({
{ "rule_A", seq({
token(choice({ str("a"), str("b") })),
i_sym(0) }) }
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -52,13 +52,13 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("does not extract blanks into tokens", [&]() {
it("does not extract blanks", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule1", choice({ i_sym(0), blank() }) },
{ "rule_A", choice({ i_sym(0), blank() }) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule1", choice({ i_sym(0), blank() }) },
{ "rule_A", choice({ i_sym(0), blank() }) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {})));
@@ -66,11 +66,11 @@ describe("extracting tokens from a grammar", []() {
it("does not create duplicate tokens in the lexical grammar", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({
{ "rule0", seq({ str("ab"), i_sym(0), str("ab") }) },
{ "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
{ "rule_A", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
@@ -78,52 +78,85 @@ describe("extracting tokens from a grammar", []() {
})));
});
it("moves entire rules into the lexical grammar when possible, updating referencing symbols", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", i_sym(1) },
{ "rule1", pattern("a|b") },
{ "rule2", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule0", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule1", pattern("a|b") },
{ "rule2", token(seq({ str("a"), str("b") })) },
}, {})));
it("extracts tokens from the grammar's auxiliary rules", [&]() {
pair<PreparedGrammar, PreparedGrammar> result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", seq({ str("ab"), i_sym(0) }) }
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_A", seq({ i_aux_token(0), i_sym(0) }) }
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "token0", str("ab") },
})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", str("ab") },
{ "rule1", i_sym(0) },
{ "rule2", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule1", i_token(0) },
{ "rule2", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule0", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule0", str("ab") },
{ "rule1", i_sym(0) },
{ "rule2", i_sym(1) },
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
describe("when an entire rule can be extracted", [&]() {
it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", i_sym(1) },
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_A", i_token(0) }
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_B", pattern("a|b") },
{ "rule_C", token(seq({ str("a"), str("b") })) },
}, {})));
});
it("updates symbols whose indices need to change due to deleted rules", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}));
AssertThat(result.first, Equals(PreparedGrammar({
{ "rule_B", i_token(0) },
{ "rule_C", i_sym(0) },
}, {})));
AssertThat(result.second, Equals(PreparedGrammar({
{ "rule_A", str("ab") },
}, {})));
});
it("updates the grammar's ubiquitous_tokens", [&]() {
auto result = extract_tokens(PreparedGrammar({
{ "rule_A", str("ab") },
{ "rule_B", i_sym(0) },
{ "rule_C", i_sym(1) },
}, {}, PreparedGrammarOptions({
{ Symbol(0) }
})));
AssertThat(result.first.options.ubiquitous_tokens, Equals(vector<Symbol>({
{ Symbol(0, SymbolOptionToken) }
})));
});
it("extracts entire auxiliary rules", [&]() {
auto result = extract_tokens(PreparedGrammar({}, {
{ "rule_A", str("ab") },
{ "rule_B", i_aux_sym(0) },
{ "rule_C", i_aux_sym(1) },
}));
AssertThat(result.first, Equals(PreparedGrammar({}, {
{ "rule_B", i_aux_token(0) },
{ "rule_C", i_aux_sym(0) },
})));
AssertThat(result.second, Equals(PreparedGrammar({}, {
{ "rule_A", str("ab") },
})));
});
});
});

View file

@@ -5,10 +5,6 @@
#include "tree_sitter/compiler.h"
#include "compiler/prepared_grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/choice.h"
#include "compiler/rules/repeat.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/string.h"
#include "compiler/rules/metadata.h"
@@ -23,7 +19,6 @@ namespace tree_sitter {
using std::make_shared;
using rules::rule_ptr;
using rules::Symbol;
using std::dynamic_pointer_cast;
namespace prepare_grammar {
class IsToken : public rules::RuleFn<bool> {
@@ -51,49 +46,46 @@ namespace tree_sitter {
public:
Symbol replace_symbol(const Symbol &rule) {
if (rule.is_built_in()) return rule;
auto replacement_pair = replacements.find(rule);
if (replacement_pair != replacements.end())
return replacement_pair->second;
else if (rule.is_built_in())
return rule;
else
return Symbol(new_index_for_symbol(rule), rule.options);
}
SymbolInliner(const map<Symbol, Symbol> &replacements, size_t rule_count, size_t aux_rule_count) :
replacements(replacements)
{}
SymbolInliner(const map<Symbol, Symbol> &replacements) : replacements(replacements) {}
};
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary);
class TokenExtractor : public rules::IdentityRuleFn {
size_t add_token(rule_ptr rule) {
rule_ptr apply_to_token(const rules::Rule *rule) {
auto result = rule->copy();
for (size_t i = 0; i < tokens.size(); i++)
if (tokens[i].second->operator==(*rule))
return i;
return make_shared<Symbol>(i, SymbolOptionAuxToken);
size_t index = tokens.size();
tokens.push_back({ "token" + to_string(index), rule });
return index;
}
tokens.push_back({ "token" + to_string(index), result });
return make_shared<Symbol>(index, SymbolOptionAuxToken);
rule_ptr apply_to_token(const rules::rule_ptr rule) {
size_t index = add_token(rule);
return make_shared<rules::Symbol>(index, rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary));
}
rule_ptr default_apply(const rules::Rule *rule) {
auto result = rule->copy();
if (IsToken().apply(result)) {
return apply_to_token(result);
if (IsToken().apply(rule->copy())) {
return apply_to_token(rule);
} else {
return result;
}
}
rule_ptr apply_to(const rules::Metadata *rule) {
if (rule->value_for(rules::IS_TOKEN)) {
return apply_to_token(rule->copy());
auto result = rule->copy();
if (IsToken().apply(rule->copy())) {
return apply_to_token(rule);
} else {
return make_shared<rules::Metadata>(apply(rule->rule), rule->value);
return rules::IdentityRuleFn::apply_to(rule);
}
}
@@ -103,6 +95,8 @@ namespace tree_sitter {
pair<PreparedGrammar, PreparedGrammar> extract_tokens(const PreparedGrammar &input_grammar) {
vector<pair<string, rule_ptr>> rules, tokens, aux_rules, aux_tokens;
vector<Symbol> ubiquitous_tokens;
TokenExtractor extractor;
map<Symbol, Symbol> symbol_replacements;
@@ -112,7 +106,7 @@ namespace tree_sitter {
tokens.push_back(pair);
symbol_replacements.insert({
Symbol(i),
Symbol(tokens.size() - 1, rules::SymbolOption(rules::SymbolOptionToken))
Symbol(tokens.size() - 1, rules::SymbolOptionToken)
});
} else {
rules.push_back({ pair.first, extractor.apply(pair.second) });
@@ -134,16 +128,13 @@ namespace tree_sitter {
aux_tokens.insert(aux_tokens.end(), extractor.tokens.begin(), extractor.tokens.end());
SymbolInliner inliner(symbol_replacements, input_grammar.rules.size(), input_grammar.aux_rules.size());
vector<Symbol> ubiquitous_tokens;
SymbolInliner inliner(symbol_replacements);
for (auto &pair : rules)
pair.second = inliner.apply(pair.second);
for (auto &pair : aux_rules)
pair.second = inliner.apply(pair.second);
for (auto &symbol : input_grammar.options.ubiquitous_tokens) {
for (auto &symbol : input_grammar.options.ubiquitous_tokens)
ubiquitous_tokens.push_back(inliner.replace_symbol(symbol));
}
PreparedGrammarOptions parse_options(input_grammar.options);
parse_options.ubiquitous_tokens = ubiquitous_tokens;