diff --git a/examples/grammars/arithmetic.cc b/examples/grammars/arithmetic.cc index 915c7ed9..e8750ee1 100644 --- a/examples/grammars/arithmetic.cc +++ b/examples/grammars/arithmetic.cc @@ -28,6 +28,9 @@ extern const Grammar arithmetic = Grammar({ { "variable", pattern("\\a[\\w_]*") }, { "comment", pattern("#.*") }, -}).ubiquitous_tokens({ "comment" }); +}).ubiquitous_tokens({ + sym("comment"), + pattern("\\s"), +}); } // namespace tree_sitter_examples diff --git a/examples/grammars/golang.cc b/examples/grammars/golang.cc index 4a868771..4a11fc87 100644 --- a/examples/grammars/golang.cc +++ b/examples/grammars/golang.cc @@ -166,8 +166,10 @@ extern const Grammar golang = Grammar({ { "_identifier", pattern("\\a[\\w_]*") }, { "number", pattern("\\d+(\\.\\d+)?") }, { "comment", keypattern("//[^\n]*") }, -}) - .ubiquitous_tokens({ "comment", "_line_break" }) - .separators({ ' ', '\t', '\r' }); +}).ubiquitous_tokens({ + sym("comment"), + sym("_line_break"), + pattern("[ \t\r]"), +}); } // namespace tree_sitter_examples diff --git a/examples/grammars/javascript.cc b/examples/grammars/javascript.cc index 0987c3df..cd25f547 100644 --- a/examples/grammars/javascript.cc +++ b/examples/grammars/javascript.cc @@ -213,8 +213,10 @@ extern const Grammar javascript = Grammar({ { "null", keyword("null") }, { "true", keyword("true") }, { "false", keyword("false") }, -}) - .ubiquitous_tokens({ "comment", "_line_break" }) - .separators({ ' ', '\t', '\r' }); +}).ubiquitous_tokens({ + sym("comment"), + sym("_line_break"), + pattern("[ \t\r]"), +}); } // namespace tree_sitter_examples diff --git a/examples/grammars/json.cc b/examples/grammars/json.cc index 81af2368..457d26d0 100644 --- a/examples/grammars/json.cc +++ b/examples/grammars/json.cc @@ -6,7 +6,7 @@ namespace tree_sitter_examples { using tree_sitter::Grammar; using namespace tree_sitter::rules; -extern const Grammar json({ +extern const Grammar json = Grammar({ { "value", choice({ sym("object"), sym("array"), @@ -25,6 +25,8 @@ 
extern const Grammar json({ { "null", keyword("null") }, { "true", keyword("true") }, { "false", keyword("false") }, +}).ubiquitous_tokens({ + pattern("\\s"), }); } // namespace tree_sitter_examples diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h index a6941088..710a065f 100644 --- a/include/tree_sitter/compiler.h +++ b/include/tree_sitter/compiler.h @@ -33,8 +33,7 @@ std::ostream &operator<<(std::ostream &stream, const rules::rule_ptr &rule); class Grammar { const std::vector > rules_; - std::set ubiquitous_tokens_; - std::set separators_; + std::set ubiquitous_tokens_; public: Grammar(const std::vector > &rules); @@ -42,10 +41,9 @@ class Grammar { std::string start_rule_name() const; const rules::rule_ptr rule(const std::string &name) const; const std::vector > &rules() const; - const std::set &ubiquitous_tokens() const; - Grammar &ubiquitous_tokens(const std::set &ubiquitous_tokens); - const std::set &separators() const; - Grammar &separators(const std::set &separators); + const std::set &ubiquitous_tokens() const; + Grammar &ubiquitous_tokens( + const std::set &ubiquitous_tokens); }; struct Conflict { @@ -57,7 +55,8 @@ struct Conflict { enum GrammarErrorType { GrammarErrorTypeRegex, - GrammarErrorTypeUndefinedSymbol + GrammarErrorTypeUndefinedSymbol, + GrammarErrorTypeInvalidUbiquitousToken }; class GrammarError { diff --git a/spec/compiler/build_tables/conflict_manager_spec.cc b/spec/compiler/build_tables/conflict_manager_spec.cc index 8d597aeb..6006565c 100644 --- a/spec/compiler/build_tables/conflict_manager_spec.cc +++ b/spec/compiler/build_tables/conflict_manager_spec.cc @@ -21,7 +21,7 @@ describe("resolving parse conflicts", []() { { "token1", pattern("[a-c]") }, { "token2", pattern("[b-d]") }, { "token3", keyword("stuff") }, - }, {}, set()); + }, {}, {}); describe("lexical conflicts", [&]() { Symbol sym1(0, SymbolOptionToken); diff --git a/spec/compiler/helpers/containers.h b/spec/compiler/helpers/containers.h index 
0757d5d5..6c1ecb37 100644 --- a/spec/compiler/helpers/containers.h +++ b/spec/compiler/helpers/containers.h @@ -19,13 +19,13 @@ template class rule_map : public map { public: bool operator==(const map &other) const { - if (this->size() != other.size()) return false; - for (const auto &pair : *this) { - auto other_pair = other.find(pair.first); - if (other_pair == other.end()) return false; - if (!pair.second->operator==(*other_pair->second)) return false; - } - return true; + if (this->size() != other.size()) return false; + for (const auto &pair : *this) { + auto other_pair = other.find(pair.first); + if (other_pair == other.end()) return false; + if (!pair.second->operator==(*other_pair->second)) return false; + } + return true; } rule_map(const initializer_list> &list) : map(list) {} @@ -34,19 +34,35 @@ class rule_map : public map { class rule_list : public vector> { public: bool operator==(const vector> &other) const { - if (this->size() != other.size()) return false; - for (size_t i = 0; i < this->size(); i++) { - auto pair = this->operator[](i); - auto other_pair = other[i]; - if (!pair.second->operator==(*other_pair.second)) - return false; - } - return true; + if (this->size() != other.size()) return false; + for (size_t i = 0; i < this->size(); i++) { + auto pair = this->operator[](i); + auto other_pair = other[i]; + if (!pair.second->operator==(*other_pair.second)) + return false; + } + return true; } rule_list(const initializer_list> &list) : vector>(list) {} }; +class rule_vector : public vector { + public: + bool operator==(const vector &other) const { + if (this->size() != other.size()) return false; + for (size_t i = 0; i < this->size(); i++) { + auto rule = this->operator[](i); + auto other_rule = other[i]; + if (!rule->operator==(*other_rule)) /* compare with the element at the same index in `other`, not with itself */ + return false; + } + return true; + } + + rule_vector(const initializer_list &list) : + vector(list) {} +}; #endif // HELPERS_CONTAINERS_H_ diff --git a/spec/compiler/prepare_grammar/expand_repeats_spec.cc
b/spec/compiler/prepare_grammar/expand_repeats_spec.cc index 07232085..e9ea305c 100644 --- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc +++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc @@ -77,7 +77,7 @@ describe("expand_repeats", []() { auto match = expand_repeats(grammar); AssertThat(match.rules, Equals(rule_list({ - { "rule0", seq({ + { "rule0", seq({ choice({ i_aux_sym(0), blank() }), choice({ i_aux_sym(1), blank() }) }) }, }))); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index b9735e73..1d3f310a 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -1,7 +1,6 @@ #include "compiler/compiler_spec_helper.h" #include "compiler/prepared_grammar.h" #include "compiler/prepare_grammar/extract_tokens.h" -#include "compiler/prepare_grammar/interned_grammar.h" #include "compiler/prepared_grammar.h" #include "compiler/helpers/containers.h" @@ -9,175 +8,226 @@ START_TEST using namespace rules; using prepare_grammar::extract_tokens; -using prepare_grammar::InternedGrammar; -describe("extracting tokens from a grammar", []() { +describe("extract_tokens", []() { it("moves string rules into the lexical grammar", [&]() { - pair result = extract_tokens(InternedGrammar{ - { + tuple result = + extract_tokens(Grammar({ { "rule_A", seq({ str("ab"), i_sym(0) }) } - }, - set(), - set() - }); + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", seq({ i_aux_token(0), i_sym(0) }) } }))); - AssertThat(result.first.aux_rules, IsEmpty()) - AssertThat(result.second.rules, IsEmpty()) - AssertThat(result.second.aux_rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()) + + AssertThat(get<1>(result).rules, IsEmpty()) + AssertThat(get<1>(result).aux_rules, Equals(rule_list({ { "'ab'", str("ab") }, }))); }); it("moves pattern rules into 
the lexical grammar", [&]() { - pair result = extract_tokens(InternedGrammar{ - { - { "rule_A", seq({ pattern("a+"), i_sym(0) }) } - }, - set(), - set() - }); + auto result = extract_tokens(Grammar({ + { "rule_A", seq({ pattern("a+"), i_sym(0) }) } + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", seq({ i_aux_token(0), i_sym(0) }) } }))); - AssertThat(result.first.aux_rules, IsEmpty()) - AssertThat(result.second.rules, IsEmpty()) - AssertThat(result.second.aux_rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()) + + AssertThat(get<1>(result).rules, IsEmpty()) + AssertThat(get<1>(result).aux_rules, Equals(rule_list({ { "/a+/", pattern("a+") }, }))); }); it("moves other rules marked as tokens into the lexical grammar", [&]() { - pair result = extract_tokens(InternedGrammar{ - { - { "rule_A", seq({ - token(seq({ pattern("."), choice({ str("a"), str("b") }) })), - i_sym(0) }) } - }, - set(), - set() - }); + auto result = extract_tokens(Grammar({ + { "rule_A", seq({ + token(seq({ pattern("."), choice({ str("a"), str("b") }) })), + i_sym(0) }) } + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", seq({ i_aux_token(0), i_sym(0) }) } }))); - AssertThat(result.first.aux_rules, IsEmpty()) - AssertThat(result.second.rules, IsEmpty()) - AssertThat(result.second.aux_rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()) + + AssertThat(get<1>(result).rules, IsEmpty()) + AssertThat(get<1>(result).aux_rules, Equals(rule_list({ { "(seq /./ (choice 'a' 'b'))", token(seq({ pattern("."), choice({ str("a"), str("b") }) })) }, }))); }); - it("does not extract blanks", [&]() { - pair result = extract_tokens(InternedGrammar{ - { - { "rule_A", choice({ i_sym(0), blank() }) }, - }, - set(), - set() - }); + it("does not move blank rules", [&]() { + auto result = extract_tokens(Grammar({ + { "rule_A", 
choice({ i_sym(0), blank() }) }, + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", choice({ i_sym(0), blank() }) }, }))); - AssertThat(result.first.aux_rules, IsEmpty()) - AssertThat(result.second.rules, IsEmpty()) - AssertThat(result.second.aux_rules, IsEmpty()) + AssertThat(get<0>(result).aux_rules, IsEmpty()) + + AssertThat(get<1>(result).rules, IsEmpty()) + AssertThat(get<1>(result).aux_rules, IsEmpty()) }); it("does not create duplicate tokens in the lexical grammar", [&]() { - pair result = extract_tokens(InternedGrammar{ - { - { "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) }, - }, - set(), - set() - }); + auto result = extract_tokens(Grammar({ + { "rule_A", seq({ str("ab"), i_sym(0), str("ab") }) }, + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", seq({ i_aux_token(0), i_sym(0), i_aux_token(0) }) } }))); - AssertThat(result.first.aux_rules, IsEmpty()) - AssertThat(result.second.rules, IsEmpty()) - AssertThat(result.second.aux_rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()) + + AssertThat(get<1>(result).rules, IsEmpty()) + AssertThat(get<1>(result).aux_rules, Equals(rule_list({ { "'ab'", str("ab") }, }))) }); - it("preserves the separator characters in the lexical grammar", [&]() { - pair result = extract_tokens(InternedGrammar{ - { - { "rule_A", str("ab") }, - }, - set(), - { 'x', 'y', 'z' } - }); - - AssertThat(result.second.separators, Equals(set({ 'x', 'y', 'z' }))); - }); - describe("when an entire rule can be extracted", [&]() { it("moves the rule the lexical grammar when possible and updates referencing symbols", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - { "rule_A", i_sym(1) }, - { "rule_B", pattern("a|b") }, - { "rule_C", token(seq({ str("a"), str("b") })) }, - }, - set(), - set() - }); + auto result = extract_tokens(Grammar({ + { "rule_A", i_sym(1) 
}, + { "rule_B", pattern("a|b") }, + { "rule_C", token(seq({ str("a"), str("b") })) }, + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_A", i_token(0) } }))); - AssertThat(result.first.aux_rules, IsEmpty()); - AssertThat(result.second.rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()); + + AssertThat(get<1>(result).rules, Equals(rule_list({ { "rule_B", pattern("a|b") }, { "rule_C", token(seq({ str("a"), str("b") })) }, }))); - AssertThat(result.second.aux_rules, IsEmpty()); + AssertThat(get<1>(result).aux_rules, IsEmpty()); }); it("updates symbols whose indices need to change due to deleted rules", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - { "rule_A", str("ab") }, - { "rule_B", i_sym(0) }, - { "rule_C", i_sym(1) }, - }, - set(), - set() - }); + auto result = extract_tokens(Grammar({ + { "rule_A", str("ab") }, + { "rule_B", i_sym(0) }, + { "rule_C", i_sym(1) }, + })); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(get<0>(result).rules, Equals(rule_list({ { "rule_B", i_token(0) }, { "rule_C", i_sym(0) }, }))); - AssertThat(result.first.aux_rules, IsEmpty()); - AssertThat(result.second.rules, Equals(rule_list({ + AssertThat(get<0>(result).aux_rules, IsEmpty()); + + AssertThat(get<1>(result).rules, Equals(rule_list({ { "rule_A", str("ab") }, }))); - AssertThat(result.second.aux_rules, IsEmpty()); + AssertThat(get<1>(result).aux_rules, IsEmpty()); + }); + }); + + describe("handling ubiquitous tokens!", [&]() { + describe("ubiquitous tokens that are not symbols", [&]() { + it("adds them to the lexical grammar's separators", [&]() { + auto result = extract_tokens(Grammar({ + { "rule_A", str("x") }, + }).ubiquitous_tokens({ + pattern("\\s+"), + str("y"), + })); + + AssertThat(get<2>(result), Equals(nullptr)); + + AssertThat(get<1>(result).separators, Equals(rule_vector({ + pattern("\\s+"), + str("y"), + }))); + + 
AssertThat(get<0>(result).ubiquitous_tokens, IsEmpty()); + }); }); - it("updates the grammar's ubiquitous_tokens", [&]() { - auto result = extract_tokens(InternedGrammar{ - { - { "rule_A", str("ab") }, - { "rule_B", i_sym(0) }, - { "rule_C", i_sym(1) }, - }, - { Symbol(0) }, - set() - }); + describe("ubiquitous tokens that point to moved rules", [&]() { + it("updates them according to the new symbol numbers", [&]() { + auto result = extract_tokens(Grammar( { + { "rule_A", seq({ str("w"), i_sym(1) }) }, + { "rule_B", str("x") }, + { "rule_C", str("y") }, + }).ubiquitous_tokens({ + i_sym(2), + })); - AssertThat(result.first.ubiquitous_tokens, Equals(set({ - { Symbol(0, SymbolOptionToken) } - }))); + AssertThat(get<2>(result), Equals(nullptr)); + + AssertThat(get<0>(result).ubiquitous_tokens, Equals(set({ + { Symbol(1, SymbolOptionToken) }, + }))); + + AssertThat(get<1>(result).separators, IsEmpty()); + }); + }); + + describe("ubiquitous tokens that are visible", [&]() { + it("preserves them in the syntactic grammar", [&]() { + auto result = extract_tokens(Grammar({ + { "rule_A", str("ab") }, + { "rule_B", str("bc") }, + }).ubiquitous_tokens({ i_sym(1) })); + + AssertThat(get<2>(result), Equals(nullptr)); + + AssertThat(get<0>(result).ubiquitous_tokens, Equals(set({ + Symbol(1, SymbolOptionToken) + }))); + + AssertThat(get<1>(result).separators, IsEmpty()); + }); + }); + + describe("ubiquitous tokens that are used in other grammar rules", [&]() { + it("preserves them in the syntactic grammar", [&]() { + auto result = extract_tokens(Grammar({ + { "rule_A", seq({ i_sym(1), str("ab") }) }, + { "_rule_B", str("bc") }, + }).ubiquitous_tokens({ i_sym(1) })); + + AssertThat(get<2>(result), Equals(nullptr)); + + AssertThat(get<0>(result).ubiquitous_tokens, Equals(set({ + Symbol(0, SymbolOptionToken), + }))); + + AssertThat(get<1>(result).separators, IsEmpty()); + }); + }); + + describe("ubiquitous tokens that are non-token symbols", [&]() { + it("returns an error", [&]() { + 
auto result = extract_tokens(Grammar({ + { "rule_A", seq({ str("x"), i_sym(1) }), }, + { "rule_B", seq({ str("y"), str("z") }) }, + }).ubiquitous_tokens({ i_sym(1) })); + + AssertThat(get<2>(result), !Equals(nullptr)); + AssertThat(get<2>(result), EqualsPointer(new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, "Not a token: (sym 1)"))); + }); + }); + + describe("ubiquitous tokens that are non-token symbols", [&]() { + it("returns an error", [&]() { + auto result = extract_tokens(Grammar({ + { "rule_A", str("x") }, + { "rule_B", str("y") }, + }).ubiquitous_tokens({ choice({ i_sym(1), blank() }) })); + + AssertThat(get<2>(result), !Equals(nullptr)); + AssertThat(get<2>(result), EqualsPointer(new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, "Not a token: (choice (sym 1) (blank))"))); + }); }); }); }); diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc index 32569f00..60a46550 100644 --- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc +++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc @@ -21,7 +21,7 @@ describe("interning symbols in a grammar", []() { auto result = intern_symbols(grammar); AssertThat(result.second, Equals((GrammarError *)nullptr)); - AssertThat(result.first.rules, Equals(rule_list({ + AssertThat(result.first.rules(), Equals(rule_list({ { "x", choice({ i_sym(1), i_sym(2) }) }, { "y", i_sym(2) }, { "z", str("stuff") }, @@ -45,24 +45,13 @@ describe("interning symbols in a grammar", []() { { "x", choice({ sym("y"), sym("z") }) }, { "y", sym("z") }, { "z", str("stuff") } - }).ubiquitous_tokens({ "z" }); + }).ubiquitous_tokens({ sym("z") }); auto result = intern_symbols(grammar); AssertThat(result.second, Equals((GrammarError *)nullptr)); - AssertThat(result.first.ubiquitous_tokens, Equals(set({ - Symbol(2) - }))); - }); - - it("preserves the grammar's separator character set", [&]() { - auto grammar = Grammar({ - { "z", str("stuff") } - }).separators({ 'x', 
'y' }); - - auto result = intern_symbols(grammar); - - AssertThat(result.first.separators, Equals(set({ 'x', 'y' }))) + AssertThat(result.first.ubiquitous_tokens().size(), Equals(1)); + AssertThat(*result.first.ubiquitous_tokens().begin(), EqualsPointer(i_sym(2))); }); }); diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index 1c12ab38..30ae56ea 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -102,7 +102,7 @@ describe("Document", [&]() { it("updates the parse tree", [&]() { AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals( "(DOCUMENT (exponent " - "(variable) " + "(variable) " "(group (sum (number) (product (variable) (number))))))")); }); diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index 76368952..5f682d05 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -3,10 +3,12 @@ #include #include #include +#include #include #include "compiler/prepared_grammar.h" #include "compiler/rules/built_in_symbols.h" #include "compiler/rules/metadata.h" +#include "compiler/rules/choice.h" #include "compiler/rules/repeat.h" #include "compiler/rules/blank.h" #include "compiler/rules/seq.h" @@ -22,6 +24,8 @@ using std::map; using std::unordered_map; using std::set; using std::make_shared; +using std::vector; +using std::dynamic_pointer_cast; using rules::Symbol; using rules::CharacterSet; @@ -101,17 +105,24 @@ class LexTableBuilder { lex_table.state(state_id).is_token_start = true; } - CharacterSet separator_set() const { - CharacterSet result; - for (char c : lex_grammar.separators) - result.include(c); - return result; + // TODO - remove this hack. right now, nested repeats cause + // item sets which are equivalent to appear unequal. 
+ rules::rule_ptr separators() const { + std::vector separators; + for (auto &rule : lex_grammar.separators) { + auto repeat = dynamic_pointer_cast(rule); + if (repeat.get()) + separators.push_back(repeat->content); + else + separators.push_back(rule); + } + return rules::repeat(rules::choice(separators)); } rules::rule_ptr after_separators(rules::rule_ptr rule) { return rules::Seq::Build( { make_shared( - make_shared(separator_set().copy()), + separators(), map( { { rules::START_TOKEN, 1 }, { rules::PRECEDENCE, -1 }, })), rule, }); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index 73c689c4..abdcfa64 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -137,7 +137,9 @@ class CCodeGenerator { line("#pragma GCC diagnostic push"); line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\""); line(); - line("static const TSParseAction ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {"); + line( + "static const TSParseAction " + "ts_parse_actions[STATE_COUNT][SYMBOL_COUNT] = {"); indent([&]() { for (auto &state : parse_table.states) { diff --git a/src/compiler/grammar.cc b/src/compiler/grammar.cc index 3057917e..783037cc 100644 --- a/src/compiler/grammar.cc +++ b/src/compiler/grammar.cc @@ -12,9 +12,7 @@ using rules::rule_ptr; Grammar::Grammar( const std::vector > &rules) - : rules_(rules), - ubiquitous_tokens_({}), - separators_({ ' ', '\r', '\t', '\n' }) {} + : rules_(rules), ubiquitous_tokens_({}) {} bool Grammar::operator==(const Grammar &other) const { if (other.rules_.size() != rules_.size()) @@ -63,22 +61,15 @@ ostream &operator<<(ostream &stream, const GrammarError *error) { return stream << string("#"); } -const set &Grammar::ubiquitous_tokens() const { +const set &Grammar::ubiquitous_tokens() const { return ubiquitous_tokens_; } -Grammar &Grammar::ubiquitous_tokens(const set &ubiquitous_tokens) { +Grammar &Grammar::ubiquitous_tokens(const set &ubiquitous_tokens) { 
ubiquitous_tokens_ = ubiquitous_tokens; return *this; } -const set &Grammar::separators() const { return separators_; } - -Grammar &Grammar::separators(const set &separators) { - separators_ = separators; - return *this; -} - const vector > &Grammar::rules() const { return rules_; } } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index c611644b..2d21d4f2 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -35,10 +35,11 @@ class ExpandRepeats : public rules::IdentityRuleFn { string helper_rule_name = rule_name + string("_repeat") + to_string(index); rule_ptr repeat_symbol = make_shared(offset + index, rules::SymbolOptionAuxiliary); - aux_rules.push_back({ - helper_rule_name, - Seq::Build({ inner_rule, Choice::Build({ repeat_symbol, make_shared() }) }) - }); + aux_rules.push_back( + { helper_rule_name, + Seq::Build( + { inner_rule, + Choice::Build({ repeat_symbol, make_shared() }) }) }); return Choice::Build({ repeat_symbol, make_shared() }); } diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc index 2650f291..674339d2 100644 --- a/src/compiler/prepare_grammar/expand_tokens.cc +++ b/src/compiler/prepare_grammar/expand_tokens.cc @@ -46,7 +46,8 @@ class ExpandTokens : public rules::IdentityRuleFn { pair expand_tokens( const LexicalGrammar &grammar) { - vector > rules, aux_rules; + vector> rules, aux_rules; + vector separators; ExpandTokens expander; for (auto &pair : grammar.rules) { @@ -63,7 +64,14 @@ pair expand_tokens( aux_rules.push_back({ pair.first, rule }); } - return { LexicalGrammar(rules, aux_rules, grammar.separators), nullptr, }; + for (auto &sep : grammar.separators) { + auto rule = expander.apply(sep); + if (expander.error) + return { LexicalGrammar(), expander.error }; + separators.push_back(rule); + } + + return { LexicalGrammar(rules, aux_rules, 
separators), nullptr, }; } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 6b634b05..2686f5b4 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -10,7 +10,6 @@ #include "compiler/rules/string.h" #include "compiler/rules/metadata.h" #include "compiler/rules/pattern.h" -#include "compiler/prepare_grammar/interned_grammar.h" #include "compiler/prepare_grammar/token_description.h" #include "compiler/prepare_grammar/is_token.h" @@ -18,15 +17,27 @@ namespace tree_sitter { namespace prepare_grammar { using std::pair; +using std::tuple; using std::string; using std::map; using std::to_string; using std::vector; using std::set; using std::make_shared; +using std::dynamic_pointer_cast; using rules::rule_ptr; using rules::Symbol; +using rules::SymbolOptionToken; +using rules::SymbolOptionAuxToken; +class UsedSymbols : public rules::IdentityRuleFn { + set used_symbols_; + + rules::rule_ptr apply(rules::Symbol *sym) { + used_symbols_.insert(*sym); + return sym->copy(); + } +}; class SymbolInliner : public rules::IdentityRuleFn { map replacements; @@ -59,8 +70,6 @@ class SymbolInliner : public rules::IdentityRuleFn { }; class TokenExtractor : public rules::IdentityRuleFn { - const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption( - rules::SymbolOptionToken | rules::SymbolOptionAuxiliary); rule_ptr apply_to_token(const rules::Rule *input) { auto rule = input->copy(); @@ -91,23 +100,28 @@ class TokenExtractor : public rules::IdentityRuleFn { } public: - vector > tokens; + vector> tokens; }; -pair extract_tokens( - const InternedGrammar &input_grammar) { - vector > rules, tokens, aux_rules, aux_tokens; - set ubiquitous_tokens; +static const GrammarError *ubiq_token_err(const string &msg) { + return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken, msg); +} - TokenExtractor extractor; +tuple 
extract_tokens( + const Grammar &grammar) { + vector> rules, tokens, aux_rules, aux_tokens; + vector separators; + set ubiquitous_tokens; map symbol_replacements; - for (size_t i = 0; i < input_grammar.rules.size(); i++) { - auto pair = input_grammar.rules[i]; + TokenExtractor extractor; + + for (size_t i = 0; i < grammar.rules().size(); i++) { + auto pair = grammar.rules()[i]; if (is_token(pair.second)) { tokens.push_back(pair); symbol_replacements.insert( - { Symbol(i), Symbol(tokens.size() - 1, rules::SymbolOptionToken) }); + { Symbol(i), Symbol(tokens.size() - 1, SymbolOptionToken) }); } else { rules.push_back({ pair.first, extractor.apply(pair.second) }); } @@ -119,11 +133,27 @@ pair extract_tokens( SymbolInliner inliner(symbol_replacements); for (auto &pair : rules) pair.second = inliner.apply(pair.second); - for (auto &symbol : input_grammar.ubiquitous_tokens) - ubiquitous_tokens.insert(inliner.replace_symbol(symbol)); + + for (auto rule : grammar.ubiquitous_tokens()) { + if (is_token(rule)) { + separators.push_back(rule); + } else { + auto sym = dynamic_pointer_cast(extractor.apply(rule)); + if (!sym.get()) + return { SyntaxGrammar(), LexicalGrammar(), + ubiq_token_err("Not a token: " + rule->to_string()) }; + + Symbol symbol = inliner.replace_symbol(*sym); + if (!symbol.is_token()) + return { SyntaxGrammar(), LexicalGrammar(), + ubiq_token_err("Not a token: " + symbol.to_string()) }; + + ubiquitous_tokens.insert(symbol); + } + } return { SyntaxGrammar(rules, aux_rules, ubiquitous_tokens), - LexicalGrammar(tokens, aux_tokens, input_grammar.separators), }; + LexicalGrammar(tokens, aux_tokens, separators), nullptr }; } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/extract_tokens.h b/src/compiler/prepare_grammar/extract_tokens.h index fc94c0f1..1f3b3413 100644 --- a/src/compiler/prepare_grammar/extract_tokens.h +++ b/src/compiler/prepare_grammar/extract_tokens.h @@ -2,17 +2,18 @@ #define COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_ 
#include -#include "compiler/prepare_grammar/interned_grammar.h" +#include "tree_sitter/compiler.h" namespace tree_sitter { +class Grammar; class SyntaxGrammar; class LexicalGrammar; namespace prepare_grammar { -std::pair extract_tokens( - const InternedGrammar &); +std::tuple extract_tokens( + const Grammar &); } // namespace prepare_grammar } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc index f7b21603..79b9a710 100644 --- a/src/compiler/prepare_grammar/intern_symbols.cc +++ b/src/compiler/prepare_grammar/intern_symbols.cc @@ -3,7 +3,6 @@ #include #include #include "tree_sitter/compiler.h" -#include "compiler/prepare_grammar/interned_grammar.h" #include "compiler/prepared_grammar.h" #include "compiler/rules/visitor.h" #include "compiler/rules/named_symbol.h" @@ -42,15 +41,13 @@ class InternSymbols : public rules::IdentityRuleFn { string missing_rule_name; }; -pair missing_rule_error( - string rule_name) { - InternedGrammar grammar; - return { grammar, new GrammarError(GrammarErrorTypeUndefinedSymbol, - "Undefined rule '" + rule_name + "'") }; +pair missing_rule_error(string rule_name) { + return { Grammar({}), + new GrammarError(GrammarErrorTypeUndefinedSymbol, + "Undefined rule '" + rule_name + "'") }; } -pair intern_symbols( - const Grammar &grammar) { +pair intern_symbols(const Grammar &grammar) { InternSymbols interner(grammar); vector > rules; @@ -61,20 +58,15 @@ pair intern_symbols( rules.push_back({ pair.first, new_rule }); } - set ubiquitous_tokens; - for (auto &name : grammar.ubiquitous_tokens()) { - auto token = interner.symbol_for_rule_name(name); - if (!token.get()) - return missing_rule_error(name); - ubiquitous_tokens.insert(*token); + set ubiquitous_tokens; + for (auto &rule : grammar.ubiquitous_tokens()) { + auto new_rule = interner.apply(rule); + if (!interner.missing_rule_name.empty()) + return missing_rule_error(interner.missing_rule_name); + 
ubiquitous_tokens.insert(new_rule); } - InternedGrammar result; - result.rules = rules; - result.ubiquitous_tokens = ubiquitous_tokens; - result.separators = grammar.separators(); - - return { result, nullptr }; + return { Grammar(rules).ubiquitous_tokens(ubiquitous_tokens), nullptr }; } } // namespace prepare_grammar diff --git a/src/compiler/prepare_grammar/intern_symbols.h b/src/compiler/prepare_grammar/intern_symbols.h index bc6380e7..9530f90d 100644 --- a/src/compiler/prepare_grammar/intern_symbols.h +++ b/src/compiler/prepare_grammar/intern_symbols.h @@ -4,7 +4,6 @@ #include #include #include "tree_sitter/compiler.h" -#include "compiler/prepare_grammar/interned_grammar.h" namespace tree_sitter { @@ -12,8 +11,7 @@ class Grammar; namespace prepare_grammar { -std::pair intern_symbols( - const Grammar &); +std::pair intern_symbols(const Grammar &); } // namespace prepare_grammar } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h deleted file mode 100644 index 5baeb808..00000000 --- a/src/compiler/prepare_grammar/interned_grammar.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ -#define COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ - -#include -#include -#include -#include -#include "tree_sitter/compiler.h" -#include "compiler/rules/symbol.h" - -namespace tree_sitter { -namespace prepare_grammar { - -class InternedGrammar { - public: - std::vector > rules; - std::set ubiquitous_tokens; - std::set separators; -}; - -} // namespace prepare_grammar -} // namespace tree_sitter - -#endif // COMPILER_PREPARE_GRAMMAR_INTERNED_GRAMMAR_H_ diff --git a/src/compiler/prepare_grammar/is_token.cc b/src/compiler/prepare_grammar/is_token.cc index a4afa85d..b7d14a7e 100644 --- a/src/compiler/prepare_grammar/is_token.cc +++ b/src/compiler/prepare_grammar/is_token.cc @@ -16,9 +16,7 @@ class IsToken : public rules::RuleFn { } }; -bool is_token(const 
rules::rule_ptr &rule) { - return IsToken().apply(rule); -} +bool is_token(const rules::rule_ptr &rule) { return IsToken().apply(rule); } } // namespace prepare_grammar } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/is_token.h b/src/compiler/prepare_grammar/is_token.h index 8ee7c666..0a40f10d 100644 --- a/src/compiler/prepare_grammar/is_token.h +++ b/src/compiler/prepare_grammar/is_token.h @@ -12,4 +12,3 @@ bool is_token(const rules::rule_ptr &); } // namespace tree_sitter #endif // COMPILER_PREPARE_GRAMMAR_IS_TOKEN_H_ - diff --git a/src/compiler/prepare_grammar/parse_regex.cc b/src/compiler/prepare_grammar/parse_regex.cc index ff6747d7..6912cbaf 100644 --- a/src/compiler/prepare_grammar/parse_regex.cc +++ b/src/compiler/prepare_grammar/parse_regex.cc @@ -182,11 +182,8 @@ class PatternParser { case 'd': return CharacterSet().include('0', '9'); case 's': - return CharacterSet() - .include(' ') - .include('\t') - .include('\n') - .include('\r'); + return CharacterSet().include(' ').include('\t').include('\n').include( + '\r'); case 't': return CharacterSet().include('\t'); case 'n': diff --git a/src/compiler/prepare_grammar/prepare_grammar.cc b/src/compiler/prepare_grammar/prepare_grammar.cc index ec117b80..eed630c5 100644 --- a/src/compiler/prepare_grammar/prepare_grammar.cc +++ b/src/compiler/prepare_grammar/prepare_grammar.cc @@ -2,7 +2,6 @@ #include "compiler/prepare_grammar/expand_tokens.h" #include "compiler/prepare_grammar/extract_tokens.h" #include "compiler/prepare_grammar/intern_symbols.h" -#include "compiler/prepare_grammar/interned_grammar.h" #include "compiler/prepare_grammar/prepare_grammar.h" #include "compiler/prepared_grammar.h" @@ -10,23 +9,26 @@ namespace tree_sitter { namespace prepare_grammar { using std::tuple; +using std::get; using std::make_tuple; tuple prepare_grammar( const Grammar &input_grammar) { auto result = intern_symbols(input_grammar); - const InternedGrammar &grammar = result.first; + const Grammar &grammar = 
result.first; const GrammarError *error = result.second; - if (error) return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); auto grammars = extract_tokens(grammar); - const SyntaxGrammar &rule_grammar = expand_repeats(grammars.first); - auto expand_tokens_result = expand_tokens(grammars.second); + const SyntaxGrammar &rule_grammar = expand_repeats(get<0>(grammars)); + error = get<2>(grammars); + if (error) + return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); + + auto expand_tokens_result = expand_tokens(get<1>(grammars)); const LexicalGrammar &lex_grammar = expand_tokens_result.first; error = expand_tokens_result.second; - if (error) return make_tuple(SyntaxGrammar(), LexicalGrammar(), error); diff --git a/src/compiler/prepared_grammar.cc b/src/compiler/prepared_grammar.cc index 82687924..7d0d80e9 100644 --- a/src/compiler/prepared_grammar.cc +++ b/src/compiler/prepared_grammar.cc @@ -50,7 +50,7 @@ SyntaxGrammar::SyntaxGrammar( LexicalGrammar::LexicalGrammar( const vector > &rules, const vector > &aux_rules, - const set &separators) + const vector &separators) : PreparedGrammar(rules, aux_rules), separators(separators) {} } // namespace tree_sitter diff --git a/src/compiler/prepared_grammar.h b/src/compiler/prepared_grammar.h index be015c9d..cf681f35 100644 --- a/src/compiler/prepared_grammar.h +++ b/src/compiler/prepared_grammar.h @@ -47,9 +47,9 @@ class LexicalGrammar : public PreparedGrammar { LexicalGrammar( const std::vector > &rules, const std::vector > &aux_rules, - const std::set &separators); + const std::vector &separators); - std::set separators; + std::vector separators; }; } // namespace tree_sitter diff --git a/src/compiler/rules/symbol.cc b/src/compiler/rules/symbol.cc index 7f155953..086f62aa 100644 --- a/src/compiler/rules/symbol.cc +++ b/src/compiler/rules/symbol.cc @@ -10,6 +10,9 @@ using std::string; using std::to_string; using std::hash; +SymbolOption SymbolOptionAuxToken = + SymbolOption(SymbolOptionToken | 
SymbolOptionAuxiliary); + Symbol::Symbol(int index) : index(index), options(SymbolOption(0)) {} Symbol::Symbol(int index, SymbolOption options) diff --git a/src/compiler/rules/symbol.h b/src/compiler/rules/symbol.h index 699ab8a6..25023cd9 100644 --- a/src/compiler/rules/symbol.h +++ b/src/compiler/rules/symbol.h @@ -12,6 +12,8 @@ typedef enum { SymbolOptionAuxiliary = 1 << 1, } SymbolOption; +extern SymbolOption SymbolOptionAuxToken; + class Symbol : public Rule { public: explicit Symbol(int index);