diff --git a/project.gyp b/project.gyp
index 536b308c..f2cf1237 100644
--- a/project.gyp
+++ b/project.gyp
@@ -25,6 +25,7 @@
       'src/compiler/compile.cc',
       'src/compiler/generate_code/c_code.cc',
       'src/compiler/lex_table.cc',
+      'src/compiler/lexical_grammar.cc',
       'src/compiler/parse_grammar.cc',
       'src/compiler/parse_table.cc',
       'src/compiler/precedence_range.cc',
diff --git a/spec/compiler/build_tables/compatible_tokens_spec.cc b/spec/compiler/build_tables/compatible_tokens_spec.cc
index 4dcf531a..ea3a9b68 100644
--- a/spec/compiler/build_tables/compatible_tokens_spec.cc
+++ b/spec/compiler/build_tables/compatible_tokens_spec.cc
@@ -14,17 +14,18 @@ START_TEST
 describe("recovery_tokens(rule)", []() {
   it("includes rules that can only begin and end with an explicit set of characters", [&]() {
     LexicalGrammar grammar;
+    grammar.separators = { character({ ' ' }), };
     grammar.variables = {
-      Variable("var0", VariableTypeNamed, character({}, false)),
-      Variable("var1", VariableTypeNamed, seq({
+      LexicalVariable("var0", VariableTypeNamed, character({}, false), false),
+      LexicalVariable("var1", VariableTypeNamed, seq({
        character({ 'a', 'b' }),
        character({}, false),
        character({ 'c', 'd' }),
-      })),
+      }), false),
     };

     AssertThat(get_compatible_tokens(grammar).recovery_tokens, Equals<set<Symbol>>({
       Symbol(1, Symbol::Terminal)
     }));
diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc
index 7042922f..27ceef69 100644
--- a/spec/compiler/build_tables/lex_item_spec.cc
+++ b/spec/compiler/build_tables/lex_item_spec.cc
@@ -13,11 +13,10 @@ START_TEST
 describe("LexItem", []() {
   describe("completion_status()", [&]() {
-    it("indicates whether the item is done, its precedence, and whether it is a string", [&]() {
+    it("indicates whether the item is done and its precedence", [&]() {
       LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' }));
       AssertThat(item1.completion_status().is_done, IsFalse());
       AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange()));
-      AssertThat(item1.completion_status().is_string, IsFalse());

       MetadataParams params;
       params.precedence = 3;
@@ -30,12 +29,10 @@ describe("LexItem", []() {

       AssertThat(item2.completion_status().is_done, IsTrue());
       AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3)));
-      AssertThat(item2.completion_status().is_string, IsTrue());

       LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' })));
       AssertThat(item3.completion_status().is_done, IsTrue());
       AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange()));
-      AssertThat(item3.completion_status().is_string, IsFalse());
     });
   });
 });
diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
index 6548f37a..13cbc01b 100644
--- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc
+++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc
@@ -12,12 +12,13 @@ using namespace rules;

 START_TEST

 describe("ParseItemSetBuilder", []() {
-  vector<Variable> lexical_variables;
+  vector<LexicalVariable> lexical_variables;
   for (size_t i = 0; i < 20; i++) {
-    lexical_variables.push_back(Variable{
+    lexical_variables.push_back({
       "token_" + to_string(i),
       VariableTypeNamed,
       blank(),
+      false
     });
   }
diff --git a/spec/compiler/prepare_grammar/expand_tokens_spec.cc b/spec/compiler/prepare_grammar/expand_tokens_spec.cc
index 0aa83b3a..936ad1a9 100644
--- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc
+++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc
@@ -15,89 +15,149 @@ describe("expand_tokens", []() {
  describe("string rules", [&]() {
    it("replaces strings with sequences of character sets", [&]() {
-      LexicalGrammar grammar{{
-        Variable("rule_A", VariableTypeNamed, seq({
-          i_sym(10),
-          str("xyz"),
-          i_sym(11),
-        })),
-      }, {}};
+      LexicalGrammar grammar {
+        {
+          LexicalVariable {
+            "rule_A",
+            VariableTypeNamed,
+            seq({
+              i_sym(10),
+              str("xyz"),
+              i_sym(11),
+            }),
+            false
+          }
+        },
+        {}
+      };

      auto result = expand_tokens(grammar);

      AssertThat(result.second, Equals(CompileError::none()));
-      AssertThat(result.first.variables, Equals(vector<Variable>({
-        Variable("rule_A", VariableTypeNamed, seq({
-          i_sym(10),
-          metadata(seq({
-            character({ 'x' }),
-            character({ 'y' }),
-            character({ 'z' }),
-          }), string_token_params),
-          i_sym(11),
-        })),
-      })));
+      AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
+        LexicalVariable {
+          "rule_A",
+          VariableTypeNamed,
+          seq({
+            i_sym(10),
+            metadata(seq({
+              character({ 'x' }),
+              character({ 'y' }),
+              character({ 'z' }),
+            }), string_token_params),
+            i_sym(11),
+          }),
+          false
+        }
+      }));
    });

    it("handles strings containing non-ASCII UTF8 characters", [&]() {
-      LexicalGrammar grammar{{
-        Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")),
-      }, {}};
+      LexicalGrammar grammar {
+        {
+          LexicalVariable {
+            "rule_A",
+            VariableTypeNamed,
+            str("\u03B1 \u03B2"),
+            false
+          },
+        },
+        {}
+      };

      auto result = expand_tokens(grammar);

-      AssertThat(result.first.variables, Equals(vector<Variable>({
-        Variable("rule_A", VariableTypeNamed, metadata(seq({
-          character({ 945 }),
-          character({ ' ' }),
-          character({ 946 }),
-        }), string_token_params)),
-      })));
+      AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
+        LexicalVariable {
+          "rule_A",
+          VariableTypeNamed,
+          metadata(seq({
+            character({ 945 }),
+            character({ ' ' }),
+            character({ 946 }),
+          }), string_token_params),
+          false
+        }
+      }));
    });
  });

  describe("regexp rules", [&]() {
    it("replaces regexps with the equivalent rule tree", [&]() {
-      LexicalGrammar grammar{{
-        Variable("rule_A", VariableTypeNamed, seq({
-          i_sym(10),
-          pattern("x*"),
-          i_sym(11),
-        })),
-      }, {}};
+      LexicalGrammar grammar {
+        {
+          LexicalVariable {
+            "rule_A",
+            VariableTypeNamed,
+            seq({
+              i_sym(10),
+              pattern("x*"),
+              i_sym(11),
+            }),
+            false
+          }
+        },
+        {}
+      };

      auto result = expand_tokens(grammar);

      AssertThat(result.second, Equals(CompileError::none()));
-      AssertThat(result.first.variables, Equals(vector<Variable>({
-        Variable("rule_A", VariableTypeNamed, seq({
-          i_sym(10),
-          repeat(character({ 'x' })),
-          i_sym(11),
-        })),
-      })));
+      AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
+        LexicalVariable {
+          "rule_A",
+          VariableTypeNamed,
+          seq({
+            i_sym(10),
+            repeat(character({ 'x' })),
+            i_sym(11),
+          }),
+          false
+        }
+      }));
    });

    it("handles regexps containing non-ASCII UTF8 characters", [&]() {
-      LexicalGrammar grammar{{
-        Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")),
-      }, {}};
+      LexicalGrammar grammar {
+        {
+          LexicalVariable {
+            "rule_A",
+            VariableTypeNamed,
+            pattern("[^\u03B1-\u03B4]*"),
+            false
+          }
+        },
+        {}
+      };

      auto result = expand_tokens(grammar);

-      AssertThat(result.first.variables, Equals(vector<Variable>({
-        Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))),
-      })));
+      AssertThat(result.first.variables, Equals(vector<LexicalVariable> {
+        LexicalVariable {
+          "rule_A",
+          VariableTypeNamed,
+          repeat(character({ 945, 946, 947, 948 }, false)),
+          false
+        }
+      }));
    });

    it("returns an error when the grammar contains an invalid regex", [&]() {
-      LexicalGrammar grammar{{
-        Variable("rule_A", VariableTypeNamed, seq({
pattern("("), - str("xyz"), - pattern("["), - })) - }, {}}; + LexicalGrammar grammar { + { + LexicalVariable { + "rule_A", + VariableTypeNamed, + seq({ + pattern("("), + str("xyz"), + pattern("["), + }), + false + }, + }, + {} + }; auto result = expand_tokens(grammar); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 3aa576df..de7c4fa5 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar; describe("extract_tokens", []() { it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, repeat1(seq({ - str("ab"), - pattern("cd*"), - choice({ - i_sym(1), - i_sym(2), - token(repeat1(choice({ str("ef"), str("gh") }))), - }), - }))), - Variable("rule_B", VariableTypeNamed, pattern("ij+")), - Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })), - Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3))) - }, {}, {}, {}}); + auto result = extract_tokens(InternedGrammar { + { + Variable("rule_A", VariableTypeNamed, repeat1(seq({ + str("ab"), + pattern("cd*"), + choice({ + i_sym(1), + i_sym(2), + token(repeat1(choice({ str("ef"), str("gh") }))), + }), + }))), + Variable("rule_B", VariableTypeNamed, pattern("ij+")), + Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })), + Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3))) + }, + {}, + {}, + {} + }); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); @@ -64,46 +69,51 @@ describe("extract_tokens", []() { Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))), }))); - AssertThat(lexical_grammar.variables, Equals(vector({ + AssertThat(lexical_grammar.variables, Equals(vector({ // Strings become anonymous rules. - Variable("ab", VariableTypeAnonymous, str("ab")), + LexicalVariable("ab", VariableTypeAnonymous, str("ab"), true), // Patterns become hidden rules. - Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")), + LexicalVariable("/cd*/", VariableTypeAuxiliary, pattern("cd*"), false), // Rules marked as tokens become hidden rules. - Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({ + LexicalVariable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({ str("ef"), str("gh") - }))), + })), false), // This named rule was moved wholesale to the lexical grammar. - Variable("rule_B", VariableTypeNamed, pattern("ij+")), + LexicalVariable("rule_B", VariableTypeNamed, pattern("ij+"), false), // Strings become anonymous rules. 
- Variable("kl", VariableTypeAnonymous, str("kl")), + LexicalVariable("kl", VariableTypeAnonymous, str("kl"), true), }))); }); it("does not create duplicate tokens in the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, seq({ - str("ab"), - i_sym(0), - str("ab"), - })), - }, {}, {}, {}}); + auto result = extract_tokens(InternedGrammar { + { + Variable("rule_A", VariableTypeNamed, seq({ + str("ab"), + i_sym(0), + str("ab"), + })), + }, + {}, + {}, + {} + }); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); - AssertThat(syntax_grammar.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })), - }))); + AssertThat(syntax_grammar.variables, Equals(vector { + Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })}, + })); - AssertThat(lexical_grammar.variables, Equals(vector({ - Variable("ab", VariableTypeAnonymous, str("ab")), - }))) + AssertThat(lexical_grammar.variables, Equals(vector { + LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true}, + })) }); it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() { @@ -122,11 +132,11 @@ describe("extract_tokens", []() { Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })), }))); - AssertThat(lexical_grammar.variables, Equals(vector({ - Variable("ab", VariableTypeAnonymous, str("ab")), - Variable("cd", VariableTypeAnonymous, str("cd")), - Variable("ef", VariableTypeAnonymous, str("ef")), - }))); + AssertThat(lexical_grammar.variables, Equals(vector { + LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true}, + LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true}, + LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true}, + })); }); it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() { diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc index 0b010d2e..968d59ba 100644 --- a/spec/helpers/rule_helpers.cc +++ b/spec/helpers/rule_helpers.cc @@ -1,6 +1,8 @@ #include "rule_helpers.h" #include #include "compiler/rules/symbol.h" +#include "compiler/variable.h" +#include "compiler/lexical_grammar.h" namespace tree_sitter { using std::make_shared; @@ -52,4 +54,9 @@ namespace tree_sitter { return left.name == right.name && left.rule->operator==(*right.rule) && left.type == right.type; } + + bool operator==(const LexicalVariable &left, const LexicalVariable &right) { + return left.name == right.name && left.rule->operator==(*right.rule) && + left.type == right.type && left.is_string == right.is_string; + } } diff --git a/spec/helpers/rule_helpers.h b/spec/helpers/rule_helpers.h index a985d294..8ebe87e8 100644 --- a/spec/helpers/rule_helpers.h +++ b/spec/helpers/rule_helpers.h @@ -15,7 +15,11 @@ namespace tree_sitter { rule_ptr i_token(size_t index); rule_ptr active_prec(int precedence, rule_ptr); + struct Variable; + struct LexicalVariable; + bool operator==(const Variable &left, const Variable &right); + bool operator==(const LexicalVariable &left, const LexicalVariable &right); } #endif // HELPERS_RULE_HELPERS_H_ diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index a4b275ea..5ef2898c 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -3,6 +3,7 @@ #include "tree_sitter/compiler.h" #include "compiler/parse_table.h" #include 
"compiler/syntax_grammar.h" +#include "compiler/lexical_grammar.h" #include "compiler/build_tables/parse_item.h" #include "compiler/build_tables/lex_item.h" @@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const SyntaxVariable &variable) { return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}"); } +ostream &operator<<(ostream &stream, const LexicalVariable &variable) { + return stream << "{" << variable.name << ", " << variable.rule << ", " << + to_string(variable.type) << ", " << to_string(variable.is_string) << "}"; +} + std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) { return stream << string("#"; } diff --git a/spec/helpers/stream_methods.h b/spec/helpers/stream_methods.h index 28b201c3..1b6a794b 100644 --- a/spec/helpers/stream_methods.h +++ b/spec/helpers/stream_methods.h @@ -93,6 +93,7 @@ using std::string; using std::to_string; struct Variable; struct SyntaxVariable; +struct LexicalVariable; struct AdvanceAction; struct AcceptTokenAction; class ParseAction; @@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &); ostream &operator<<(ostream &, const rule_ptr &); ostream &operator<<(ostream &, const Variable &); ostream &operator<<(ostream &, const SyntaxVariable &); +ostream &operator<<(ostream &, const LexicalVariable &); ostream &operator<<(ostream &, const AdvanceAction &); ostream &operator<<(ostream &, const AcceptTokenAction &); ostream &operator<<(ostream &, const ParseAction &); diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc index e0dc4a8b..7102fc29 100644 --- a/src/compiler/build_tables/build_lex_table.cc +++ b/src/compiler/build_tables/build_lex_table.cc @@ -99,7 +99,8 @@ class LexTableBuilder { LexItem::CompletionStatus completion_status = item.completion_status(); if (completion_status.is_done) { AcceptTokenAction action(item.lhs, completion_status.precedence.max, - completion_status.is_string); + item.lhs.is_built_in() || + lex_grammar.variables[item.lhs.index].is_string); auto current_action = lex_table.state(state_id).accept_action; if (conflict_manager.resolve(action, current_action)) diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index ce721119..4798108e 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -72,10 +72,11 @@ class ParseTableBuilder { })); CompileError error = process_part_state_queue(); - if (error.type != TSCompileErrorTypeNone) + if (error.type != TSCompileErrorTypeNone) { return { parse_table, error }; + } - parse_table.mergeable_symbols = compatible_tokens.recovery_tokens; + update_unmergable_token_pairs(); build_error_parse_state(); @@ -111,7 +112,7 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (const Symbol symbol : parse_table.mergeable_symbols) { + for (const Symbol symbol : compatible_tokens.recovery_tokens) { add_out_of_context_parse_state(&error_state, symbol); } @@ -292,6 +293,25 @@ class ParseTableBuilder { } } + void update_unmergable_token_pairs() { + for (const ParseState &state : parse_table.states) { + for (Symbol::Index token_index = 0, token_count = lexical_grammar.variables.size(); token_index < token_count; token_index++) { + Symbol token(token_index, Symbol::Terminal); + if (state.terminal_entries.count(token)) { + auto &incompatible_token_indices = 
+          auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[token_index];
+          auto iter = incompatible_token_indices.begin();
+          while (iter != incompatible_token_indices.end()) {
+            if (state.terminal_entries.count(Symbol(*iter, Symbol::Terminal))) {
+              iter = incompatible_token_indices.erase(iter);
+            } else {
+              ++iter;
+            }
+          }
+        }
+      }
+    }
+  }
+
   void remove_duplicate_parse_states() {
     map<size_t, vector<ParseStateId>> state_indices_by_signature;
@@ -382,11 +402,19 @@ class ParseTableBuilder {
     for (auto &entry : state.terminal_entries) {
       Symbol lookahead = entry.first;
       const vector<ParseAction> &actions = entry.second.actions;
+      auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];

       const auto &other_entry = other.terminal_entries.find(lookahead);
       if (other_entry == other.terminal_entries.end()) {
-        if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
-          return false;
+        if (!lookahead.is_built_in()) {
+          if (!compatible_tokens.recovery_tokens.count(lookahead))
+            return false;
+          for (Symbol::Index incompatible_index : incompatible_token_indices) {
+            if (other.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
+              return false;
+            }
+          }
+        }
         if (actions.back().type != ParseActionTypeReduce)
           return false;
         if (!has_entry(other, entry.second))
@@ -401,10 +429,18 @@ class ParseTableBuilder {
     for (auto &entry : other.terminal_entries) {
       Symbol lookahead = entry.first;
       const vector<ParseAction> &actions = entry.second.actions;
+      auto &incompatible_token_indices = compatible_tokens.unmergeable_pairs[lookahead.index];

       if (!state.terminal_entries.count(lookahead)) {
-        if (compatible_tokens.recovery_tokens.count(lookahead) == 0 && !lookahead.is_built_in())
-          return false;
+        if (!lookahead.is_built_in()) {
+          if (!compatible_tokens.recovery_tokens.count(lookahead))
+            return false;
+          for (Symbol::Index incompatible_index : incompatible_token_indices) {
+            if (state.terminal_entries.count(Symbol(incompatible_index, Symbol::Terminal))) {
+              return false;
+            }
+          }
+        }
         if (actions.back().type != ParseActionTypeReduce)
           return false;
         if (!has_entry(state, entry.second))
@@ -629,7 +665,7 @@ class ParseTableBuilder {

     switch (symbol.type) {
       case Symbol::Terminal: {
-        const Variable &variable = lexical_grammar.variables[symbol.index];
+        const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
        if (variable.type == VariableTypeNamed)
          return variable.name;
        else
diff --git a/src/compiler/build_tables/compatible_tokens.cc b/src/compiler/build_tables/compatible_tokens.cc
index 1f29b671..98099612 100644
--- a/src/compiler/build_tables/compatible_tokens.cc
+++ b/src/compiler/build_tables/compatible_tokens.cc
@@ -83,6 +83,7 @@ using FirstCharactersIntersector = CharacterIntersector<FirstCharacters>;

 CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar) {
   CompatibleTokensResult result;
+  result.unmergeable_pairs.resize(grammar.variables.size());

   AllCharacters all_separator_characters;
   for (const rule_ptr &separator : grammar.separators)
@@ -90,7 +91,8 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar)

   for (size_t i = 0; i < grammar.variables.size(); i++) {
     Symbol symbol(i, Symbol::Terminal);
-    rule_ptr rule = grammar.variables[i].rule;
+    const LexicalVariable &variable = grammar.variables[i];
+    rule_ptr rule = variable.rule;

     FirstCharacters first_characters;
     first_characters.apply(rule);
@@ -109,18 +111,20 @@ CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &grammar)
       !last_characters.result.includes_all &&
       !last_characters.result.intersects(all_separator_characters.result);

-    bool has_no_separators =
-      !all_characters.result.intersects(all_separator_characters.result);
+    bool has_separators =
+      all_characters.result.intersects(all_separator_characters.result);

-    if ((has_distinct_start && has_distinct_end) || has_no_separators)
+    if ((has_distinct_start && has_distinct_end) || !has_separators)
       result.recovery_tokens.insert(symbol);

-    for (size_t j = 0; j < grammar.variables.size(); j++) {
-      if (j == i) continue;
-      Symbol other_symbol(j, Symbol::Terminal);
-      FirstCharactersIntersector intersector(&first_characters.result);
-      if (intersector.apply(grammar.variables[j].rule)) {
-        result.unmergeable_pairs[symbol].insert(other_symbol);
+    for (size_t j = 0; j < i; j++) {
+      const LexicalVariable &other_variable = grammar.variables[j];
+      if (has_separators) {
+        FirstCharactersIntersector intersector(&first_characters.result);
+        if (intersector.apply(other_variable.rule)) {
+          result.unmergeable_pairs[i].insert(j);
+          result.unmergeable_pairs[j].insert(i);
+        }
       }
     }
   }
diff --git a/src/compiler/build_tables/compatible_tokens.h b/src/compiler/build_tables/compatible_tokens.h
index 5c15358f..f8061902 100644
--- a/src/compiler/build_tables/compatible_tokens.h
+++ b/src/compiler/build_tables/compatible_tokens.h
@@ -3,8 +3,9 @@

 #include "compiler/rule.h"
 #include "compiler/rules/symbol.h"
-#include <map>
+#include <cstdint>
 #include <set>
+#include <vector>

 namespace tree_sitter {
@@ -14,7 +15,7 @@ namespace build_tables {

 struct CompatibleTokensResult {
   std::set<rules::Symbol> recovery_tokens;
-  std::map<rules::Symbol, std::set<rules::Symbol>> unmergeable_pairs;
+  std::vector<std::set<rules::Symbol::Index>> unmergeable_pairs;
 };

 CompatibleTokensResult get_compatible_tokens(const LexicalGrammar &);
diff --git a/src/compiler/build_tables/lex_item.cc b/src/compiler/build_tables/lex_item.cc
index 152b2469..4c9056df 100644
--- a/src/compiler/build_tables/lex_item.cc
+++ b/src/compiler/build_tables/lex_item.cc
@@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const {
     CompletionStatus apply_to(const rules::Choice *rule) {
       for (const auto &element : rule->elements) {
         CompletionStatus status = apply(element);
-        if (status.is_done)
-          return status;
+        if (status.is_done) return status;
       }
-      return { false, PrecedenceRange(), false };
+      return { false, PrecedenceRange() };
     }

     CompletionStatus apply_to(const rules::Metadata *rule) {
       CompletionStatus result = apply(rule->rule);
-      if (result.is_done) {
-        if (result.precedence.empty && rule->params.has_precedence)
-          result.precedence.add(rule->params.precedence);
-        if (rule->params.is_string)
-          result.is_string = true;
+      if (result.is_done && result.precedence.empty && rule->params.has_precedence) {
+        result.precedence.add(rule->params.precedence);
       }
       return result;
     }
@@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const {
     }

     CompletionStatus apply_to(const rules::Blank *rule) {
-      return { true, PrecedenceRange(), false };
+      return { true, PrecedenceRange() };
     }

     CompletionStatus apply_to(const rules::Seq *rule) {
       CompletionStatus left_status = apply(rule->left);
-      if (left_status.is_done)
+      if (left_status.is_done) {
         return apply(rule->right);
-      else
-        return { false, PrecedenceRange(), false };
+      } else {
+        return { false, PrecedenceRange() };
+      }
     }
   };

@@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const {

 LexItemSet::TransitionMap LexItemSet::transitions() const {
   TransitionMap result;
-  for (const LexItem &item : entries)
+  for (const LexItem &item : entries) {
     lex_item_transitions(&result, item);
+  }
   return result;
 }
diff --git a/src/compiler/build_tables/lex_item.h b/src/compiler/build_tables/lex_item.h
index 4c45f80d..b6b07de7 100644
--- a/src/compiler/build_tables/lex_item.h
+++ b/src/compiler/build_tables/lex_item.h
@@ -19,7 +19,6 @@ class LexItem {
   struct CompletionStatus {
     bool is_done;
     PrecedenceRange precedence;
-    bool is_string;
   };

   bool operator==(const LexItem &other) const;
diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc
index bc84e557..d2fd1acf 100644
--- a/src/compiler/generate_code/c_code.cc
+++ b/src/compiler/generate_code/c_code.cc
@@ -561,7 +561,7 @@ class CCodeGenerator {
       return { variable.name, variable.type };
     }
     case Symbol::Terminal: {
-      const Variable &variable = lexical_grammar.variables[symbol.index];
+      const LexicalVariable &variable = lexical_grammar.variables[symbol.index];
       return { variable.name, variable.type };
     }
     case Symbol::External:
diff --git a/src/compiler/lexical_grammar.cc b/src/compiler/lexical_grammar.cc
new file mode 100644
index 00000000..3738ecb3
--- /dev/null
+++ b/src/compiler/lexical_grammar.cc
@@ -0,0 +1,11 @@
+#include "compiler/lexical_grammar.h"
+
+namespace tree_sitter {
+
+using std::string;
+
+LexicalVariable::LexicalVariable(
+  const string &name, VariableType type, const rule_ptr &rule, bool is_string)
+  : name(name), rule(rule), type(type), is_string(is_string) {}
+
+}  // namespace tree_sitter
diff --git a/src/compiler/lexical_grammar.h b/src/compiler/lexical_grammar.h
index 58aa54f3..90536ecd 100644
--- a/src/compiler/lexical_grammar.h
+++ b/src/compiler/lexical_grammar.h
@@ -9,8 +9,17 @@

 namespace tree_sitter {

+struct LexicalVariable {
+  LexicalVariable(const std::string &, VariableType, const rule_ptr &, bool);
+
+  std::string name;
+  rule_ptr rule;
+  VariableType type;
+  bool is_string;
+};
+
 struct LexicalGrammar {
-  std::vector<Variable> variables;
+  std::vector<LexicalVariable> variables;
   std::vector<rule_ptr> separators;
 };
diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h
index 615714b1..6f684b21 100644
--- a/src/compiler/parse_table.h
+++ b/src/compiler/parse_table.h
@@ -96,8 +96,6 @@ class ParseTable {

   std::vector<ParseState> states;
   std::map<rules::Symbol, SymbolMetadata> symbols;
-
-  std::set<rules::Symbol> mergeable_symbols;
 };

 }  // namespace tree_sitter
diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc
index b024e27c..ff268782 100644
--- a/src/compiler/prepare_grammar/expand_tokens.cc
+++ b/src/compiler/prepare_grammar/expand_tokens.cc
@@ -67,11 +67,11 @@ pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar)
   LexicalGrammar result;
   ExpandTokens expander;

-  for (const Variable &variable : grammar.variables) {
+  for (const LexicalVariable &variable : grammar.variables) {
     auto rule = expander.apply(variable.rule);
     if (expander.error.type)
       return { result, expander.error };
-    result.variables.push_back(Variable(variable.name, variable.type, rule));
+    result.variables.push_back({variable.name, variable.type, rule, variable.is_string});
   }

   for (auto &sep : grammar.separators) {
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index 9d161ca8..895d116c 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn {
 class TokenExtractor : public rules::IdentityRuleFn {
   using rules::IdentityRuleFn::apply_to;

-  rule_ptr apply_to_token(const Rule *input, VariableType entry_type) {
+  rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) {
     for (size_t i = 0; i < tokens.size(); i++)
       if (tokens[i].rule->operator==(*input)) {
         token_usage_counts[i]++;
@@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn {

     rule_ptr rule = input->copy();
     size_t index = tokens.size();
-    tokens.push_back(Variable(token_description(rule), entry_type, rule));
+    tokens.push_back({token_description(rule), entry_type, rule, is_string});
     token_usage_counts.push_back(1);
     return make_shared<Symbol>(index, Symbol::Terminal);
   }

   rule_ptr apply_to(const rules::String *rule) {
-    return apply_to_token(rule, VariableTypeAnonymous);
+    return apply_to_token(rule, VariableTypeAnonymous, true);
   }

   rule_ptr apply_to(const rules::Pattern *rule) {
-    return apply_to_token(rule, VariableTypeAuxiliary);
+    return apply_to_token(rule, VariableTypeAuxiliary, false);
   }

   rule_ptr apply_to(const rules::Metadata *rule) {
-    if (rule->params.is_token)
-      return apply_to_token(rule->rule.get(), VariableTypeAuxiliary);
-    else
+    if (rule->params.is_token) {
+      return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false);
+    } else {
       return rules::IdentityRuleFn::apply_to(rule);
+    }
   }

  public:
   vector<size_t> token_usage_counts;
-  vector<Variable> tokens;
+  vector<LexicalVariable> tokens;
 };

 static CompileError extra_token_error(const string &message) {
@@ -139,8 +140,9 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(

   for (const ConflictSet &conflict_set : grammar.expected_conflicts) {
     ConflictSet new_conflict_set;
-    for (const Symbol &symbol : conflict_set)
+    for (const Symbol &symbol : conflict_set) {
       new_conflict_set.insert(symbol_replacer.replace_symbol(symbol));
+    }
     syntax_grammar.expected_conflicts.insert(new_conflict_set);
   }

@@ -154,7 +156,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
   for (const rule_ptr &rule : grammar.extra_tokens) {
     int i = 0;
     bool used_elsewhere_in_grammar = false;
-    for (const Variable &variable : lexical_grammar.variables) {
+    for (const LexicalVariable &variable : lexical_grammar.variables) {
       if (variable.rule->operator==(*rule)) {
         syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal));
         used_elsewhere_in_grammar = true;
@@ -171,9 +173,10 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     }

     auto symbol = rule->as<Symbol>();
-    if (!symbol)
+    if (!symbol) {
       return make_tuple(syntax_grammar, lexical_grammar,
                         extra_token_error(rule->to_string()));
+    }

     Symbol new_symbol = symbol_replacer.replace_symbol(*symbol);
     if (new_symbol.is_non_terminal()) {
diff --git a/src/compiler/prepare_grammar/normalize_rules.cc b/src/compiler/prepare_grammar/normalize_rules.cc
index 0e1da9fd..2e4dd205 100644
--- a/src/compiler/prepare_grammar/normalize_rules.cc
+++ b/src/compiler/prepare_grammar/normalize_rules.cc
@@ -8,7 +8,7 @@ namespace prepare_grammar {

 LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) {
   LexicalGrammar result(input_grammar);

-  for (Variable &variable : result.variables) {
+  for (LexicalVariable &variable : result.variables) {
     variable.rule = rules::Choice::build(extract_choices(variable.rule));
   }
diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc
index aa3074e8..b27344f1 100644
--- a/src/compiler/syntax_grammar.cc
+++ b/src/compiler/syntax_grammar.cc
@@ -8,10 +8,8 @@ namespace tree_sitter {

 using std::string;
-using std::to_string;
 using std::pair;
 using std::vector;
-using std::set;

 SyntaxVariable::SyntaxVariable(const string &name, VariableType type,
                                const vector<Production> &productions)
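Taken together, the patch makes two data-model changes: lexical variables now carry their own is_string flag (instead of threading it through LexItem::CompletionStatus), and token incompatibilities are recorded symmetrically in a vector indexed by token index rather than a map keyed by Symbol. The standalone C++ sketch below restates that shape with simplified stand-in types (the rule collapsed to a string, symbols to plain indices); it is illustrative only, not part of the patch or the tree-sitter compiler's real headers.

    // Illustrative sketch only -- simplified stand-ins for the structures the
    // diff introduces, so the shape of the change can be compiled in isolation.
    #include <cassert>
    #include <cstddef>
    #include <set>
    #include <string>
    #include <vector>

    // Stand-in for the compiler's VariableType enum.
    enum VariableType { VariableTypeNamed, VariableTypeAnonymous, VariableTypeAuxiliary };

    // Simplified LexicalVariable: the patch stores the "is this token a
    // string?" flag here, where build_lex_table.cc can look it up by index.
    struct LexicalVariable {
      std::string name;
      VariableType type;
      std::string rule;  // placeholder for rule_ptr
      bool is_string;
    };

    // Simplified CompatibleTokensResult: unmergeable_pairs is a vector indexed
    // by token index, holding sets of indices, mirroring the new header.
    struct CompatibleTokensResult {
      std::vector<std::set<size_t>> unmergeable_pairs;

      // Record an incompatibility symmetrically, as the patched
      // get_compatible_tokens does with pairs[i].insert(j); pairs[j].insert(i).
      void mark_unmergeable(size_t i, size_t j) {
        unmergeable_pairs[i].insert(j);
        unmergeable_pairs[j].insert(i);
      }
    };

    int main() {
      std::vector<LexicalVariable> variables = {
        {"ab", VariableTypeAnonymous, "\"ab\"", true},   // string token
        {"/cd*/", VariableTypeAuxiliary, "cd*", false},  // pattern token
      };

      CompatibleTokensResult result;
      result.unmergeable_pairs.resize(variables.size());  // mirrors the resize() the patch adds
      result.mark_unmergeable(0, 1);

      assert(result.unmergeable_pairs[1].count(0) == 1);  // symmetry holds
      assert(variables[0].is_string && !variables[1].is_string);
      return 0;
    }

Storing the pairs by index keeps the lookup in ParseTableBuilder's merge check O(log n) per token without hashing Symbol objects, and the symmetric insert is what lets update_unmergable_token_pairs erase from either side as states prove tokens coexistent.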