diff --git a/project.gyp b/project.gyp index 2687a4cf..8871fc16 100644 --- a/project.gyp +++ b/project.gyp @@ -11,13 +11,12 @@ 'externals/json-parser', ], 'sources': [ - 'src/compiler/build_tables/build_lex_table.cc', 'src/compiler/build_tables/build_parse_table.cc', 'src/compiler/build_tables/build_tables.cc', - 'src/compiler/build_tables/recovery_tokens.cc', 'src/compiler/build_tables/lex_item.cc', 'src/compiler/build_tables/lex_item_transitions.cc', 'src/compiler/build_tables/lex_conflict_manager.cc', + 'src/compiler/build_tables/lex_table_builder.cc', 'src/compiler/build_tables/lookahead_set.cc', 'src/compiler/build_tables/parse_item.cc', 'src/compiler/build_tables/parse_item_set_builder.cc', @@ -41,7 +40,6 @@ 'src/compiler/prepare_grammar/token_description.cc', 'src/compiler/rule.cc', 'src/compiler/syntax_grammar.cc', - 'src/compiler/variable.cc', 'src/compiler/rules/blank.cc', 'src/compiler/rules/built_in_symbols.cc', 'src/compiler/rules/character_range.cc', diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc deleted file mode 100644 index f01d76cb..00000000 --- a/spec/compiler/build_tables/distinctive_tokens_spec.cc +++ /dev/null @@ -1,34 +0,0 @@ -#include "spec_helper.h" -#include "compiler/rules/character_set.h" -#include "compiler/build_tables/recovery_tokens.h" -#include "compiler/lexical_grammar.h" -#include "helpers/rule_helpers.h" -#include "helpers/stream_methods.h" -#include "compiler/rules.h" - -using namespace rules; -using namespace build_tables; - -START_TEST - -describe("recovery_tokens(rule)", []() { - it("includes rules that can only begin and end with an explicit set of characters", [&]() { - LexicalGrammar grammar; - grammar.separators = { - character({ ' ' }), - }; - - grammar.variables = { - Variable("var0", VariableTypeNamed, character({}, false)), - Variable("var1", VariableTypeNamed, seq({ - character({ 'a', 'b' }), - character({}, false), - character({ 'c', 'd' }), 
- })), - }; - - AssertThat(recovery_tokens(grammar), Equals>({ Symbol(1, Symbol::Terminal) })); - }); -}); - -END_TEST diff --git a/spec/compiler/build_tables/lex_conflict_manager_spec.cc b/spec/compiler/build_tables/lex_conflict_manager_spec.cc index 3aa75a4c..f7382a74 100644 --- a/spec/compiler/build_tables/lex_conflict_manager_spec.cc +++ b/spec/compiler/build_tables/lex_conflict_manager_spec.cc @@ -20,6 +20,10 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { Symbol sym4(3, Symbol::Terminal); LexItemSet item_set({ LexItem(sym4, blank() )}); + before_each([&]() { + conflict_manager = LexConflictManager(); + }); + it("favors advance actions over empty accept token actions", [&]() { update = conflict_manager.resolve(item_set, AdvanceAction(2, {0, 0}, true), AcceptTokenAction()); AssertThat(update, IsTrue()); @@ -65,6 +69,7 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { describe("advance/accept-token conflicts", [&]() { describe("when the token to accept has higher precedence", [&]() { it("prefers the accept-token action", [&]() { + AssertThat(conflict_manager.possible_extensions, IsEmpty()); update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 3, true)); AssertThat(update, IsFalse()); AssertThat(conflict_manager.possible_extensions, IsEmpty()); @@ -72,13 +77,9 @@ describe("LexConflictManager::resolve(new_action, old_action)", []() { }); describe("when the token to accept does not have a higher precedence", [&]() { - it("favors the advance action", [&]() { + it("favors the advance action and adds the in-progress tokens as possible extensions of the discarded token", [&]() { update = conflict_manager.resolve(item_set, AdvanceAction(1, { 1, 2 }, true), AcceptTokenAction(sym3, 2, true)); AssertThat(update, IsTrue()); - }); - - it("adds the in-progress tokens as possible extensions of the discarded token", [&]() { - conflict_manager.resolve(item_set, AdvanceAction(1, { 
1, 2 }, true), AcceptTokenAction(sym3, 3, true)); AssertThat(conflict_manager.possible_extensions[sym3.index], Contains(sym4.index)); }); }); diff --git a/spec/compiler/build_tables/lex_item_spec.cc b/spec/compiler/build_tables/lex_item_spec.cc index 7042922f..27ceef69 100644 --- a/spec/compiler/build_tables/lex_item_spec.cc +++ b/spec/compiler/build_tables/lex_item_spec.cc @@ -13,11 +13,10 @@ START_TEST describe("LexItem", []() { describe("completion_status()", [&]() { - it("indicates whether the item is done, its precedence, and whether it is a string", [&]() { + it("indicates whether the item is done and its precedence", [&]() { LexItem item1(Symbol(0, Symbol::Terminal), character({ 'a', 'b', 'c' })); AssertThat(item1.completion_status().is_done, IsFalse()); AssertThat(item1.completion_status().precedence, Equals(PrecedenceRange())); - AssertThat(item1.completion_status().is_string, IsFalse()); MetadataParams params; params.precedence = 3; @@ -30,12 +29,10 @@ describe("LexItem", []() { AssertThat(item2.completion_status().is_done, IsTrue()); AssertThat(item2.completion_status().precedence, Equals(PrecedenceRange(3))); - AssertThat(item2.completion_status().is_string, IsTrue()); LexItem item3(Symbol(0, Symbol::Terminal), repeat(character({ ' ', '\t' }))); AssertThat(item3.completion_status().is_done, IsTrue()); AssertThat(item3.completion_status().precedence, Equals(PrecedenceRange())); - AssertThat(item3.completion_status().is_string, IsFalse()); }); }); }); diff --git a/spec/compiler/build_tables/parse_item_set_builder_spec.cc b/spec/compiler/build_tables/parse_item_set_builder_spec.cc index 6548f37a..9493d280 100644 --- a/spec/compiler/build_tables/parse_item_set_builder_spec.cc +++ b/spec/compiler/build_tables/parse_item_set_builder_spec.cc @@ -12,12 +12,13 @@ using namespace rules; START_TEST describe("ParseItemSetBuilder", []() { - vector lexical_variables; + vector lexical_variables; for (size_t i = 0; i < 20; i++) { - lexical_variables.push_back(Variable{ 
+ lexical_variables.push_back({ "token_" + to_string(i), VariableTypeNamed, blank(), + false }); } @@ -25,13 +26,13 @@ describe("ParseItemSetBuilder", []() { it("adds items at the beginnings of referenced rules", [&]() { SyntaxGrammar grammar{{ - SyntaxVariable("rule0", VariableTypeNamed, { + SyntaxVariable{"rule0", VariableTypeNamed, { Production({ {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), - }), - SyntaxVariable("rule1", VariableTypeNamed, { + }}, + SyntaxVariable{"rule1", VariableTypeNamed, { Production({ {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, @@ -39,13 +40,13 @@ describe("ParseItemSetBuilder", []() { Production({ {Symbol(2, Symbol::NonTerminal), 0, AssociativityNone}, }) - }), - SyntaxVariable("rule2", VariableTypeNamed, { + }}, + SyntaxVariable{"rule2", VariableTypeNamed, { Production({ {Symbol(14, Symbol::Terminal), 0, AssociativityNone}, {Symbol(15, Symbol::Terminal), 0, AssociativityNone}, }) - }), + }}, }, {}, {}, {}}; auto production = [&](int variable_index, int production_index) -> const Production & { @@ -84,19 +85,19 @@ describe("ParseItemSetBuilder", []() { it("handles rules with empty productions", [&]() { SyntaxGrammar grammar{{ - SyntaxVariable("rule0", VariableTypeNamed, { + SyntaxVariable{"rule0", VariableTypeNamed, { Production({ {Symbol(1, Symbol::NonTerminal), 0, AssociativityNone}, {Symbol(11, Symbol::Terminal), 0, AssociativityNone}, }), - }), - SyntaxVariable("rule1", VariableTypeNamed, { + }}, + SyntaxVariable{"rule1", VariableTypeNamed, { Production({ {Symbol(12, Symbol::Terminal), 0, AssociativityNone}, {Symbol(13, Symbol::Terminal), 0, AssociativityNone}, }), Production({}) - }), + }}, }, {}, {}, {}}; auto production = [&](int variable_index, int production_index) -> const Production & { diff --git a/spec/compiler/prepare_grammar/expand_repeats_spec.cc 
b/spec/compiler/prepare_grammar/expand_repeats_spec.cc index d8c93a41..d15f630f 100644 --- a/spec/compiler/prepare_grammar/expand_repeats_spec.cc +++ b/spec/compiler/prepare_grammar/expand_repeats_spec.cc @@ -2,6 +2,7 @@ #include "compiler/prepare_grammar/initial_syntax_grammar.h" #include "compiler/prepare_grammar/expand_repeats.h" #include "helpers/rule_helpers.h" +#include "helpers/stream_methods.h" START_TEST @@ -11,141 +12,159 @@ using prepare_grammar::expand_repeats; describe("expand_repeats", []() { it("replaces repeat rules with pairs of recursive rules", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, repeat1(i_token(0))), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, repeat1(i_token(0))}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, i_sym(1)), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, i_sym(1)}, + Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(1), i_token(0) }), i_token(0), - })), - }))); + })}, + })); }); it("replaces repeats inside of sequences", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, seq({ - i_token(10), - repeat1(i_token(11)), - })), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, seq({ + i_token(10), + repeat1(i_token(11)), + })}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, seq({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, seq({ i_token(10), i_sym(1), - })), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(1), i_token(11) }), 
i_token(11) - })), - }))); + })}, + })); }); it("replaces repeats inside of choices", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, choice({ - i_token(10), - repeat1(i_token(11)) - })), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, choice({ + i_token(10), + repeat1(i_token(11)) + })}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, choice({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, choice({ i_token(10), i_sym(1), - })), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(1), i_token(11) }), i_token(11), - })), - }))); + })}, + })); }); it("does not create redundant auxiliary rules", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, choice({ - seq({ i_token(1), repeat1(i_token(4)) }), - seq({ i_token(2), repeat1(i_token(4)) }), - })), - Variable("rule1", VariableTypeNamed, seq({ - i_token(3), - repeat1(i_token(4)) - })), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, choice({ + seq({ i_token(1), repeat1(i_token(4)) }), + seq({ i_token(2), repeat1(i_token(4)) }), + })}, + Variable{"rule1", VariableTypeNamed, seq({ + i_token(3), + repeat1(i_token(4)) + })}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, choice({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, choice({ seq({ i_token(1), i_sym(2) }), seq({ i_token(2), i_sym(2) }), - })), - Variable("rule1", VariableTypeNamed, seq({ + })}, + Variable{"rule1", VariableTypeNamed, seq({ i_token(3), i_sym(2), - })), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule0_repeat1", 
VariableTypeAuxiliary, choice({ seq({ i_sym(2), i_token(4) }), i_token(4), - })), - }))); + })}, + })); }); it("can replace multiple repeats in the same rule", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, seq({ - repeat1(i_token(10)), - repeat1(i_token(11)), - })), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, seq({ + repeat1(i_token(10)), + repeat1(i_token(11)), + })}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, seq({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, seq({ i_sym(1), i_sym(2), - })), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(1), i_token(10) }), i_token(10), - })), - Variable("rule0_repeat2", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule0_repeat2", VariableTypeAuxiliary, choice({ seq({ i_sym(2), i_token(11) }), i_token(11), - })), - }))); + })}, + })); }); it("can replace repeats in multiple rules", [&]() { - InitialSyntaxGrammar grammar{{ - Variable("rule0", VariableTypeNamed, repeat1(i_token(10))), - Variable("rule1", VariableTypeNamed, repeat1(i_token(11))), - }, {}, {}, {}}; + InitialSyntaxGrammar grammar{ + { + Variable{"rule0", VariableTypeNamed, repeat1(i_token(10))}, + Variable{"rule1", VariableTypeNamed, repeat1(i_token(11))}, + }, + {}, {}, {} + }; auto result = expand_repeats(grammar); - AssertThat(result.variables, Equals(vector({ - Variable("rule0", VariableTypeNamed, i_sym(2)), - Variable("rule1", VariableTypeNamed, i_sym(3)), - Variable("rule0_repeat1", VariableTypeAuxiliary, choice({ + AssertThat(result.variables, Equals(vector{ + Variable{"rule0", VariableTypeNamed, i_sym(2)}, + Variable{"rule1", VariableTypeNamed, i_sym(3)}, + Variable{"rule0_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(2), 
i_token(10) }), i_token(10), - })), - Variable("rule1_repeat1", VariableTypeAuxiliary, choice({ + })}, + Variable{"rule1_repeat1", VariableTypeAuxiliary, choice({ seq({ i_sym(3), i_token(11) }), i_token(11), - })), - }))); + })}, + })); }); }); diff --git a/spec/compiler/prepare_grammar/expand_tokens_spec.cc b/spec/compiler/prepare_grammar/expand_tokens_spec.cc index 0aa83b3a..fdfea02c 100644 --- a/spec/compiler/prepare_grammar/expand_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/expand_tokens_spec.cc @@ -15,89 +15,149 @@ describe("expand_tokens", []() { describe("string rules", [&]() { it("replaces strings with sequences of character sets", [&]() { - LexicalGrammar grammar{{ - Variable("rule_A", VariableTypeNamed, seq({ - i_sym(10), - str("xyz"), - i_sym(11), - })), - }, {}}; + LexicalGrammar grammar{ + { + LexicalVariable{ + "rule_A", + VariableTypeNamed, + seq({ + i_sym(10), + str("xyz"), + i_sym(11), + }), + false + } + }, + {} + }; auto result = expand_tokens(grammar); AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, seq({ - i_sym(10), - metadata(seq({ - character({ 'x' }), - character({ 'y' }), - character({ 'z' }), - }), string_token_params), - i_sym(11), - })), - }))); + AssertThat(result.first.variables, Equals(vector{ + LexicalVariable{ + "rule_A", + VariableTypeNamed, + seq({ + i_sym(10), + metadata(seq({ + character({ 'x' }), + character({ 'y' }), + character({ 'z' }), + }), string_token_params), + i_sym(11), + }), + false + } + })); }); it("handles strings containing non-ASCII UTF8 characters", [&]() { - LexicalGrammar grammar{{ - Variable("rule_A", VariableTypeNamed, str("\u03B1 \u03B2")), - }, {}}; + LexicalGrammar grammar{ + { + LexicalVariable{ + "rule_A", + VariableTypeNamed, + str("\u03B1 \u03B2"), + false + }, + }, + {} + }; auto result = expand_tokens(grammar); - AssertThat(result.first.variables, Equals(vector({ - Variable("rule_A", 
VariableTypeNamed, metadata(seq({ - character({ 945 }), - character({ ' ' }), - character({ 946 }), - }), string_token_params)), - }))); + AssertThat(result.first.variables, Equals(vector{ + LexicalVariable{ + "rule_A", + VariableTypeNamed, + metadata(seq({ + character({ 945 }), + character({ ' ' }), + character({ 946 }), + }), string_token_params), + false + } + })); }); }); describe("regexp rules", [&]() { it("replaces regexps with the equivalent rule tree", [&]() { - LexicalGrammar grammar{{ - Variable("rule_A", VariableTypeNamed, seq({ - i_sym(10), - pattern("x*"), - i_sym(11), - })), - }, {}}; + LexicalGrammar grammar{ + { + LexicalVariable{ + "rule_A", + VariableTypeNamed, + seq({ + i_sym(10), + pattern("x*"), + i_sym(11), + }), + false + } + }, + {} + }; auto result = expand_tokens(grammar); AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, seq({ - i_sym(10), - repeat(character({ 'x' })), - i_sym(11), - })), - }))); + AssertThat(result.first.variables, Equals(vector{ + LexicalVariable{ + "rule_A", + VariableTypeNamed, + seq({ + i_sym(10), + repeat(character({ 'x' })), + i_sym(11), + }), + false + } + })); }); it("handles regexps containing non-ASCII UTF8 characters", [&]() { - LexicalGrammar grammar{{ - Variable("rule_A", VariableTypeNamed, pattern("[^\u03B1-\u03B4]*")), - }, {}}; + LexicalGrammar grammar{ + { + LexicalVariable{ + "rule_A", + VariableTypeNamed, + pattern("[^\u03B1-\u03B4]*"), + false + } + }, + {} + }; auto result = expand_tokens(grammar); - AssertThat(result.first.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, repeat(character({ 945, 946, 947, 948 }, false))), - }))); + AssertThat(result.first.variables, Equals(vector{ + LexicalVariable{ + "rule_A", + VariableTypeNamed, + repeat(character({ 945, 946, 947, 948 }, false)), + false + } + })); }); it("returns an error when the grammar contains an invalid regex", [&]() { - 
LexicalGrammar grammar{{ - Variable("rule_A", VariableTypeNamed, seq({ - pattern("("), - str("xyz"), - pattern("["), - })) - }, {}}; + LexicalGrammar grammar{ + { + LexicalVariable{ + "rule_A", + VariableTypeNamed, + seq({ + pattern("("), + str("xyz"), + pattern("["), + }), + false + }, + }, + {} + }; auto result = expand_tokens(grammar); diff --git a/spec/compiler/prepare_grammar/extract_tokens_spec.cc b/spec/compiler/prepare_grammar/extract_tokens_spec.cc index 3aa576df..ea9dd415 100644 --- a/spec/compiler/prepare_grammar/extract_tokens_spec.cc +++ b/spec/compiler/prepare_grammar/extract_tokens_spec.cc @@ -16,20 +16,25 @@ using prepare_grammar::InitialSyntaxGrammar; describe("extract_tokens", []() { it("moves strings, patterns, and sub-rules marked as tokens into the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, repeat1(seq({ - str("ab"), - pattern("cd*"), - choice({ - i_sym(1), - i_sym(2), - token(repeat1(choice({ str("ef"), str("gh") }))), - }), - }))), - Variable("rule_B", VariableTypeNamed, pattern("ij+")), - Variable("rule_C", VariableTypeNamed, choice({ str("kl"), blank() })), - Variable("rule_D", VariableTypeNamed, repeat1(i_sym(3))) - }, {}, {}, {}}); + auto result = extract_tokens(InternedGrammar{ + { + Variable{"rule_A", VariableTypeNamed, repeat1(seq({ + str("ab"), + pattern("cd*"), + choice({ + i_sym(1), + i_sym(2), + token(repeat1(choice({ str("ef"), str("gh") }))), + }), + }))}, + Variable{"rule_B", VariableTypeNamed, pattern("ij+")}, + Variable{"rule_C", VariableTypeNamed, choice({ str("kl"), blank() })}, + Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(3))}, + }, + {}, + {}, + {} + }); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); @@ -37,8 +42,8 @@ describe("extract_tokens", []() { AssertThat(error, Equals(CompileError::none())); - AssertThat(syntax_grammar.variables, Equals(vector({ - Variable("rule_A", 
VariableTypeNamed, repeat1(seq({ + AssertThat(syntax_grammar.variables, Equals(vector{ + Variable{"rule_A", VariableTypeNamed, repeat1(seq({ // This string is now the first token in the lexical grammar. i_token(0), @@ -58,83 +63,88 @@ describe("extract_tokens", []() { // This token rule is now the third rule in the lexical grammar. i_token(2), }), - }))), + }))}, - Variable("rule_C", VariableTypeNamed, choice({ i_token(4), blank() })), - Variable("rule_D", VariableTypeNamed, repeat1(i_sym(2))), - }))); + Variable{"rule_C", VariableTypeNamed, choice({ i_token(4), blank() })}, + Variable{"rule_D", VariableTypeNamed, repeat1(i_sym(2))}, + })); - AssertThat(lexical_grammar.variables, Equals(vector({ + AssertThat(lexical_grammar.variables, Equals(vector({ // Strings become anonymous rules. - Variable("ab", VariableTypeAnonymous, str("ab")), + LexicalVariable{"ab", VariableTypeAnonymous, str("ab"), true}, // Patterns become hidden rules. - Variable("/cd*/", VariableTypeAuxiliary, pattern("cd*")), + LexicalVariable{"/cd*/", VariableTypeAuxiliary, pattern("cd*"), false}, // Rules marked as tokens become hidden rules. - Variable("/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({ + LexicalVariable{"/(ef|gh)*/", VariableTypeAuxiliary, repeat1(choice({ str("ef"), str("gh") - }))), + })), false}, // This named rule was moved wholesale to the lexical grammar. - Variable("rule_B", VariableTypeNamed, pattern("ij+")), + LexicalVariable{"rule_B", VariableTypeNamed, pattern("ij+"), false}, // Strings become anonymous rules. 
- Variable("kl", VariableTypeAnonymous, str("kl")), + LexicalVariable{"kl", VariableTypeAnonymous, str("kl"), true}, }))); }); it("does not create duplicate tokens in the lexical grammar", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, seq({ - str("ab"), - i_sym(0), - str("ab"), - })), - }, {}, {}, {}}); + auto result = extract_tokens(InternedGrammar{ + { + Variable{"rule_A", VariableTypeNamed, seq({ + str("ab"), + i_sym(0), + str("ab"), + })}, + }, + {}, + {}, + {} + }); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); - AssertThat(syntax_grammar.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })), - }))); + AssertThat(syntax_grammar.variables, Equals(vector { + Variable {"rule_A", VariableTypeNamed, seq({ i_token(0), i_sym(0), i_token(0) })}, + })); - AssertThat(lexical_grammar.variables, Equals(vector({ - Variable("ab", VariableTypeAnonymous, str("ab")), - }))) + AssertThat(lexical_grammar.variables, Equals(vector { + LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true}, + })) }); it("does not move entire rules into the lexical grammar if their content is used elsewhere in the grammar", [&]() { auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })), - Variable("rule_B", VariableTypeNamed, str("cd")), - Variable("rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })), + Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), str("ab") })}, + Variable{"rule_B", VariableTypeNamed, str("cd")}, + Variable{"rule_C", VariableTypeNamed, seq({ str("ef"), str("cd") })}, }, {}, {}, {}}); InitialSyntaxGrammar &syntax_grammar = get<0>(result); LexicalGrammar &lexical_grammar = get<1>(result); AssertThat(syntax_grammar.variables, Equals(vector({ - Variable("rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })), - Variable("rule_B", 
VariableTypeNamed, i_token(1)), - Variable("rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })), + Variable{"rule_A", VariableTypeNamed, seq({ i_sym(1), i_token(0) })}, + Variable{"rule_B", VariableTypeNamed, i_token(1)}, + Variable{"rule_C", VariableTypeNamed, seq({ i_token(2), i_token(1) })}, }))); - AssertThat(lexical_grammar.variables, Equals(vector({ - Variable("ab", VariableTypeAnonymous, str("ab")), - Variable("cd", VariableTypeAnonymous, str("cd")), - Variable("ef", VariableTypeAnonymous, str("ef")), - }))); + AssertThat(lexical_grammar.variables, Equals(vector { + LexicalVariable {"ab", VariableTypeAnonymous, str("ab"), true}, + LexicalVariable {"cd", VariableTypeAnonymous, str("cd"), true}, + LexicalVariable {"ef", VariableTypeAnonymous, str("ef"), true}, + })); }); it("renumbers the grammar's expected conflict symbols based on any moved rules", [&]() { auto result = extract_tokens(InternedGrammar{ { - Variable("rule_A", VariableTypeNamed, str("ok")), - Variable("rule_B", VariableTypeNamed, repeat(i_sym(0))), - Variable("rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))), + Variable{"rule_A", VariableTypeNamed, str("ok")}, + Variable{"rule_B", VariableTypeNamed, repeat(i_sym(0))}, + Variable{"rule_C", VariableTypeNamed, repeat(seq({ i_sym(0), i_sym(0) }))}, }, { str(" ") @@ -155,12 +165,17 @@ describe("extract_tokens", []() { describe("handling extra tokens", [&]() { it("adds inline extra tokens to the lexical grammar's separators", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, str("x")), - }, { - str("y"), - pattern("\\s+"), - }, {}, {}}); + auto result = extract_tokens(InternedGrammar{ + { + Variable{"rule_A", VariableTypeNamed, str("x")}, + }, + { + str("y"), + pattern("\\s+"), + }, + {}, + {} + }); AssertThat(get<2>(result), Equals(CompileError::none())); @@ -172,12 +187,17 @@ describe("extract_tokens", []() { }); it("handles inline extra tokens that match tokens in the 
grammar", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, str("x")), - Variable("rule_B", VariableTypeNamed, str("y")), - }, { - str("y"), - }, {}, {}}); + auto result = extract_tokens(InternedGrammar{ + { + Variable{"rule_A", VariableTypeNamed, str("x")}, + Variable{"rule_B", VariableTypeNamed, str("y")}, + }, + { + str("y"), + }, + {}, + {} + }); AssertThat(get<2>(result), Equals(CompileError::none())); AssertThat(get<1>(result).separators.size(), Equals(0)); @@ -185,13 +205,18 @@ describe("extract_tokens", []() { }); it("updates extra symbols according to the new symbol numbers", [&]() { - auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })), - Variable("rule_B", VariableTypeNamed, str("y")), - Variable("rule_C", VariableTypeNamed, str("z")), - }, { - i_sym(2), - }, {}, {}}); + auto result = extract_tokens(InternedGrammar{ + { + Variable{"rule_A", VariableTypeNamed, seq({ str("w"), str("x"), i_sym(1) })}, + Variable{"rule_B", VariableTypeNamed, str("y")}, + Variable{"rule_C", VariableTypeNamed, str("z")}, + }, + { + i_sym(2), + }, + {}, + {} + }); AssertThat(get<2>(result), Equals(CompileError::none())); @@ -204,8 +229,8 @@ describe("extract_tokens", []() { it("returns an error if any extra tokens are non-token symbols", [&]() { auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })), - Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })), + Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })}, + Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })}, }, { i_sym(1) }, {}, {}}); AssertThat(get<2>(result), !Equals(CompileError::none())); @@ -216,8 +241,8 @@ describe("extract_tokens", []() { it("returns an error if any extra tokens are non-token rules", [&]() { auto result = extract_tokens(InternedGrammar{{ - Variable("rule_A", VariableTypeNamed, 
str("x")), - Variable("rule_B", VariableTypeNamed, str("y")), + Variable{"rule_A", VariableTypeNamed, str("x")}, + Variable{"rule_B", VariableTypeNamed, str("y")}, }, { choice({ i_sym(1), blank() }) }, {}, {}}); AssertThat(get<2>(result), !Equals(CompileError::none())); @@ -231,8 +256,8 @@ describe("extract_tokens", []() { it("returns an error if an external token has the same name as a non-terminal rule", [&]() { auto result = extract_tokens(InternedGrammar{ { - Variable("rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })), - Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })), + Variable{"rule_A", VariableTypeNamed, seq({ str("x"), i_sym(1) })}, + Variable{"rule_B", VariableTypeNamed, seq({ str("y"), str("z") })}, }, {}, {}, diff --git a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc index 823da8e6..c93b6d52 100644 --- a/spec/compiler/prepare_grammar/flatten_grammar_spec.cc +++ b/spec/compiler/prepare_grammar/flatten_grammar_spec.cc @@ -12,7 +12,7 @@ using prepare_grammar::flatten_rule; describe("flatten_grammar", []() { it("associates each symbol with the precedence and associativity binding it to its successor", [&]() { - SyntaxVariable result = flatten_rule(Variable( + SyntaxVariable result = flatten_rule(Variable{ "test", VariableTypeNamed, seq({ @@ -30,7 +30,7 @@ describe("flatten_grammar", []() { })), i_sym(7), }) - )); + }); AssertThat(result.name, Equals("test")); AssertThat(result.type, Equals(VariableTypeNamed)); @@ -54,14 +54,14 @@ describe("flatten_grammar", []() { }); it("uses the last assigned precedence", [&]() { - SyntaxVariable result = flatten_rule(Variable( + SyntaxVariable result = flatten_rule(Variable{ "test1", VariableTypeNamed, prec_left(101, seq({ i_sym(1), i_sym(2), })) - )); + }); AssertThat(result.productions, Equals(vector({ Production({ @@ -70,13 +70,13 @@ describe("flatten_grammar", []() { }) }))) - result = flatten_rule(Variable( + result = 
flatten_rule(Variable{ "test2", VariableTypeNamed, prec_left(101, seq({ i_sym(1), })) - )); + }); AssertThat(result.productions, Equals(vector({ Production({ diff --git a/spec/compiler/prepare_grammar/intern_symbols_spec.cc b/spec/compiler/prepare_grammar/intern_symbols_spec.cc index 9142eab6..4682a716 100644 --- a/spec/compiler/prepare_grammar/intern_symbols_spec.cc +++ b/spec/compiler/prepare_grammar/intern_symbols_spec.cc @@ -15,27 +15,32 @@ using prepare_grammar::intern_symbols; describe("intern_symbols", []() { it("replaces named symbols with numerically-indexed symbols", [&]() { - Grammar grammar{{ - { "x", choice({ sym("y"), sym("_z") }) }, - { "y", sym("_z") }, - { "_z", str("stuff") } - }, {}, {}, {}}; + Grammar grammar{ + { + {"x", choice({ sym("y"), sym("_z") })}, + {"y", sym("_z")}, + {"_z", str("stuff")} + }, {}, {}, {} + }; auto result = intern_symbols(grammar); AssertThat(result.second, Equals(CompileError::none())); - AssertThat(result.first.variables, Equals(vector({ - Variable("x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })), - Variable("y", VariableTypeNamed, i_sym(2)), - Variable("_z", VariableTypeHidden, str("stuff")), - }))); + AssertThat(result.first.variables, Equals(vector{ + Variable{"x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })}, + Variable{"y", VariableTypeNamed, i_sym(2)}, + Variable{"_z", VariableTypeHidden, str("stuff")}, + })); }); describe("when there are symbols that reference undefined rules", [&]() { it("returns an error", []() { - Grammar grammar{{ - { "x", sym("y") }, - }, {}, {}, {}}; + Grammar grammar{ + { + {"x", sym("y")}, + }, + {}, {}, {} + }; auto result = intern_symbols(grammar); @@ -44,13 +49,17 @@ describe("intern_symbols", []() { }); it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() { - Grammar grammar{{ - { "x", choice({ sym("y"), sym("z") }) }, - { "y", sym("z") }, - { "z", str("stuff") } - }, { - sym("z") - }, {}, {}}; + Grammar grammar{ + { + {"x", choice({ 
sym("y"), sym("z") })}, + {"y", sym("z")}, + {"z", str("stuff")} + }, + { + sym("z") + }, + {}, {} + }; auto result = intern_symbols(grammar); @@ -60,29 +69,34 @@ describe("intern_symbols", []() { }); it("records any rule names that match external token names", [&]() { - Grammar grammar{{ - { "x", choice({ sym("y"), sym("z") }) }, - { "y", sym("z") }, - { "z", str("stuff") } - }, {}, {}, { - "w", - "z" - }}; + Grammar grammar{ + { + {"x", choice({ sym("y"), sym("z") })}, + {"y", sym("z")}, + {"z", str("stuff")}, + }, + {}, + {}, + { + "w", + "z" + } + }; auto result = intern_symbols(grammar); - AssertThat(result.first.external_tokens, Equals(vector({ - { + AssertThat(result.first.external_tokens, Equals(vector{ + ExternalToken{ "w", VariableTypeNamed, rules::NONE() }, - { + ExternalToken{ "z", VariableTypeNamed, Symbol(2, Symbol::NonTerminal) - } - }))) + }, + })) }); }); diff --git a/spec/fixtures/external_scanners/extra_external_tokens.c b/spec/fixtures/external_scanners/extra_external_tokens.c deleted file mode 100644 index 5c409639..00000000 --- a/spec/fixtures/external_scanners/extra_external_tokens.c +++ /dev/null @@ -1,42 +0,0 @@ -#include - -enum { - COMMENT, -}; - -void *tree_sitter_extra_external_tokens_external_scanner_create() { - return NULL; -} - -void tree_sitter_extra_external_tokens_external_scanner_reset(void *payload) { -} - -bool tree_sitter_extra_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} - -void tree_sitter_extra_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} - -bool tree_sitter_extra_external_tokens_external_scanner_scan( - void *payload, TSLexer *lexer, const bool *whitelist) { - - while (lexer->lookahead == ' ') { - lexer->advance(lexer, true); - } - - if (lexer->lookahead == '#') { - lexer->advance(lexer, false); - while (lexer->lookahead != '\n') { - lexer->advance(lexer, false); - } - - lexer->result_symbol = COMMENT; - return 
true; - } - - return false; -} - -void tree_sitter_extra_external_tokens_external_scanner_destroy(void *payload) { -} diff --git a/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt new file mode 100644 index 00000000..06a7bf0b --- /dev/null +++ b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/corpus.txt @@ -0,0 +1,32 @@ +================================================ +anonymous tokens defined with character classes +================================================ +1234 +--- + +(first_rule) + +================================================= +anonymous tokens defined with LF escape sequence +================================================= + + +--- + +(first_rule) + +================================================= +anonymous tokens defined with CR escape sequence +================================================= + +--- + +(first_rule) + +================================================ +anonymous tokens with quotes +================================================ +'hello' +--- + +(first_rule) diff --git a/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json new file mode 100644 index 00000000..d2613776 --- /dev/null +++ b/spec/fixtures/test_grammars/anonymous_tokens_with_escaped_chars/grammar.json @@ -0,0 +1,14 @@ +{ + "name": "anonymous_tokens_with_escaped_chars", + "rules": { + "first_rule": { + "type": "CHOICE", + "members": [ + {"type": "STRING", "value": "\n"}, + {"type": "STRING", "value": "\r"}, + {"type": "STRING", "value": "'hello'"}, + {"type": "PATTERN", "value": "\\d+"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_left/corpus.txt b/spec/fixtures/test_grammars/associativity_left/corpus.txt new file mode 100644 index 00000000..4ab8e0db --- /dev/null +++ 
b/spec/fixtures/test_grammars/associativity_left/corpus.txt @@ -0,0 +1,8 @@ +=================== +chained operations +=================== +x+y+z +--- +(expression (math_operation + (expression (math_operation (expression (identifier)) (expression (identifier)))) + (expression (identifier)))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_left/grammar.json b/spec/fixtures/test_grammars/associativity_left/grammar.json new file mode 100644 index 00000000..b1a25914 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_left/grammar.json @@ -0,0 +1,31 @@ +{ + "name": "associativity_left", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_missing/expected_error.txt b/spec/fixtures/test_grammars/associativity_missing/expected_error.txt new file mode 100644 index 00000000..f9cc955d --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_missing/expected_error.txt @@ -0,0 +1,13 @@ +Unresolved conflict for symbol sequence: + + expression '+' expression • '+' … + +Possible interpretations: + + 1: (math_operation expression '+' expression) • '+' … + 2: expression '+' (math_operation expression • '+' expression) + +Possible resolutions: + + 1: Specify a left or right associativity in `math_operation` + 2: Add a conflict for these rules: `math_operation` diff --git a/spec/fixtures/test_grammars/associativity_missing/grammar.json b/spec/fixtures/test_grammars/associativity_missing/grammar.json new file mode 100644 index 
00000000..e5bd9d83 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_missing/grammar.json @@ -0,0 +1,27 @@ +{ + "name": "associativity_missing", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/associativity_right/corpus.txt b/spec/fixtures/test_grammars/associativity_right/corpus.txt new file mode 100644 index 00000000..280bbc31 --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_right/corpus.txt @@ -0,0 +1,8 @@ +=================== +chained operations +=================== +x+y+z +--- +(expression (math_operation + (expression (identifier)) + (expression (math_operation (expression (identifier)) (expression (identifier)))))) diff --git a/spec/fixtures/test_grammars/associativity_right/grammar.json b/spec/fixtures/test_grammars/associativity_right/grammar.json new file mode 100644 index 00000000..80ce1ebb --- /dev/null +++ b/spec/fixtures/test_grammars/associativity_right/grammar.json @@ -0,0 +1,31 @@ +{ + "name": "associativity_right", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "math_operation"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "math_operation": { + "type": "PREC_RIGHT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git 
a/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt b/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt new file mode 100644 index 00000000..a38dd8b5 --- /dev/null +++ b/spec/fixtures/test_grammars/conflicting_precedence/expected_error.txt @@ -0,0 +1,15 @@ +Unresolved conflict for symbol sequence: + + expression '+' expression • '*' … + +Possible interpretations: + + 1: (sum expression '+' expression) • '*' … + 2: expression '+' (product expression • '*' expression) + 3: expression '+' (other_thing expression • '*' '*') + +Possible resolutions: + + 1: Specify a higher precedence in `product` and `other_thing` than in the other rules. + 2: Specify a higher precedence in `sum` than in the other rules. + 3: Add a conflict for these rules: `sum` `product` `other_thing` diff --git a/spec/fixtures/test_grammars/conflicting_precedence/grammar.json b/spec/fixtures/test_grammars/conflicting_precedence/grammar.json new file mode 100644 index 00000000..4e82de64 --- /dev/null +++ b/spec/fixtures/test_grammars/conflicting_precedence/grammar.json @@ -0,0 +1,58 @@ +{ + "name": "conflicting_precedence", + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "other_thing"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "product": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "other_thing": { + "type": "PREC_LEFT", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, 
+ {"type": "STRING", "value": "*"} + ] + } + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt b/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt new file mode 100644 index 00000000..39b3d5fa --- /dev/null +++ b/spec/fixtures/test_grammars/epsilon_rules/expected_error.txt @@ -0,0 +1,2 @@ +The rule `rule_2` matches the empty string. +Tree-sitter currently does not support syntactic rules that match the empty string. diff --git a/spec/fixtures/test_grammars/epsilon_rules/grammar.json b/spec/fixtures/test_grammars/epsilon_rules/grammar.json new file mode 100644 index 00000000..5be5b983 --- /dev/null +++ b/spec/fixtures/test_grammars/epsilon_rules/grammar.json @@ -0,0 +1,15 @@ +{ + "name": "epsilon_rules", + + "rules": { + "rule_1": {"type": "SYMBOL", "name": "rule_2"}, + + "rule_2": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "rule_1"}, + {"type": "BLANK"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt b/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt new file mode 100644 index 00000000..4d691420 --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/corpus.txt @@ -0,0 +1,41 @@ +========================================= +single-line statements - internal tokens +========================================= + +a b + +--- + +(statement (variable) (variable) (line_break)) + +========================================= +multi-line statements - internal tokens +========================================= + +a +b + +--- + +(statement (variable) (variable) (line_break)) + +========================================= +single-line statements - external tokens +========================================= + +'hello' 'world' + +--- + +(statement (string) (string) (line_break)) + 
+========================================= +multi-line statements - external tokens +========================================= + +'hello' +'world' + +--- + +(statement (string) (string) (line_break)) diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json b/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json new file mode 100644 index 00000000..f24e1c1c --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/grammar.json @@ -0,0 +1,36 @@ +{ + "name": "external_and_internal_tokens", + + "externals": [ + "string", + "line_break" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "statement": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "_expression"}, + {"type": "SYMBOL", "name": "line_break"} + ] + }, + + "_expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "variable"}, + {"type": "SYMBOL", "name": "number"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"}, + "number": {"type": "PATTERN", "value": "\\d+"}, + "line_break": {"type": "STRING", "value": "\n"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md b/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md new file mode 100644 index 00000000..14ae934f --- /dev/null +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/readme.md @@ -0,0 +1 @@ +This grammar has an external scanner whose `scan` method needs to be able to check for the validity of an *internal* token. This is done by including the name of that internal token (`line_break`) in the grammar's `externals` field. 
\ No newline at end of file diff --git a/spec/fixtures/external_scanners/shared_external_tokens.c b/spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c similarity index 62% rename from spec/fixtures/external_scanners/shared_external_tokens.c rename to spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c index 0bee00d8..4d0acd0a 100644 --- a/spec/fixtures/external_scanners/shared_external_tokens.c +++ b/spec/fixtures/test_grammars/external_and_internal_tokens/scanner.c @@ -1,4 +1,3 @@ -#include #include enum { @@ -6,21 +5,17 @@ enum { LINE_BREAK }; -void *tree_sitter_shared_external_tokens_external_scanner_create() { - return NULL; -} +void *tree_sitter_external_and_internal_tokens_external_scanner_create() { return NULL; } -void tree_sitter_shared_external_tokens_external_scanner_reset(void *payload) { -} +void tree_sitter_external_and_internal_tokens_external_scanner_destroy(void *payload) {} -bool tree_sitter_shared_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} +void tree_sitter_external_and_internal_tokens_external_scanner_reset(void *payload) {} -void tree_sitter_shared_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} +bool tree_sitter_external_and_internal_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } -bool tree_sitter_shared_external_tokens_external_scanner_scan( +void tree_sitter_external_and_internal_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_and_internal_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { // If a line-break is a valid lookahead token, only skip spaces. 
@@ -58,6 +53,3 @@ bool tree_sitter_shared_external_tokens_external_scanner_scan( return false; } - -void tree_sitter_shared_external_tokens_external_scanner_destroy(void *payload) { -} diff --git a/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt b/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt new file mode 100644 index 00000000..ceac4b8a --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/corpus.txt @@ -0,0 +1,10 @@ +======================== +extra external tokens +======================== + +x = # a comment +y + +--- + +(assignment (variable) (comment) (variable)) diff --git a/spec/fixtures/test_grammars/external_extra_tokens/grammar.json b/spec/fixtures/test_grammars/external_extra_tokens/grammar.json new file mode 100644 index 00000000..ed13b34a --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/grammar.json @@ -0,0 +1,25 @@ +{ + "name": "external_extra_tokens", + + "externals": [ + "comment" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + "assignment": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "variable"}, + {"type": "STRING", "value": "="}, + {"type": "SYMBOL", "name": "variable"} + ] + }, + + "variable": {"type": "PATTERN", "value": "\\a+"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/external_extra_tokens/scanner.c b/spec/fixtures/test_grammars/external_extra_tokens/scanner.c new file mode 100644 index 00000000..4bd3e22e --- /dev/null +++ b/spec/fixtures/test_grammars/external_extra_tokens/scanner.c @@ -0,0 +1,36 @@ +#include <tree_sitter/parser.h> + +enum { + COMMENT, +}; + +void *tree_sitter_external_extra_tokens_external_scanner_create() { return NULL; } + +void tree_sitter_external_extra_tokens_external_scanner_destroy(void *payload) {} + +void tree_sitter_external_extra_tokens_external_scanner_reset(void *payload) {} + +bool 
tree_sitter_external_extra_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } + +void tree_sitter_external_extra_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_extra_tokens_external_scanner_scan( + void *payload, TSLexer *lexer, const bool *whitelist) { + + while (lexer->lookahead == ' ') { + lexer->advance(lexer, true); + } + + if (lexer->lookahead == '#') { + lexer->advance(lexer, false); + while (lexer->lookahead != '\n') { + lexer->advance(lexer, false); + } + + lexer->result_symbol = COMMENT; + return true; + } + + return false; +} + diff --git a/spec/fixtures/test_grammars/external_tokens/corpus.txt b/spec/fixtures/test_grammars/external_tokens/corpus.txt new file mode 100644 index 00000000..94153c16 --- /dev/null +++ b/spec/fixtures/test_grammars/external_tokens/corpus.txt @@ -0,0 +1,22 @@ +======================== +simple external tokens +========================= + +x + %(sup (external) scanner?) 
+ +--- + +(expression (sum (expression (identifier)) (expression (string)))) + +================================== +external tokens that require state +================================== + +%{sup {} #{x + y} {} scanner?} + +--- + +(expression (string + (expression (sum + (expression (identifier)) + (expression (identifier)))))) diff --git a/spec/fixtures/test_grammars/external_tokens/grammar.json b/spec/fixtures/test_grammars/external_tokens/grammar.json new file mode 100644 index 00000000..8a175404 --- /dev/null +++ b/spec/fixtures/test_grammars/external_tokens/grammar.json @@ -0,0 +1,57 @@ +{ + "name": "external_tokens", + + "externals": [ + "_percent_string", + "_percent_string_start", + "_percent_string_end" + ], + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "string"}, + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "sum": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "string": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "_percent_string"}, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "_percent_string_start"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "_percent_string_end"} + ] + }, + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "\\a+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/external_scanners/percent_strings.c b/spec/fixtures/test_grammars/external_tokens/scanner.c similarity index 80% rename from spec/fixtures/external_scanners/percent_strings.c rename to spec/fixtures/test_grammars/external_tokens/scanner.c index 9f68696e..7622e74d 100644 --- a/spec/fixtures/external_scanners/percent_strings.c +++ 
b/spec/fixtures/test_grammars/external_tokens/scanner.c @@ -1,4 +1,3 @@ -#include #include enum { @@ -13,7 +12,7 @@ typedef struct { uint32_t depth; } Scanner; -void *tree_sitter_external_scanner_example_external_scanner_create() { +void *tree_sitter_external_tokens_external_scanner_create() { Scanner *scanner = malloc(sizeof(Scanner)); *scanner = (Scanner){ .open_delimiter = 0, @@ -23,7 +22,17 @@ void *tree_sitter_external_scanner_example_external_scanner_create() { return scanner; } -bool tree_sitter_external_scanner_example_external_scanner_scan( +void tree_sitter_external_tokens_external_scanner_destroy(void *payload) { + free(payload); +} + +void tree_sitter_external_tokens_external_scanner_reset(void *payload) {} + +bool tree_sitter_external_tokens_external_scanner_serialize(void *payload, TSExternalTokenState state) { return true; } + +void tree_sitter_external_tokens_external_scanner_deserialize(void *payload, TSExternalTokenState state) {} + +bool tree_sitter_external_tokens_external_scanner_scan( void *payload, TSLexer *lexer, const bool *whitelist) { Scanner *scanner = payload; @@ -103,16 +112,3 @@ bool tree_sitter_external_scanner_example_external_scanner_scan( return false; } -void tree_sitter_external_scanner_example_external_scanner_reset(void *payload) { -} - -bool tree_sitter_external_scanner_example_external_scanner_serialize(void *payload, TSExternalTokenState state) { - return true; -} - -void tree_sitter_external_scanner_example_external_scanner_deserialize(void *payload, TSExternalTokenState state) { -} - -void tree_sitter_external_scanner_example_external_scanner_destroy(void *payload) { - free(payload); -} diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt new file mode 100644 index 00000000..d8b75557 --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/corpus.txt @@ -0,0 +1,33 @@ 
+======================== +regexes +======================== + +/a+/ + +--- + +(expression (regex)) + +======================== +conditionals +======================== + +(if (1) /a+/) + +--- + +(expression (parenthesized (expression (conditional + (parenthesized (expression (number))) + (expression (regex)))))) + +======================== +quotients +======================== + +((1) / 2) + +--- + +(expression (parenthesized (expression (quotient + (expression (parenthesized (expression (number)))) + (expression (number)))))) diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json new file mode 100644 index 00000000..143d6f2d --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/grammar.json @@ -0,0 +1,65 @@ +{ + "name": "lexical_conflicts_due_to_state_merging", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "conditional"}, + {"type": "SYMBOL", "name": "regex"}, + {"type": "SYMBOL", "name": "quotient"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "parenthesized"} + ] + }, + + "conditional": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "if"}, + {"type": "SYMBOL", "name": "parenthesized"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "quotient": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "/"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + "regex": { + "type": "PATTERN", + "value": "/[^/\n]+/" + }, + + "number": { + "type": "PATTERN", + "value": "\\d+" + }, + + "parenthesized": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": 
"expression"}, + {"type": "STRING", "value": ")"} + ] + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md new file mode 100644 index 00000000..9fc5fd7f --- /dev/null +++ b/spec/fixtures/test_grammars/lexical_conflicts_due_to_state_merging/readme.md @@ -0,0 +1,20 @@ +This grammar has two tokens, `regex` and `/`, which conflict: when a `/` character is encountered, the lexer can't tell if it is part of a `/` token or a `regex` by looking ahead only one character. But because these tokens are never valid in the same position, this doesn't cause any problem. + +When merging similar parse states in order to reduce the size of the parse table, it is important that we avoid merging states in a way that causes these two tokens to both appear as valid lookahead symbols in a given state. + +If we weren't careful, this grammar would cause that to happen, because a `regex` is valid in this state: + +``` +(if (1) /\w+/) + ^ +``` + +and a `/` is valid in this state: + + +``` +((1) / 2) + ^ +``` + +And these two states would otherwise be candidates for merging, because they both contain only the action `reduce(parenthesized, 3)`. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt b/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt new file mode 100644 index 00000000..b1be0828 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/expected_error.txt @@ -0,0 +1,15 @@ +Unresolved conflict for symbol sequence: + + identifier • '{' … + +Possible interpretations: + + 1: (expression identifier) • '{' … + 2: (function_call identifier • block) + +Possible resolutions: + + 1: Specify a higher precedence in `function_call` than in the other rules. 
+ 2: Specify a higher precedence in `expression` than in the other rules. + 3: Specify a left or right associativity in `expression` + 4: Add a conflict for these rules: `expression` `function_call` diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json new file mode 100644 index 00000000..19852708 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_missing", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md b/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md new file mode 100644 index 00000000..9db7345f --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_missing/readme.md @@ -0,0 +1,14 @@ +This language has function calls similar to Ruby's, 
with no parentheses required, and optional blocks. + +There is a shift/reduce conflict here: + +``` +foo bar { baz } + ^ +``` + +The possible actions are: +1. `reduce(expression, 1)` - `bar` is an expression being passed to the `foo` function. +2. `shift` - `bar` is a function being called with the block `{ baz }` + +The grammars `precedence_on_single_child_negative` and `precedence_on_single_child_positive` show possible resolutions to this conflict. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt b/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt new file mode 100644 index 00000000..69678dae --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/corpus.txt @@ -0,0 +1,12 @@ +=========================== +function calls with blocks +=========================== + +foo bar { baz } + +--- + +(expression (function_call + (identifier) + (expression (identifier)) + (block (expression (identifier))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json new file mode 100644 index 00000000..fc237f54 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_negative", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + "value": -1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": 
"SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md b/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md new file mode 100644 index 00000000..5b2cd804 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_negative/readme.md @@ -0,0 +1 @@ +This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a negative precedence. This causes reducing the `bar` variable to an expression to be preferred over shifting the `{` token as part of `function_call`. 
\ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt b/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt new file mode 100644 index 00000000..ee01d488 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/corpus.txt @@ -0,0 +1,13 @@ +=========================== +function calls with blocks +=========================== + +foo bar { baz } + +--- + +(expression (function_call + (identifier) + (expression (function_call + (identifier) + (block (expression (identifier))))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json b/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json new file mode 100644 index 00000000..7ffa73ed --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/grammar.json @@ -0,0 +1,63 @@ +{ + "name": "precedence_on_single_child_positive", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "function_call"}, + {"type": "SYMBOL", "name": "identifier"} + ] + }, + + "function_call": { + "type": "PREC_RIGHT", + "value": 1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + ] + } + }, + + "block": { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "{"}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "}"} + ] + }, + + "identifier": { + "type": "PATTERN", + 
"value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md b/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md new file mode 100644 index 00000000..3bb78e41 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_single_child_positive/readme.md @@ -0,0 +1 @@ +This grammar resolves the conflict shown in the `precedence_on_single_child_missing` grammar by giving `function_call` a positive precedence. This causes shifting the `{` token as part of `function_call` to be preferred over reducing the `bar` variable to an expression. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt b/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt new file mode 100644 index 00000000..1b3666f6 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_subsequence/corpus.txt @@ -0,0 +1,24 @@ +========================================== +curly brace blocks with high precedence +========================================== + +a b {} + +--- + +(expression (function_call + (identifier) + (expression (function_call (identifier) (block))))) + +========================================== +do blocks with low precedence +========================================== + +a b do end + +--- + +(expression (function_call + (identifier) + (expression (identifier)) + (do_block))) diff --git a/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json b/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json new file mode 100644 index 00000000..d05db765 --- /dev/null +++ b/spec/fixtures/test_grammars/precedence_on_subsequence/grammar.json @@ -0,0 +1,135 @@ +{ + "name": "precedence_on_subsequence", + + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], + + "rules": { + "expression": { + "type": "PREC_LEFT", + "value": 0, + "content": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": 
"function_call"}, + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "scope_resolution"} + ] + } + }, + + "function_call": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + + { + "type": "PREC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "block"} + ] + } + }, + + { + "type": "PREC", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + {"type": "SYMBOL", "name": "do_block"} + ] + } + }, + + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + { + "type": "PREC", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "block"} + ] + } + } + ] + }, + + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "identifier"}, + { + "type": "PREC", + "value": -1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "SYMBOL", "name": "do_block"} + ] + } + } + ] + } + ] + }, + + "scope_resolution": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "CHOICE", + "members": [ + { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "::"}, + {"type": "SYMBOL", "name": "expression"} + ] + }, + { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "::"}, + {"type": "SYMBOL", "name": "expression"}, + ] + } + ] + } + }, + + "block": { + "type": "STRING", + "value": "{}" + }, + + "do_block": { + "type": "STRING", + "value": "do end" + }, + + "identifier": { + "type": "PATTERN", + "value": "[a-zA-Z]+" + } + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme.md b/spec/fixtures/test_grammars/readme.md new file mode 100644 index 
00000000..a8f0449d --- /dev/null +++ b/spec/fixtures/test_grammars/readme.md @@ -0,0 +1,3 @@ +These small grammars demonstrate specific features or test for certain specific regressions. + +For some of them, compilation is expected to fail with a given error message. For others, the resulting parser is expected to produce certain trees. \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme_grammar/corpus.txt b/spec/fixtures/test_grammars/readme_grammar/corpus.txt new file mode 100644 index 00000000..df339f20 --- /dev/null +++ b/spec/fixtures/test_grammars/readme_grammar/corpus.txt @@ -0,0 +1,13 @@ +================================== +the readme example +================================== + +a + b * c + +--- + +(expression (sum + (expression (variable)) + (expression (product + (expression (variable)) + (expression (variable)))))) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/readme_grammar/grammar.json b/spec/fixtures/test_grammars/readme_grammar/grammar.json new file mode 100644 index 00000000..fd496068 --- /dev/null +++ b/spec/fixtures/test_grammars/readme_grammar/grammar.json @@ -0,0 +1,67 @@ +{ + "name": "readme_grammar", + + // Things that can appear anywhere in the language, like comments + // and whitespace, are expressed as 'extras'. + "extras": [ + {"type": "PATTERN", "value": "\\s"}, + {"type": "SYMBOL", "name": "comment"} + ], + + "rules": { + + // The first rule listed in the grammar becomes the 'start rule'. 
+ "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "variable"}, + { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ")"} + ] + } + ] + }, + + // Tokens like '+' and '*' are described directly within the + // grammar's rules, as opposed to in a separate lexer description. + "sum": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + // Ambiguities can be resolved at compile time by assigning precedence + // values to rule subtrees. + "product": { + "type": "PREC_LEFT", + "value": 2, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, + + // Tokens can be specified using ECMAScript regexps. 
+ "number": {"type": "PATTERN", "value": "\\d+"}, + "comment": {"type": "PATTERN", "value": "#.*"}, + "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt b/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt new file mode 100644 index 00000000..2b028562 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_blank/corpus.txt @@ -0,0 +1,7 @@ +======================== +the empty string +======================= + +--- + +(first_rule) \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json b/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json new file mode 100644 index 00000000..94b6c6c4 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_blank/grammar.json @@ -0,0 +1,6 @@ +{ + "name": "start_rule_is_blank", + "rules": { + "first_rule": {"type": "BLANK"} + } +} \ No newline at end of file diff --git a/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt b/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt new file mode 100644 index 00000000..aaa4e20b --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_token/corpus.txt @@ -0,0 +1,6 @@ +=========================== +the single token +========================== +the-value +--- +(first_rule) diff --git a/spec/fixtures/test_grammars/start_rule_is_token/grammar.json b/spec/fixtures/test_grammars/start_rule_is_token/grammar.json new file mode 100644 index 00000000..9b60c0d4 --- /dev/null +++ b/spec/fixtures/test_grammars/start_rule_is_token/grammar.json @@ -0,0 +1,6 @@ +{ + "name": "start_rule_is_token", + "rules": { + "first_rule": {"type": "STRING", "value": "the-value"} + } +} \ No newline at end of file diff --git a/spec/helpers/file_helpers.cc b/spec/helpers/file_helpers.cc new file mode 100644 index 00000000..3c08bec2 --- /dev/null +++ b/spec/helpers/file_helpers.cc @@ -0,0 +1,61 @@ +#include 
"helpers/file_helpers.h" +#include +#include +#include +#include + +using std::string; +using std::ifstream; +using std::istreambuf_iterator; +using std::ofstream; +using std::vector; + +bool file_exists(const string &path) { + struct stat file_stat; + return stat(path.c_str(), &file_stat) == 0; +} + +int get_modified_time(const string &path) { + struct stat file_stat; + if (stat(path.c_str(), &file_stat) != 0) { + if (errno != ENOENT) + fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str()); + return 0; + } + return file_stat.st_mtime; +} + +string read_file(const string &path) { + ifstream file(path); + istreambuf_iterator file_iterator(file), end_iterator; + string content(file_iterator, end_iterator); + file.close(); + return content; +} + +void write_file(const string &path, const string &content) { + ofstream file(path); + file << content; + file.close(); +} + +vector list_directory(const string &path) { + vector result; + + DIR *dir = opendir(path.c_str()); + if (!dir) { + printf("\nTest error - no such directory '%s'", path.c_str()); + return result; + } + + struct dirent *dir_entry; + while ((dir_entry = readdir(dir))) { + string name(dir_entry->d_name); + if (name != "." 
&& name != "..") { + result.push_back(name); + } + } + + closedir(dir); + return result; +} \ No newline at end of file diff --git a/spec/helpers/file_helpers.h b/spec/helpers/file_helpers.h new file mode 100644 index 00000000..c3d798ea --- /dev/null +++ b/spec/helpers/file_helpers.h @@ -0,0 +1,14 @@ +#ifndef HELPERS_FILE_HELPERS_H_ +#define HELPERS_FILE_HELPERS_H_ + +#include +#include +#include + +bool file_exists(const std::string &path); +int get_modified_time(const std::string &path); +std::string read_file(const std::string &path); +void write_file(const std::string &path, const std::string &content); +std::vector list_directory(const std::string &path); + +#endif // HELPERS_FILE_HELPERS_H_ diff --git a/spec/helpers/load_language.cc b/spec/helpers/load_language.cc index c59eca95..71829c5d 100644 --- a/spec/helpers/load_language.cc +++ b/spec/helpers/load_language.cc @@ -1,12 +1,12 @@ #include "spec_helper.h" #include "helpers/load_language.h" +#include "helpers/file_helpers.h" #include #include #include #include #include #include -#include #include #include #include "tree_sitter/compiler.h" @@ -54,25 +54,10 @@ static std::string run_command(const char *cmd, const char *args[]) { } } -static bool file_exists(const string &path) { - struct stat file_stat; - return stat(path.c_str(), &file_stat) == 0; -} - -static int get_modified_time(const string &path) { - struct stat file_stat; - if (stat(path.c_str(), &file_stat) != 0) { - if (errno != ENOENT) - fprintf(stderr, "Error in stat() for path: %s\n", + path.c_str()); - return 0; - } - return file_stat.st_mtime; -} - -const TSLanguage *load_language(const string &source_filename, - const string &lib_filename, - const string &language_name, - string external_scanner_filename = "") { +static const TSLanguage *load_language(const string &source_filename, + const string &lib_filename, + const string &language_name, + string external_scanner_filename = "") { string language_function_name = "tree_sitter_" + 
language_name; string header_dir = getenv("PWD") + string("/include"); int source_mtime = get_modified_time(source_filename); @@ -132,9 +117,9 @@ const TSLanguage *load_language(const string &source_filename, return reinterpret_cast(language_function)(); } -const TSLanguage *load_compile_result(const string &name, - const TSCompileResult &compile_result, - string external_scanner_path) { +const TSLanguage *load_test_language(const string &name, + const TSCompileResult &compile_result, + string external_scanner_path) { if (compile_result.error_type != TSCompileErrorTypeNone) { Assert::Failure(string("Compilation failed ") + compile_result.error_message); return nullptr; @@ -155,7 +140,7 @@ const TSLanguage *load_compile_result(const string &name, return language; } -const TSLanguage *get_test_language(const string &language_name) { +const TSLanguage *load_real_language(const string &language_name) { if (loaded_languages[language_name]) return loaded_languages[language_name]; @@ -182,20 +167,14 @@ const TSLanguage *get_test_language(const string &language_name) { if (parser_mtime < grammar_mtime || parser_mtime < libcompiler_mtime) { printf("\n" "Regenerating the %s parser...\n", language_name.c_str()); - ifstream grammar_file(grammar_filename); - istreambuf_iterator grammar_file_iterator(grammar_file), end_iterator; - string grammar_json(grammar_file_iterator, end_iterator); - grammar_file.close(); - + string grammar_json = read_file(grammar_filename); TSCompileResult result = ts_compile_grammar(grammar_json.c_str()); if (result.error_type != TSCompileErrorTypeNone) { fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message); return nullptr; } - ofstream parser_file(parser_filename); - parser_file << result.code; - parser_file.close(); + write_file(parser_filename, result.code); } mkdir("out/tmp", 0777); diff --git a/spec/helpers/load_language.h b/spec/helpers/load_language.h index 41d8b739..c34a33ca 100644 --- 
a/spec/helpers/load_language.h +++ b/spec/helpers/load_language.h @@ -5,8 +5,10 @@ #include "tree_sitter/runtime.h" #include -const TSLanguage *load_compile_result(const std::string &, const TSCompileResult &, - std::string external_scanner_path = ""); -const TSLanguage *get_test_language(const std::string &language_name); +const TSLanguage *load_real_language(const std::string &name); + +const TSLanguage *load_test_language(const std::string &name, + const TSCompileResult &compile_result, + std::string external_scanner_path = ""); #endif // HELPERS_LOAD_LANGUAGE_H_ diff --git a/spec/helpers/read_test_entries.cc b/spec/helpers/read_test_entries.cc index 970b7c57..e743253f 100644 --- a/spec/helpers/read_test_entries.cc +++ b/spec/helpers/read_test_entries.cc @@ -1,20 +1,18 @@ #include "helpers/read_test_entries.h" +#include #include -#include -#include -#include - #include +#include "helpers/file_helpers.h" + using std::regex; using std::regex_search; using std::regex_replace; -using std::smatch; using std::regex_constants::extended; - +using std::smatch; using std::string; using std::vector; -using std::ifstream; -using std::istreambuf_iterator; + +string fixtures_dir = "spec/fixtures/"; static string trim_output(const string &input) { string result(input); @@ -27,7 +25,7 @@ static string trim_output(const string &input) { static vector parse_test_entries(string content) { regex header_pattern("===+\n" "([^=]+)\n" "===+\n", extended); - regex separator_pattern("---+\n", extended); + regex separator_pattern("---+\r?\n", extended); vector descriptions; vector bodies; @@ -55,51 +53,42 @@ static vector parse_test_entries(string content) { body.substr(0, matches.position() - 1), trim_output(body.substr(matches.position() + matches[0].length())) }); + } else { + puts(("Invalid corpus entry with description: " + descriptions[i]).c_str()); + abort(); } } return result; } -static vector list_directory(string dir_name) { - vector result; - - DIR *dir = 
opendir(dir_name.c_str()); - if (!dir) { - printf("\nTest error - no such directory '%s'", dir_name.c_str()); - return result; - } - - struct dirent *dir_entry; - while ((dir_entry = readdir(dir))) { - string name(dir_entry->d_name); - if (name != "." && name != "..") - result.push_back(dir_name + "/" + name); - } - - closedir(dir); - return result; -} - -static string read_file(string filename) { - ifstream file(filename); - string result((istreambuf_iterator(file)), istreambuf_iterator()); - return result; -} - -vector read_corpus_entries(string language_name) { +vector read_real_language_corpus(string language_name) { vector result; - string fixtures_dir = "spec/fixtures/"; - string test_directory = fixtures_dir + "grammars/" + language_name + "/grammar_test"; - for (string &test_filename : list_directory(test_directory)) - for (TestEntry &entry : parse_test_entries(read_file(test_filename))) + for (string &test_filename : list_directory(test_directory)) { + for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) { result.push_back(entry); + } + } string error_test_filename = fixtures_dir + "/error_corpus/" + language_name + "_errors.txt"; - for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) + for (TestEntry &entry : parse_test_entries(read_file(error_test_filename))) { result.push_back(entry); + } return result; } + +vector read_test_language_corpus(string language_name) { + vector result; + + string test_directory = fixtures_dir + "test_grammars/" + language_name; + for (string &test_filename : list_directory(test_directory)) { + for (TestEntry &entry : parse_test_entries(read_file(test_directory + "/" + test_filename))) { + result.push_back(entry); + } + } + + return result; +} \ No newline at end of file diff --git a/spec/helpers/read_test_entries.h b/spec/helpers/read_test_entries.h index 69f949fc..3de397f1 100644 --- a/spec/helpers/read_test_entries.h +++ b/spec/helpers/read_test_entries.h @@ 
-10,6 +10,7 @@ struct TestEntry { std::string tree_string; }; -std::vector read_corpus_entries(std::string directory); +std::vector read_real_language_corpus(std::string name); +std::vector read_test_language_corpus(std::string name); #endif diff --git a/spec/helpers/rule_helpers.cc b/spec/helpers/rule_helpers.cc index 0b010d2e..968d59ba 100644 --- a/spec/helpers/rule_helpers.cc +++ b/spec/helpers/rule_helpers.cc @@ -1,6 +1,8 @@ #include "rule_helpers.h" #include #include "compiler/rules/symbol.h" +#include "compiler/variable.h" +#include "compiler/lexical_grammar.h" namespace tree_sitter { using std::make_shared; @@ -52,4 +54,9 @@ namespace tree_sitter { return left.name == right.name && left.rule->operator==(*right.rule) && left.type == right.type; } + + bool operator==(const LexicalVariable &left, const LexicalVariable &right) { + return left.name == right.name && left.rule->operator==(*right.rule) && + left.type == right.type && left.is_string == right.is_string; + } } diff --git a/spec/helpers/rule_helpers.h b/spec/helpers/rule_helpers.h index a985d294..8ebe87e8 100644 --- a/spec/helpers/rule_helpers.h +++ b/spec/helpers/rule_helpers.h @@ -15,7 +15,11 @@ namespace tree_sitter { rule_ptr i_token(size_t index); rule_ptr active_prec(int precedence, rule_ptr); + struct Variable; + struct LexicalVariable; + bool operator==(const Variable &left, const Variable &right); + bool operator==(const LexicalVariable &left, const LexicalVariable &right); } #endif // HELPERS_RULE_HELPERS_H_ diff --git a/spec/helpers/stream_methods.cc b/spec/helpers/stream_methods.cc index a4b275ea..5ef2898c 100644 --- a/spec/helpers/stream_methods.cc +++ b/spec/helpers/stream_methods.cc @@ -3,6 +3,7 @@ #include "tree_sitter/compiler.h" #include "compiler/parse_table.h" #include "compiler/syntax_grammar.h" +#include "compiler/lexical_grammar.h" #include "compiler/build_tables/parse_item.h" #include "compiler/build_tables/lex_item.h" @@ -41,6 +42,11 @@ ostream &operator<<(ostream &stream, const 
SyntaxVariable &variable) { return stream << string("{") << variable.name << string(", ") << variable.productions << string(", ") << to_string(variable.type) << string("}"); } +ostream &operator<<(ostream &stream, const LexicalVariable &variable) { + return stream << "{" << variable.name << ", " << variable.rule << ", " << + to_string(variable.type) << ", " << to_string(variable.is_string) << "}"; +} + std::ostream &operator<<(std::ostream &stream, const AdvanceAction &action) { return stream << string("#"; } diff --git a/spec/helpers/stream_methods.h b/spec/helpers/stream_methods.h index 28b201c3..149e43c5 100644 --- a/spec/helpers/stream_methods.h +++ b/spec/helpers/stream_methods.h @@ -93,10 +93,11 @@ using std::string; using std::to_string; struct Variable; struct SyntaxVariable; +struct LexicalVariable; struct AdvanceAction; struct AcceptTokenAction; -class ParseAction; -class ParseState; +struct ParseAction; +struct ParseState; struct ExternalToken; struct ProductionStep; struct PrecedenceRange; @@ -107,6 +108,7 @@ ostream &operator<<(ostream &, const Rule &); ostream &operator<<(ostream &, const rule_ptr &); ostream &operator<<(ostream &, const Variable &); ostream &operator<<(ostream &, const SyntaxVariable &); +ostream &operator<<(ostream &, const LexicalVariable &); ostream &operator<<(ostream &, const AdvanceAction &); ostream &operator<<(ostream &, const AcceptTokenAction &); ostream &operator<<(ostream &, const ParseAction &); @@ -119,8 +121,8 @@ namespace build_tables { class LexItem; class LexItemSet; -class ParseItem; -class ParseItemSet; +struct ParseItem; +struct ParseItemSet; class LookaheadSet; ostream &operator<<(ostream &, const LexItem &); diff --git a/spec/integration/compile_grammar_spec.cc b/spec/integration/compile_grammar_spec.cc deleted file mode 100644 index ed2109c2..00000000 --- a/spec/integration/compile_grammar_spec.cc +++ /dev/null @@ -1,847 +0,0 @@ -#include "spec_helper.h" -#include "runtime/alloc.h" -#include 
"helpers/load_language.h" -#include "helpers/stderr_logger.h" -#include "helpers/dedent.h" -#include "compiler/util/string_helpers.h" -#include - -static string fill_template(string input, map parameters) { - string result = input; - for (const auto &pair : parameters) { - util::str_replace(&result, "{{" + pair.first + "}}", pair.second); - } - return result; -} - -START_TEST - -describe("compile_grammar", []() { - TSDocument *document; - - before_each([&]() { - document = ts_document_new(); - }); - - after_each([&]() { - ts_document_free(document); - }); - - auto assert_root_node = [&](const string &expected_string) { - TSNode root_node = ts_document_root_node(document); - char *node_string = ts_node_string(root_node, document); - AssertThat(node_string, Equals(expected_string)); - ts_free(node_string); - }; - - describe("conflicts", [&]() { - it("can resolve shift/reduce conflicts using associativities", [&]() { - string grammar_template = R"JSON({ - "name": "associativity_example", - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "math_operation"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - "math_operation": { - "type": "{{math_operation_prec_type}}", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - // Ambiguity, which '+' applies first? 
- ts_document_set_input_string(document, "x+y+z"); - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC"} - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - expression '+' expression • '+' … - - Possible interpretations: - - 1: (math_operation expression '+' expression) • '+' … - 2: expression '+' (math_operation expression • '+' expression) - - Possible resolutions: - - 1: Specify a left or right associativity in `math_operation` - 2: Add a conflict for these rules: `math_operation` - )MESSAGE"))); - - result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC_LEFT"} - }).c_str()); - - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (math_operation " - "(expression (math_operation (expression (identifier)) (expression (identifier)))) " - "(expression (identifier))))"); - - result = ts_compile_grammar(fill_template(grammar_template, { - {"math_operation_prec_type", "PREC_RIGHT"} - }).c_str()); - - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (math_operation " - "(expression (identifier)) " - "(expression (math_operation (expression (identifier)) (expression (identifier))))))"); - }); - - it("can resolve shift/reduce conflicts involving single-child rules using precedence", [&]() { - string grammar_template = R"JSON({ - "name": "associativity_example", - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "function_call"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - "function_call": { - "type": "PREC_RIGHT", - "value": {{function_call_precedence}}, - "content": { - 
"type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "block"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "block"} - ] - } - ] - } - }, - - "block": { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "{"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "}"} - ] - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - // Ambiguity: is the trailing block associated with `bar` or `foo`? - ts_document_set_input_string(document, "foo bar { baz }"); - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "0"} - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - identifier • '{' … - - Possible interpretations: - - 1: (expression identifier) • '{' … - 2: (function_call identifier • block) - - Possible resolutions: - - 1: Specify a higher precedence in `function_call` than in the other rules. - 2: Specify a higher precedence in `expression` than in the other rules. - 3: Specify a left or right associativity in `expression` - 4: Add a conflict for these rules: `expression` `function_call` - )MESSAGE"))); - - // Giving function calls lower precedence than expressions causes `bar` - // to be treated as an expression passed to `foo`, not as a function - // that's being called with a block. 
- result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "-1"} - }).c_str()); - - AssertThat(result.error_message, IsNull()); - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (identifier)) " - "(block (expression (identifier)))))"); - - // Giving function calls higher precedence than expressions causes `bar` - // to be treated as a function that's being called with a block, not as - // an expression passed to `foo`. - result = ts_compile_grammar(fill_template(grammar_template, { - {"function_call_precedence", "1"} - }).c_str()); - - AssertThat(result.error_message, IsNull()); - ts_document_set_language(document, load_compile_result("associativity_example", result)); - ts_document_set_input_string(document, "foo bar { baz }"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (function_call " - "(identifier) " - "(block (expression (identifier)))))))"); - }); - - it("handles precedence applied to specific rule subsequences (regression)", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON({ - "name": "precedence_on_subsequence", - - "extras": [ - {"type": "STRING", "value": " "} - ], - - "rules": { - "expression": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "function_call"}, - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "scope_resolution"} - ] - } - }, - - "function_call": { - "type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - - { - "type": "PREC", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "block"} - ] 
- } - }, - - { - "type": "PREC", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - {"type": "SYMBOL", "name": "do_block"} - ] - } - }, - - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - { - "type": "PREC", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "block"} - ] - } - } - ] - }, - - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "identifier"}, - { - "type": "PREC", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "do_block"} - ] - } - } - ] - } - ] - }, - - "scope_resolution": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "CHOICE", - "members": [ - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "::"}, - {"type": "SYMBOL", "name": "expression"} - ] - }, - { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "::"}, - {"type": "SYMBOL", "name": "expression"}, - ] - } - ] - } - }, - - "block": { - "type": "STRING", - "value": "{}" - }, - - "do_block": { - "type": "STRING", - "value": "do end" - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"); - - auto language = load_compile_result("precedence_on_subsequence", result); - ts_document_set_language(document, language); - - ts_document_set_input_string(document, "a b {}"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (function_call (identifier) (block)))))"); - - ts_document_set_input_string(document, "a b do end"); - ts_document_parse(document); - assert_root_node("(expression (function_call " - "(identifier) " - "(expression (identifier)) " - "(do_block)))"); - }); - - it("does not allow conflicting precedences", [&]() { - string grammar_template = 
R"JSON({ - "name": "conflicting_precedence_example", - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "product"}, - {"type": "SYMBOL", "name": "other_thing"} - ] - }, - - "sum": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "product": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "other_thing": { - "type": "PREC_LEFT", - "value": -1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "STRING", "value": "*"} - ] - } - }, - - "identifier": { - "type": "PATTERN", - "value": "[a-zA-Z]+" - } - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(fill_template(grammar_template, { - }).c_str()); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - Unresolved conflict for symbol sequence: - - expression '+' expression • '*' … - - Possible interpretations: - - 1: (sum expression '+' expression) • '*' … - 2: expression '+' (product expression • '*' expression) - 3: expression '+' (other_thing expression • '*' '*') - - Possible resolutions: - - 1: Specify a higher precedence in `product` and `other_thing` than in the other rules. - 2: Specify a higher precedence in `sum` than in the other rules. 
- 3: Add a conflict for these rules: `sum` `product` `other_thing` - )MESSAGE"))); - }); - }); - - describe("when the grammar contains rules that match the empty string", [&]() { - it("reports an error", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "empty_rules", - - "rules": { - "rule_1": {"type": "SYMBOL", "name": "rule_2"}, - - "rule_2": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "rule_1"}, - {"type": "BLANK"} - ] - } - } - } - )JSON"); - - AssertThat(result.error_message, Equals(dedent(R"MESSAGE( - The rule `rule_2` matches the empty string. - Tree-sitter currently does not support syntactic rules that match the empty string. - )MESSAGE"))); - }); - }); - - describe("external scanners", [&]() { - it("can tokenize using arbitrary user-defined scanner functions", [&]() { - string grammar = R"JSON({ - "name": "external_scanner_example", - - "externals": [ - "_percent_string", - "_percent_string_start", - "_percent_string_end" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "string"}, - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "identifier"} - ] - }, - - "sum": { - "type": "PREC_LEFT", - "value": 0, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - "string": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "_percent_string"}, - { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "_percent_string_start"}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "SYMBOL", "name": "_percent_string_end"} - ] - }, - ] - }, - - "identifier": { - "type": "PATTERN", - "value": "\\a+" - } - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - 
ts_document_set_language(document, load_compile_result( - "external_scanner_example", - result, - "spec/fixtures/external_scanners/percent_strings.c" - )); - - ts_document_set_input_string(document, "x + %(sup (external) scanner?)"); - ts_document_parse(document); - assert_root_node("(expression (sum (expression (identifier)) (expression (string))))"); - - ts_document_set_input_string(document, "%{sup {} #{x + y} {} scanner?}"); - ts_document_parse(document); - assert_root_node("(expression (string (expression (sum (expression (identifier)) (expression (identifier))))))"); - }); - - it("allows external scanners to refer to tokens that are defined internally", [&]() { - string grammar = R"JSON({ - "name": "shared_external_tokens", - - "externals": [ - "string", - "line_break" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"} - ], - - "rules": { - "statement": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "_expression"}, - {"type": "SYMBOL", "name": "_expression"}, - {"type": "SYMBOL", "name": "line_break"} - ] - }, - - "_expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "string"}, - {"type": "SYMBOL", "name": "variable"}, - {"type": "SYMBOL", "name": "number"} - ] - }, - - "variable": {"type": "PATTERN", "value": "\\a+"}, - "number": {"type": "PATTERN", "value": "\\d+"}, - "line_break": {"type": "STRING", "value": "\n"} - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - ts_document_set_language(document, load_compile_result( - "shared_external_tokens", - result, - "spec/fixtures/external_scanners/shared_external_tokens.c" - )); - - ts_document_set_input_string(document, "a b\n"); - ts_document_parse(document); - assert_root_node("(statement (variable) (variable) (line_break))"); - - ts_document_set_input_string(document, "a \nb\n"); - ts_document_parse(document); - assert_root_node("(statement (variable) (variable) (line_break))"); - 
- ts_document_set_input_string(document, "'hello' 'world'\n"); - ts_document_parse(document); - assert_root_node("(statement (string) (string) (line_break))"); - - ts_document_set_input_string(document, "'hello' \n'world'\n"); - ts_document_parse(document); - assert_root_node("(statement (string) (string) (line_break))"); - }); - - it("allows external tokens to be used as extras", [&]() { - string grammar = R"JSON({ - "name": "extra_external_tokens", - - "externals": [ - "comment" - ], - - "extras": [ - {"type": "PATTERN", "value": "\\s"}, - {"type": "SYMBOL", "name": "comment"} - ], - - "rules": { - "assignment": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "variable"}, - {"type": "STRING", "value": "="}, - {"type": "SYMBOL", "name": "variable"} - ] - }, - - "variable": {"type": "PATTERN", "value": "\\a+"} - } - })JSON"; - - TSCompileResult result = ts_compile_grammar(grammar.c_str()); - AssertThat(result.error_message, IsNull()); - - ts_document_set_language(document, load_compile_result( - "extra_external_tokens", - result, - "spec/fixtures/external_scanners/extra_external_tokens.c" - )); - - ts_document_set_input_string(document, "x = # a comment\n y"); - ts_document_parse(document); - assert_root_node("(assignment (variable) (comment) (variable))"); - }); - }); - - describe("when the grammar's start symbol is a token", [&]() { - it("parses the token", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "one_token_language", - "rules": { - "first_rule": {"type": "STRING", "value": "the-value"} - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("one_token_language", result)); - - ts_document_set_input_string(document, "the-value"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("when the grammar's start symbol is blank", [&]() { - it("parses the empty string", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": 
"blank_language", - "rules": { - "first_rule": {"type": "BLANK"} - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("blank_language", result)); - - ts_document_set_input_string(document, ""); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("when the grammar contains anonymous tokens with escaped characters", [&]() { - it("escapes the escaped characters properly in the generated parser", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "escaped_char_language", - "rules": { - "first_rule": { - "type": "CHOICE", - "members": [ - {"type": "STRING", "value": "\n"}, - {"type": "STRING", "value": "\r"}, - {"type": "STRING", "value": "'hello'"}, - {"type": "PATTERN", "value": "\\d+"} - ] - } - } - } - )JSON"); - - ts_document_set_language(document, load_compile_result("escaped_char_language", result)); - - ts_document_set_input_string(document, "1234"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - - ts_document_set_input_string(document, "\n"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - - ts_document_set_input_string(document, "'hello'"); - ts_document_parse(document); - assert_root_node("(first_rule)"); - }); - }); - - describe("the grammar in the README", [&]() { - it("parses the input in the README", [&]() { - TSCompileResult result = ts_compile_grammar(R"JSON( - { - "name": "arithmetic", - - // Things that can appear anywhere in the language, like comments - // and whitespace, are expressed as 'extras'. - "extras": [ - {"type": "PATTERN", "value": "\\s"}, - {"type": "SYMBOL", "name": "comment"} - ], - - "rules": { - - // The first rule listed in the grammar becomes the 'start rule'. 
- "expression": { - "type": "CHOICE", - "members": [ - {"type": "SYMBOL", "name": "sum"}, - {"type": "SYMBOL", "name": "product"}, - {"type": "SYMBOL", "name": "number"}, - {"type": "SYMBOL", "name": "variable"}, - { - "type": "SEQ", - "members": [ - {"type": "STRING", "value": "("}, - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": ")"} - ] - } - ] - }, - - // Tokens like '+' and '*' are described directly within the - // grammar's rules, as opposed to in a seperate lexer description. - "sum": { - "type": "PREC_LEFT", - "value": 1, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "+"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - // Ambiguities can be resolved at compile time by assigning precedence - // values to rule subtrees. - "product": { - "type": "PREC_LEFT", - "value": 2, - "content": { - "type": "SEQ", - "members": [ - {"type": "SYMBOL", "name": "expression"}, - {"type": "STRING", "value": "*"}, - {"type": "SYMBOL", "name": "expression"} - ] - } - }, - - // Tokens can be specified using ECMAScript regexps. 
- "number": {"type": "PATTERN", "value": "\\d+"}, - "comment": {"type": "PATTERN", "value": "#.*"}, - "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"} - } - } - )JSON"); - - const TSLanguage *language = load_compile_result("arithmetic", result); - - ts_document_set_language(document, language); - ts_document_set_input_string(document, "a + b * c"); - ts_document_parse(document); - - assert_root_node( - "(expression (sum " - "(expression (variable)) " - "(expression (product " - "(expression (variable)) " - "(expression (variable))))))"); - }); - }); -}); - -END_TEST diff --git a/spec/integration/corpus_specs.cc b/spec/integration/corpus_specs.cc deleted file mode 100644 index c399e8f9..00000000 --- a/spec/integration/corpus_specs.cc +++ /dev/null @@ -1,185 +0,0 @@ -#include "spec_helper.h" -#include "runtime/alloc.h" -#include "helpers/load_language.h" -#include "helpers/read_test_entries.h" -#include "helpers/spy_input.h" -#include "helpers/stderr_logger.h" -#include "helpers/point_helpers.h" -#include "helpers/encoding_helpers.h" -#include "helpers/record_alloc.h" -#include "helpers/random_helpers.h" -#include "helpers/scope_sequence.h" -#include - -static void assert_correct_tree_shape(const TSDocument *document, string tree_string) { - TSNode root_node = ts_document_root_node(document); - const char *node_string = ts_node_string(root_node, document); - string result(node_string); - ts_free((void *)node_string); - AssertThat(result, Equals(tree_string)); -} - -static void assert_consistent_sizes(TSNode node) { - size_t child_count = ts_node_child_count(node); - size_t start_byte = ts_node_start_byte(node); - size_t end_byte = ts_node_end_byte(node); - TSPoint start_point = ts_node_start_point(node); - TSPoint end_point = ts_node_end_point(node); - bool some_child_has_changes = false; - - AssertThat(start_byte, !IsGreaterThan(end_byte)); - AssertThat(start_point, !IsGreaterThan(end_point)); - - size_t last_child_end_byte = start_byte; - TSPoint 
last_child_end_point = start_point; - - for (size_t i = 0; i < child_count; i++) { - TSNode child = ts_node_child(node, i); - size_t child_start_byte = ts_node_start_byte(child); - TSPoint child_start_point = ts_node_start_point(child); - - AssertThat(child_start_byte, !IsLessThan(last_child_end_byte)); - AssertThat(child_start_point, !IsLessThan(last_child_end_point)); - assert_consistent_sizes(child); - if (ts_node_has_changes(child)) - some_child_has_changes = true; - - last_child_end_byte = ts_node_end_byte(child); - last_child_end_point = ts_node_end_point(child); - } - - if (child_count > 0) { - AssertThat(end_byte, !IsLessThan(last_child_end_byte)); - AssertThat(end_point, !IsLessThan(last_child_end_point)); - } - - if (some_child_has_changes) { - AssertThat(ts_node_has_changes(node), IsTrue()); - } -} - -static void assert_correct_tree_size(TSDocument *document, string content) { - TSNode root_node = ts_document_root_node(document); - size_t expected_size = content.size(); - - // In the JSON grammar, the start rule (`_value`) is hidden, so the node - // returned from `ts_document_root_node` (e.g. an `object` node), does not - // actually point to the root of the tree. In this weird case, trailing - // whitespace is not included in the root node's size. - // - // TODO: Fix this inconsistency. Maybe disallow the start rule being hidden? 
- if (ts_document_language(document) == get_test_language("json") && - string(ts_node_type(root_node, document)) != "ERROR") - expected_size = content.find_last_not_of("\n ") + 1; - - AssertThat(ts_node_end_byte(root_node), Equals(expected_size)); - assert_consistent_sizes(root_node); -} - -START_TEST - -describe("The Corpus", []() { - vector test_languages({ - "javascript", - "json", - "c", - "cpp", - "python", - }); - - for (auto &language_name : test_languages) { - describe(("the " + language_name + " language").c_str(), [&]() { - TSDocument *document; - - before_each([&]() { - record_alloc::start(); - document = ts_document_new(); - ts_document_set_language(document, get_test_language(language_name)); - - // ts_document_set_logger(document, stderr_logger_new(true)); - // ts_document_print_debugging_graphs(document, true); - }); - - after_each([&]() { - ts_document_free(document); - AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); - }); - - for (auto &entry : read_corpus_entries(language_name)) { - SpyInput *input; - - auto it_handles_edit_sequence = [&](string name, std::function edit_sequence){ - it(("parses " + entry.description + ": " + name).c_str(), [&]() { - input = new SpyInput(entry.input, 3); - ts_document_set_input(document, input->input()); - edit_sequence(); - assert_correct_tree_shape(document, entry.tree_string); - assert_correct_tree_size(document, input->content); - delete input; - }); - }; - - it_handles_edit_sequence("initial parse", [&]() { - ts_document_parse(document); - }); - - std::set> deletions; - std::set> insertions; - - for (size_t i = 0; i < 60; i++) { - size_t edit_position = random() % utf8_char_count(entry.input); - size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); - string inserted_text = random_words(random() % 4 + 1); - - if (insertions.insert({edit_position, inserted_text}).second) { - string description = "\"" + inserted_text + "\" at " + to_string(edit_position); - - 
it_handles_edit_sequence("repairing an insertion of " + description, [&]() { - ts_document_edit(document, input->replace(edit_position, 0, inserted_text)); - ts_document_parse(document); - assert_correct_tree_size(document, input->content); - - ts_document_edit(document, input->undo()); - assert_correct_tree_size(document, input->content); - - TSRange *ranges; - uint32_t range_count; - ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); - ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); - - ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); - verify_changed_ranges(old_scope_sequence, new_scope_sequence, - input->content, ranges, range_count); - ts_free(ranges); - }); - } - - if (deletions.insert({edit_position, deletion_size}).second) { - string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size); - - it_handles_edit_sequence("repairing a deletion of " + desription, [&]() { - ts_document_edit(document, input->replace(edit_position, deletion_size, "")); - ts_document_parse(document); - assert_correct_tree_size(document, input->content); - - ts_document_edit(document, input->undo()); - assert_correct_tree_size(document, input->content); - - TSRange *ranges; - uint32_t range_count; - ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); - ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); - - ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); - verify_changed_ranges(old_scope_sequence, new_scope_sequence, - input->content, ranges, range_count); - ts_free(ranges); - }); - } - } - } - }); - } -}); - -END_TEST diff --git a/spec/integration/real_grammars.cc b/spec/integration/real_grammars.cc new file mode 100644 index 00000000..a7c2137d --- /dev/null +++ b/spec/integration/real_grammars.cc @@ -0,0 +1,181 @@ +#include "spec_helper.h" +#include "runtime/alloc.h" 
+#include "helpers/load_language.h" +#include "helpers/read_test_entries.h" +#include "helpers/spy_input.h" +#include "helpers/stderr_logger.h" +#include "helpers/point_helpers.h" +#include "helpers/encoding_helpers.h" +#include "helpers/record_alloc.h" +#include "helpers/random_helpers.h" +#include "helpers/scope_sequence.h" +#include + +static void assert_consistent_sizes(TSNode node) { + size_t child_count = ts_node_child_count(node); + size_t start_byte = ts_node_start_byte(node); + size_t end_byte = ts_node_end_byte(node); + TSPoint start_point = ts_node_start_point(node); + TSPoint end_point = ts_node_end_point(node); + bool some_child_has_changes = false; + + AssertThat(start_byte, !IsGreaterThan(end_byte)); + AssertThat(start_point, !IsGreaterThan(end_point)); + + size_t last_child_end_byte = start_byte; + TSPoint last_child_end_point = start_point; + + for (size_t i = 0; i < child_count; i++) { + TSNode child = ts_node_child(node, i); + size_t child_start_byte = ts_node_start_byte(child); + TSPoint child_start_point = ts_node_start_point(child); + + AssertThat(child_start_byte, !IsLessThan(last_child_end_byte)); + AssertThat(child_start_point, !IsLessThan(last_child_end_point)); + assert_consistent_sizes(child); + if (ts_node_has_changes(child)) + some_child_has_changes = true; + + last_child_end_byte = ts_node_end_byte(child); + last_child_end_point = ts_node_end_point(child); + } + + if (child_count > 0) { + AssertThat(end_byte, !IsLessThan(last_child_end_byte)); + AssertThat(end_point, !IsLessThan(last_child_end_point)); + } + + if (some_child_has_changes) { + AssertThat(ts_node_has_changes(node), IsTrue()); + } +} + +static void assert_correct_tree_size(TSDocument *document, string content) { + TSNode root_node = ts_document_root_node(document); + size_t expected_size = content.size(); + + // In the JSON grammar, the start rule (`_value`) is hidden, so the node + // returned from `ts_document_root_node` (e.g. 
an `object` node), does not + // actually point to the root of the tree. In this weird case, trailing + // whitespace is not included in the root node's size. + // + // TODO: Fix this inconsistency. Maybe disallow the start rule being hidden? + if (ts_document_language(document) == load_real_language("json") && + string(ts_node_type(root_node, document)) != "ERROR") + expected_size = content.find_last_not_of("\n ") + 1; + + AssertThat(ts_node_end_byte(root_node), Equals(expected_size)); + assert_consistent_sizes(root_node); +} + +START_TEST + +vector test_languages({ + "javascript", + "json", + "c", + "cpp", + "python", +}); + +for (auto &language_name : test_languages) { + describe(("the " + language_name + " language").c_str(), [&]() { + TSDocument *document; + + before_each([&]() { + record_alloc::start(); + document = ts_document_new(); + ts_document_set_language(document, load_real_language(language_name)); + + // ts_document_set_logger(document, stderr_logger_new(true)); + // ts_document_print_debugging_graphs(document, true); + }); + + after_each([&]() { + ts_document_free(document); + AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty()); + }); + + for (auto &entry : read_real_language_corpus(language_name)) { + SpyInput *input; + + auto it_handles_edit_sequence = [&](string name, std::function edit_sequence){ + it(("parses " + entry.description + ": " + name).c_str(), [&]() { + input = new SpyInput(entry.input, 3); + ts_document_set_input(document, input->input()); + edit_sequence(); + + TSNode root_node = ts_document_root_node(document); + const char *node_string = ts_node_string(root_node, document); + string result(node_string); + ts_free((void *)node_string); + AssertThat(result, Equals(entry.tree_string)); + + assert_correct_tree_size(document, input->content); + delete input; + }); + }; + + it_handles_edit_sequence("initial parse", [&]() { + ts_document_parse(document); + }); + + std::set> deletions; + std::set> insertions; + + for 
(size_t i = 0; i < 60; i++) { + size_t edit_position = random() % utf8_char_count(entry.input); + size_t deletion_size = random() % (utf8_char_count(entry.input) - edit_position); + string inserted_text = random_words(random() % 4 + 1); + + if (insertions.insert({edit_position, inserted_text}).second) { + string description = "\"" + inserted_text + "\" at " + to_string(edit_position); + + it_handles_edit_sequence("repairing an insertion of " + description, [&]() { + ts_document_edit(document, input->replace(edit_position, 0, inserted_text)); + ts_document_parse(document); + assert_correct_tree_size(document, input->content); + + ts_document_edit(document, input->undo()); + assert_correct_tree_size(document, input->content); + + TSRange *ranges; + uint32_t range_count; + ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); + ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); + + ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); + verify_changed_ranges(old_scope_sequence, new_scope_sequence, + input->content, ranges, range_count); + ts_free(ranges); + }); + } + + if (deletions.insert({edit_position, deletion_size}).second) { + string desription = to_string(edit_position) + "-" + to_string(edit_position + deletion_size); + + it_handles_edit_sequence("repairing a deletion of " + desription, [&]() { + ts_document_edit(document, input->replace(edit_position, deletion_size, "")); + ts_document_parse(document); + assert_correct_tree_size(document, input->content); + + ts_document_edit(document, input->undo()); + assert_correct_tree_size(document, input->content); + + TSRange *ranges; + uint32_t range_count; + ScopeSequence old_scope_sequence = build_scope_sequence(document, input->content); + ts_document_parse_and_get_changed_ranges(document, &ranges, &range_count); + + ScopeSequence new_scope_sequence = build_scope_sequence(document, input->content); + 
verify_changed_ranges(old_scope_sequence, new_scope_sequence, + input->content, ranges, range_count); + ts_free(ranges); + }); + } + } + } + }); +} + +END_TEST diff --git a/spec/integration/test_grammars.cc b/spec/integration/test_grammars.cc new file mode 100644 index 00000000..128dd6cc --- /dev/null +++ b/spec/integration/test_grammars.cc @@ -0,0 +1,78 @@ +#include "spec_helper.h" +#include "helpers/read_test_entries.h" +#include "helpers/load_language.h" +#include "helpers/stderr_logger.h" +#include "helpers/file_helpers.h" +#include "runtime/alloc.h" + +START_TEST + +string grammars_dir_path = "spec/fixtures/test_grammars"; +vector test_languages = list_directory(grammars_dir_path); + +for (auto &language_name : test_languages) { + if (language_name == "readme.md") continue; + + describe(("test language: " + language_name).c_str(), [&]() { + string directory_path = grammars_dir_path + "/" + language_name; + string grammar_path = directory_path + "/grammar.json"; + string external_scanner_path = directory_path + "/scanner.c"; + string expected_error_path = directory_path + "/expected_error.txt"; + string corpus_path = directory_path + "/corpus.txt"; + + if (!file_exists(external_scanner_path)) { + external_scanner_path = ""; + } + + string grammar_json = read_file(grammar_path); + TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str()); + + if (file_exists(expected_error_path)) { + it("fails with the correct error message", [&]() { + string expected_error = read_file(expected_error_path); + AssertThat((void *)compile_result.error_message, !IsNull()); + AssertThat(compile_result.error_message, Equals(expected_error)); + }); + + return; + } else { + TSDocument *document = nullptr; + const TSLanguage *language = nullptr; + + before_each([&]() { + if (!language) { + language = load_test_language( + language_name, + compile_result, + external_scanner_path + ); + } + + document = ts_document_new(); + ts_document_set_language(document, language); + + 
// ts_document_set_logger(document, stderr_logger_new(true)); + // ts_document_print_debugging_graphs(document, true); + }); + + after_each([&]() { + if (document) ts_document_free(document); + }); + + for (auto &entry : read_test_language_corpus(language_name)) { + it(("parses " + entry.description).c_str(), [&]() { + ts_document_set_input_string_with_length(document, entry.input.c_str(), entry.input.size()); + ts_document_parse(document); + + TSNode root_node = ts_document_root_node(document); + const char *node_string = ts_node_string(root_node, document); + string result(node_string); + ts_free((void *)node_string); + AssertThat(result, Equals(entry.tree_string)); + }); + } + } + }); +} + +END_TEST \ No newline at end of file diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index f80419dc..1863e210 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -43,7 +43,7 @@ describe("Document", [&]() { before_each([&]() { spy_input = new SpyInput("{\"key\": [null, 2]}", 3); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, "{\"key\": [1, 2]}"); ts_document_parse(document); @@ -152,7 +152,7 @@ describe("Document", [&]() { }); it("uses the given language for future parses", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_parse(document); root = ts_document_root_node(document); @@ -162,10 +162,10 @@ describe("Document", [&]() { }); it("clears out any previous tree", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_parse(document); - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); 
AssertThat(ts_document_root_node(document).data, Equals(nullptr)); ts_document_parse(document); @@ -177,7 +177,7 @@ describe("Document", [&]() { }); it("does not allow setting a language with a different version number", [&]() { - TSLanguage language = *get_test_language("json"); + TSLanguage language = *load_real_language("json"); AssertThat(ts_language_version(&language), Equals(TREE_SITTER_LANGUAGE_VERSION)); language.version++; @@ -193,7 +193,7 @@ describe("Document", [&]() { before_each([&]() { logger = new SpyLogger(); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, "[1, 2]"); }); @@ -235,7 +235,7 @@ describe("Document", [&]() { SpyInput *input; before_each([&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); input = new SpyInput("{a: null};", 3); ts_document_set_input(document, input->input()); ts_document_parse(document); diff --git a/spec/runtime/node_spec.cc b/spec/runtime/node_spec.cc index 085e4d31..f01a862f 100644 --- a/spec/runtime/node_spec.cc +++ b/spec/runtime/node_spec.cc @@ -40,7 +40,7 @@ describe("Node", []() { record_alloc::start(); document = ts_document_new(); - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); ts_document_set_input_string(document, input_string.c_str()); ts_document_parse(document); diff --git a/spec/runtime/parser_spec.cc b/spec/runtime/parser_spec.cc index 0b4c0a3a..c1c3a547 100644 --- a/spec/runtime/parser_spec.cc +++ b/spec/runtime/parser_spec.cc @@ -83,7 +83,7 @@ describe("Parser", [&]() { describe("handling errors", [&]() { describe("when there is an invalid substring right before a valid token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, 
get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, @@@@@, true]"); assert_root_node( @@ -108,7 +108,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string in the middle of a token", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, faaaaalse, true]"); assert_root_node( @@ -134,7 +134,7 @@ describe("Parser", [&]() { describe("when there is one unexpected token between two valid tokens", [&]() { it("computes the error node's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, true false, true]"); assert_root_node( @@ -153,7 +153,7 @@ describe("Parser", [&]() { describe("when there is an unexpected string at the end of a token", [&]() { it("computes the error's size and position correctly", [&]() { - ts_document_set_language(document, get_test_language("json")); + ts_document_set_language(document, load_real_language("json")); set_text(" [123, \"hi\n, true]"); assert_root_node( @@ -163,7 +163,7 @@ describe("Parser", [&]() { describe("when there is an unterminated error", [&]() { it("maintains a consistent tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("a; /* b"); assert_root_node( "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); @@ -172,7 +172,7 @@ describe("Parser", [&]() { describe("when there are extra tokens at the end of the viable prefix", [&]() { it("does not include them in the error node", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, 
load_real_language("javascript")); set_text( "var x;\n" "\n" @@ -192,7 +192,7 @@ describe("Parser", [&]() { describe("handling extra tokens", [&]() { describe("when the token appears as part of a grammar rule", [&]() { it("incorporates it into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("fn()\n"); assert_root_node( @@ -202,7 +202,7 @@ describe("Parser", [&]() { describe("when the token appears somewhere else", [&]() { it("incorporates it into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text( "fn()\n" " .otherFn();"); @@ -218,7 +218,7 @@ describe("Parser", [&]() { describe("when several extra tokens appear in a row", [&]() { it("incorporates them into the tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text( "fn()\n\n" "// This is a comment" @@ -239,7 +239,7 @@ describe("Parser", [&]() { describe("editing", [&]() { describe("creating new tokens near the end of the input", [&]() { it("updates the parse tree and re-reads only the changed portion of the text", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("x * (100 + abc);"); assert_root_node( @@ -262,7 +262,7 @@ describe("Parser", [&]() { it("updates the parse tree and re-reads only the changed portion of the input", [&]() { chunk_size = 2; - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 + 456 * (10 + x);"); assert_root_node( @@ -285,7 +285,7 @@ describe("Parser", [&]() { describe("introducing an error", [&]() { it("gives the error the right 
size", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("var x = y;"); assert_root_node( @@ -308,7 +308,7 @@ describe("Parser", [&]() { describe("into the middle of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -327,7 +327,7 @@ describe("Parser", [&]() { describe("at the end of an existing token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("abc * 123;"); assert_root_node( @@ -346,7 +346,7 @@ describe("Parser", [&]() { describe("inserting text into a node containing a extra token", [&]() { it("updates the parse tree", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 *\n" "// a-comment\n" "abc;"); @@ -373,7 +373,7 @@ describe("Parser", [&]() { describe("when a critical token is removed", [&]() { it("updates the parse tree, creating an error", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("123 * 456; 789 * 123;"); assert_root_node( @@ -392,7 +392,7 @@ describe("Parser", [&]() { describe("with external tokens", [&]() { it("maintains the external scanner's state during incremental parsing", [&]() { - ts_document_set_language(document, get_test_language("python")); + ts_document_set_language(document, load_real_language("python")); string text = dedent(R"PYTHON( if a: print b @@ -420,7 +420,7 @@ describe("Parser", [&]() { }); it("does not try to re-use nodes that are within the 
edited region", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("{ x: (b.c) };"); assert_root_node( @@ -435,7 +435,7 @@ describe("Parser", [&]() { }); it("updates the document's parse count", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); AssertThat(ts_document_parse_count(document), Equals(0)); set_text("{ x: (b.c) };"); @@ -449,7 +449,7 @@ describe("Parser", [&]() { describe("lexing", [&]() { describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() { it("terminates them at the end of the document", [&]() { - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("x; // this is a comment"); assert_root_node( @@ -464,7 +464,7 @@ describe("Parser", [&]() { it("recognizes UTF8 characters as single characters", [&]() { // 'ΩΩΩ — ΔΔ'; - ts_document_set_language(document, get_test_language("javascript")); + ts_document_set_language(document, load_real_language("javascript")); set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';"); assert_root_node( diff --git a/src/compiler/build_tables/build_lex_table.cc b/src/compiler/build_tables/build_lex_table.cc deleted file mode 100644 index 29d8f4d0..00000000 --- a/src/compiler/build_tables/build_lex_table.cc +++ /dev/null @@ -1,195 +0,0 @@ -#include "compiler/build_tables/build_lex_table.h" -#include -#include -#include -#include -#include -#include -#include -#include "compiler/build_tables/lex_conflict_manager.h" -#include "compiler/build_tables/remove_duplicate_states.h" -#include "compiler/build_tables/lex_item.h" -#include "compiler/parse_table.h" -#include "compiler/lexical_grammar.h" -#include "compiler/rules/built_in_symbols.h" -#include "compiler/rules/choice.h" -#include 
"compiler/rules/metadata.h" -#include "compiler/rules/repeat.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/blank.h" - -namespace tree_sitter { -namespace build_tables { - -using std::map; -using std::set; -using std::string; -using std::vector; -using std::make_shared; -using std::unordered_map; -using rules::Blank; -using rules::Choice; -using rules::CharacterSet; -using rules::Repeat; -using rules::Symbol; -using rules::Metadata; -using rules::Seq; - -class LexTableBuilder { - LexTable lex_table; - ParseTable *parse_table; - const LexicalGrammar lex_grammar; - vector separator_rules; - LexConflictManager conflict_manager; - unordered_map lex_state_ids; - - public: - LexTableBuilder(ParseTable *parse_table, const LexicalGrammar &lex_grammar) - : parse_table(parse_table), lex_grammar(lex_grammar) { - for (const rule_ptr &rule : lex_grammar.separators) - separator_rules.push_back(Repeat::build(rule)); - separator_rules.push_back(Blank::build()); - } - - LexTable build() { - for (ParseState &parse_state : parse_table->states) - add_lex_state_for_parse_state(&parse_state); - - mark_fragile_tokens(); - remove_duplicate_lex_states(); - - return lex_table; - } - - private: - void add_lex_state_for_parse_state(ParseState *parse_state) { - parse_state->lex_state_id = - add_lex_state(item_set_for_terminals(parse_state->terminal_entries)); - } - - LexStateId add_lex_state(const LexItemSet &item_set) { - const auto &pair = lex_state_ids.find(item_set); - if (pair == lex_state_ids.end()) { - LexStateId state_id = lex_table.add_state(); - lex_state_ids[item_set] = state_id; - add_accept_token_actions(item_set, state_id); - add_advance_actions(item_set, state_id); - return state_id; - } else { - return pair->second; - } - } - - void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) { - for (const auto &pair : item_set.transitions()) { - const CharacterSet &characters = pair.first; - const LexItemSet::Transition &transition = pair.second; - 
AdvanceAction action(-1, transition.precedence, transition.in_main_token); - - auto current_action = lex_table.state(state_id).accept_action; - if (conflict_manager.resolve(transition.destination, action, - current_action)) { - action.state_index = add_lex_state(transition.destination); - lex_table.state(state_id).advance_actions[characters] = action; - } - } - } - - void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) { - for (const LexItem &item : item_set.entries) { - LexItem::CompletionStatus completion_status = item.completion_status(); - if (completion_status.is_done) { - AcceptTokenAction action(item.lhs, completion_status.precedence.max, - completion_status.is_string); - - auto current_action = lex_table.state(state_id).accept_action; - if (conflict_manager.resolve(action, current_action)) - lex_table.state(state_id).accept_action = action; - } - } - } - - void mark_fragile_tokens() { - for (ParseState &state : parse_table->states) { - for (auto &entry : state.terminal_entries) { - Symbol symbol = entry.first; - if (symbol.is_token()) { - auto homonyms = conflict_manager.possible_homonyms.find(symbol.index); - if (homonyms != conflict_manager.possible_homonyms.end()) - for (Symbol::Index homonym : homonyms->second) - if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) { - entry.second.reusable = false; - break; - } - - if (!entry.second.reusable) - continue; - - auto extensions = conflict_manager.possible_extensions.find(symbol.index); - if (extensions != conflict_manager.possible_extensions.end()) - for (Symbol::Index extension : extensions->second) - if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) { - entry.second.depends_on_lookahead = true; - break; - } - } - } - } - } - - void remove_duplicate_lex_states() { - for (LexState &state : lex_table.states) { - state.accept_action.is_string = false; - state.accept_action.precedence = 0; - } - - auto replacements = - 
remove_duplicate_states(&lex_table); - - for (ParseState &parse_state : parse_table->states) { - auto replacement = replacements.find(parse_state.lex_state_id); - if (replacement != replacements.end()) - parse_state.lex_state_id = replacement->second; - } - } - - LexItemSet item_set_for_terminals(const map &terminals) { - LexItemSet result; - for (const auto &pair : terminals) { - Symbol symbol = pair.first; - if (symbol.is_token()) { - for (const rule_ptr &rule : rules_for_symbol(symbol)) { - for (const rule_ptr &separator_rule : separator_rules) { - result.entries.insert(LexItem( - symbol, - Metadata::separator( - Seq::build({ - separator_rule, - Metadata::main_token(rule) })))); - } - } - } - } - return result; - } - - vector rules_for_symbol(const rules::Symbol &symbol) { - if (symbol == rules::END_OF_INPUT()) - return { CharacterSet().include(0).copy() }; - - rule_ptr rule = lex_grammar.variables[symbol.index].rule; - - auto choice = rule->as(); - if (choice) - return choice->elements; - else - return { rule }; - } -}; - -LexTable build_lex_table(ParseTable *table, const LexicalGrammar &grammar) { - return LexTableBuilder(table, grammar).build(); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/build_lex_table.h b/src/compiler/build_tables/build_lex_table.h deleted file mode 100644 index 26bfe6c2..00000000 --- a/src/compiler/build_tables/build_lex_table.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_ -#define COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_ - -#include "compiler/lex_table.h" - -namespace tree_sitter { - -struct LexicalGrammar; -class ParseTable; - -namespace build_tables { - -LexTable build_lex_table(ParseTable *, const LexicalGrammar &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_ diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index 
9fb6859f..1f8a6939 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -6,14 +6,13 @@ #include #include #include "compiler/parse_table.h" -#include "compiler/build_tables/remove_duplicate_states.h" #include "compiler/build_tables/parse_item.h" #include "compiler/build_tables/parse_item_set_builder.h" #include "compiler/lexical_grammar.h" #include "compiler/syntax_grammar.h" #include "compiler/rules/symbol.h" #include "compiler/rules/built_in_symbols.h" -#include "compiler/build_tables/recovery_tokens.h" +#include "compiler/build_tables/lex_table_builder.h" namespace tree_sitter { namespace build_tables { @@ -41,6 +40,7 @@ class ParseTableBuilder { set conflicts; ParseItemSetBuilder item_set_builder; set fragile_productions; + vector> incompatible_token_indices_by_index; bool allow_any_conflict; public: @@ -56,9 +56,9 @@ class ParseTableBuilder { Symbol(0, Symbol::Terminal) : Symbol(0, Symbol::NonTerminal); - Production start_production({ - ProductionStep(start_symbol, 0, rules::AssociativityNone), - }); + Production start_production{ + ProductionStep{start_symbol, 0, rules::AssociativityNone}, + }; // Placeholder for error state add_parse_state(ParseItemSet()); @@ -71,10 +71,11 @@ class ParseTableBuilder { })); CompileError error = process_part_state_queue(); - if (error.type != TSCompileErrorTypeNone) + if (error.type != TSCompileErrorTypeNone) { return { parse_table, error }; + } - parse_table.mergeable_symbols = recovery_tokens(lexical_grammar); + compute_unmergable_token_pairs(); build_error_parse_state(); @@ -110,8 +111,18 @@ class ParseTableBuilder { void build_error_parse_state() { ParseState error_state; - for (const Symbol symbol : parse_table.mergeable_symbols) { - add_out_of_context_parse_state(&error_state, symbol); + for (Symbol::Index i = 0; i < lexical_grammar.variables.size(); i++) { + bool has_non_reciprocal_conflict = false; + for (Symbol::Index incompatible_index : 
incompatible_token_indices_by_index[i]) { + if (!incompatible_token_indices_by_index[incompatible_index].count(i)) { + has_non_reciprocal_conflict = true; + break; + } + } + + if (!has_non_reciprocal_conflict) { + add_out_of_context_parse_state(&error_state, Symbol(i, Symbol::Terminal)); + } } for (const Symbol &symbol : grammar.extra_tokens) { @@ -148,7 +159,8 @@ class ParseTableBuilder { ParseStateId add_parse_state(const ParseItemSet &item_set) { auto pair = parse_state_ids.find(item_set); if (pair == parse_state_ids.end()) { - ParseStateId state_id = parse_table.add_state(); + ParseStateId state_id = parse_table.states.size(); + parse_table.states.push_back(ParseState()); parse_state_ids[item_set] = state_id; parse_table.states[state_id].shift_actions_signature = item_set.unfinished_item_signature(); item_sets_to_process.push_back({ std::move(item_set), state_id }); @@ -291,6 +303,34 @@ class ParseTableBuilder { } } + void compute_unmergable_token_pairs() { + incompatible_token_indices_by_index.resize(lexical_grammar.variables.size()); + + // First, assume that all tokens are mutually incompatible. + for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + auto &incompatible_indices = incompatible_token_indices_by_index[i]; + for (Symbol::Index j = 0; j < n; j++) { + if (j != i) incompatible_indices.insert(j); + } + } + + // For the remaining possibly-incompatible pairs of tokens, check if they + // are actually incompatible by actually generating lexical states that + // contain them both. 
+ auto lex_table_builder = LexTableBuilder::create(lexical_grammar); + for (Symbol::Index i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + auto &incompatible_indices = incompatible_token_indices_by_index[i]; + auto iter = incompatible_indices.begin(); + while (iter != incompatible_indices.end()) { + if (lex_table_builder->detect_conflict(i, *iter)) { + ++iter; + } else { + iter = incompatible_indices.erase(iter); + } + } + } + } + void remove_duplicate_parse_states() { map> state_indices_by_signature; @@ -302,7 +342,7 @@ class ParseTableBuilder { set deleted_states; while (true) { - std::map state_replacements; + map state_replacements; for (auto &pair : state_indices_by_signature) { auto &state_group = pair.second; @@ -310,7 +350,7 @@ class ParseTableBuilder { for (ParseStateId i : state_group) { for (ParseStateId j : state_group) { if (j == i) break; - if (!state_replacements.count(j) && parse_table.merge_state(j, i)) { + if (!state_replacements.count(j) && merge_parse_state(j, i)) { state_replacements.insert({ i, j }); deleted_states.insert(i); break; @@ -364,6 +404,72 @@ class ParseTableBuilder { } } + static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { + for (const auto &pair : state.terminal_entries) + if (pair.second == entry) + return true; + return false; + } + + bool merge_parse_state(size_t i, size_t j) { + ParseState &state = parse_table.states[i]; + ParseState &other = parse_table.states[j]; + + if (state.nonterminal_entries != other.nonterminal_entries) + return false; + + for (auto &entry : state.terminal_entries) { + Symbol lookahead = entry.first; + const vector &actions = entry.second.actions; + auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index]; + + const auto &other_entry = other.terminal_entries.find(lookahead); + if (other_entry == other.terminal_entries.end()) { + if (lookahead.is_external()) return false; + if (!lookahead.is_built_in()) { + for (Symbol::Index 
incompatible_index : incompatible_token_indices) { + Symbol incompatible_symbol(incompatible_index, Symbol::Terminal); + if (other.terminal_entries.count(incompatible_symbol)) return false; + } + } + if (actions.back().type != ParseActionTypeReduce) + return false; + if (!has_entry(other, entry.second)) + return false; + } else if (entry.second != other_entry->second) { + return false; + } + } + + set symbols_to_merge; + + for (auto &entry : other.terminal_entries) { + Symbol lookahead = entry.first; + const vector &actions = entry.second.actions; + auto &incompatible_token_indices = incompatible_token_indices_by_index[lookahead.index]; + + if (!state.terminal_entries.count(lookahead)) { + if (lookahead.is_external()) return false; + if (!lookahead.is_built_in()) { + for (Symbol::Index incompatible_index : incompatible_token_indices) { + Symbol incompatible_symbol(incompatible_index, Symbol::Terminal); + if (state.terminal_entries.count(incompatible_symbol)) return false; + } + } + if (actions.back().type != ParseActionTypeReduce) + return false; + if (!has_entry(state, entry.second)) + return false; + symbols_to_merge.insert(lookahead); + } + } + + for (const Symbol &lookahead : symbols_to_merge) + state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; + + return true; + } + string handle_conflict(const ParseItemSet &item_set, ParseStateId state_id, Symbol lookahead) { ParseTableEntry &entry = parse_table.states[state_id].terminal_entries[lookahead]; @@ -574,7 +680,7 @@ class ParseTableBuilder { switch (symbol.type) { case Symbol::Terminal: { - const Variable &variable = lexical_grammar.variables[symbol.index]; + const LexicalVariable &variable = lexical_grammar.variables[symbol.index]; if (variable.type == VariableTypeNamed) return variable.name; else diff --git a/src/compiler/build_tables/build_tables.cc b/src/compiler/build_tables/build_tables.cc index b226a403..de9fa466 100644 --- a/src/compiler/build_tables/build_tables.cc +++ 
b/src/compiler/build_tables/build_tables.cc @@ -1,6 +1,6 @@ #include "compiler/build_tables/build_tables.h" #include -#include "compiler/build_tables/build_lex_table.h" +#include "compiler/build_tables/lex_table_builder.h" #include "compiler/build_tables/build_parse_table.h" #include "compiler/syntax_grammar.h" #include "compiler/lexical_grammar.h" @@ -15,11 +15,13 @@ using std::vector; using std::make_tuple; tuple build_tables( - const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) { - auto parse_table_result = build_parse_table(grammar, lex_grammar); + const SyntaxGrammar &grammar, + const LexicalGrammar &lexical_grammar +) { + auto parse_table_result = build_parse_table(grammar, lexical_grammar); ParseTable parse_table = parse_table_result.first; const CompileError error = parse_table_result.second; - LexTable lex_table = build_lex_table(&parse_table, lex_grammar); + LexTable lex_table = LexTableBuilder::create(lexical_grammar)->build(&parse_table); return make_tuple(parse_table, lex_table, error); } diff --git a/src/compiler/build_tables/lex_conflict_manager.cc b/src/compiler/build_tables/lex_conflict_manager.cc index 3fc22ed2..0fbdf4d9 100644 --- a/src/compiler/build_tables/lex_conflict_manager.cc +++ b/src/compiler/build_tables/lex_conflict_manager.cc @@ -10,11 +10,10 @@ namespace build_tables { bool LexConflictManager::resolve(const LexItemSet &item_set, const AdvanceAction &new_action, const AcceptTokenAction &old_action) { - if (!old_action.is_present()) - return true; if (new_action.precedence_range.max >= old_action.precedence) { - for (const LexItem &item : item_set.entries) + for (const LexItem &item : item_set.entries) { possible_extensions[old_action.symbol.index].insert(item.lhs.index); + } return true; } else { return false; @@ -23,30 +22,26 @@ bool LexConflictManager::resolve(const LexItemSet &item_set, bool LexConflictManager::resolve(const AcceptTokenAction &new_action, const AcceptTokenAction &old_action) { - if 
(!old_action.is_present()) - return true; - - int old_precedence = old_action.precedence; - int new_precedence = new_action.precedence; - bool result; - if (new_precedence > old_precedence) + if (new_action.precedence > old_action.precedence) { result = true; - else if (new_precedence < old_precedence) + } else if (new_action.precedence < old_action.precedence) { result = false; - else if (new_action.is_string && !old_action.is_string) + } else if (new_action.is_string && !old_action.is_string) { result = true; - else if (old_action.is_string && !new_action.is_string) + } else if (old_action.is_string && !new_action.is_string) { result = false; - else if (new_action.symbol.index < old_action.symbol.index) + } else if (new_action.symbol.index < old_action.symbol.index) { result = true; - else + } else { result = false; + } - if (result) + if (result) { possible_homonyms[old_action.symbol.index].insert(new_action.symbol.index); - else + } else { possible_homonyms[new_action.symbol.index].insert(old_action.symbol.index); + } return result; } diff --git a/src/compiler/build_tables/lex_item.cc b/src/compiler/build_tables/lex_item.cc index 152b2469..4c9056df 100644 --- a/src/compiler/build_tables/lex_item.cc +++ b/src/compiler/build_tables/lex_item.cc @@ -32,19 +32,15 @@ LexItem::CompletionStatus LexItem::completion_status() const { CompletionStatus apply_to(const rules::Choice *rule) { for (const auto &element : rule->elements) { CompletionStatus status = apply(element); - if (status.is_done) - return status; + if (status.is_done) return status; } - return { false, PrecedenceRange(), false }; + return { false, PrecedenceRange() }; } CompletionStatus apply_to(const rules::Metadata *rule) { CompletionStatus result = apply(rule->rule); - if (result.is_done) { - if (result.precedence.empty && rule->params.has_precedence) - result.precedence.add(rule->params.precedence); - if (rule->params.is_string) - result.is_string = true; + if (result.is_done && result.precedence.empty 
&& rule->params.has_precedence) { + result.precedence.add(rule->params.precedence); } return result; } @@ -54,15 +50,16 @@ LexItem::CompletionStatus LexItem::completion_status() const { } CompletionStatus apply_to(const rules::Blank *rule) { - return { true, PrecedenceRange(), false }; + return { true, PrecedenceRange() }; } CompletionStatus apply_to(const rules::Seq *rule) { CompletionStatus left_status = apply(rule->left); - if (left_status.is_done) + if (left_status.is_done) { return apply(rule->right); - else - return { false, PrecedenceRange(), false }; + } else { + return { false, PrecedenceRange() }; + } } }; @@ -80,8 +77,9 @@ bool LexItemSet::operator==(const LexItemSet &other) const { LexItemSet::TransitionMap LexItemSet::transitions() const { TransitionMap result; - for (const LexItem &item : entries) + for (const LexItem &item : entries) { lex_item_transitions(&result, item); + } return result; } diff --git a/src/compiler/build_tables/lex_item.h b/src/compiler/build_tables/lex_item.h index 4c45f80d..b6b07de7 100644 --- a/src/compiler/build_tables/lex_item.h +++ b/src/compiler/build_tables/lex_item.h @@ -19,7 +19,6 @@ class LexItem { struct CompletionStatus { bool is_done; PrecedenceRange precedence; - bool is_string; }; bool operator==(const LexItem &other) const; diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc new file mode 100644 index 00000000..e0a18914 --- /dev/null +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -0,0 +1,324 @@ +#include "compiler/build_tables/lex_table_builder.h" +#include +#include +#include +#include +#include +#include +#include +#include "compiler/build_tables/lex_conflict_manager.h" +#include "compiler/build_tables/lex_item.h" +#include "compiler/parse_table.h" +#include "compiler/lexical_grammar.h" +#include "compiler/rules/built_in_symbols.h" +#include "compiler/rules/choice.h" +#include "compiler/rules/metadata.h" +#include "compiler/rules/repeat.h" 
+#include "compiler/rules/seq.h" +#include "compiler/rules/blank.h" +#include "compiler/rules/visitor.h" + +namespace tree_sitter { +namespace build_tables { + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; +using std::unordered_map; +using std::unique_ptr; +using rules::Blank; +using rules::Choice; +using rules::CharacterSet; +using rules::Repeat; +using rules::Symbol; +using rules::Metadata; +using rules::Seq; + +class StartingCharacterAggregator : public rules::RuleFn { + void apply_to(const rules::Seq *rule) { + apply(rule->left); + } + + void apply_to(const rules::Choice *rule) { + for (const rule_ptr &element : rule->elements) apply(element); + } + + void apply_to(const rules::Repeat *rule) { + apply(rule->content); + } + + void apply_to(const rules::Metadata *rule) { + apply(rule->rule); + } + + void apply_to(const rules::CharacterSet *rule) { + result.add_set(*rule); + } + + public: + CharacterSet result; +}; + +class LexTableBuilderImpl : public LexTableBuilder { + LexTable lex_table; + const LexicalGrammar grammar; + vector separator_rules; + CharacterSet first_separator_characters; + LexConflictManager conflict_manager; + unordered_map lex_state_ids; + + public: + vector shadowed_token_indices; + + LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) { + StartingCharacterAggregator starting_character_aggregator; + for (const rule_ptr &rule : grammar.separators) { + separator_rules.push_back(Repeat::build(rule)); + starting_character_aggregator.apply(rule); + } + separator_rules.push_back(Blank::build()); + first_separator_characters = starting_character_aggregator.result; + shadowed_token_indices.resize(grammar.variables.size()); + } + + LexTable build(ParseTable *parse_table) { + for (ParseState &parse_state : parse_table->states) { + parse_state.lex_state_id = add_lex_state( + item_set_for_terminals(parse_state.terminal_entries) + ); + } + mark_fragile_tokens(parse_table); + 
remove_duplicate_lex_states(parse_table); + return lex_table; + } + + bool detect_conflict(Symbol::Index left, Symbol::Index right) { + clear(); + + map terminals; + terminals[Symbol(left, Symbol::Terminal)]; + terminals[Symbol(right, Symbol::Terminal)]; + + add_lex_state(item_set_for_terminals(terminals)); + + return shadowed_token_indices[right]; + } + + LexStateId add_lex_state(const LexItemSet &item_set) { + const auto &pair = lex_state_ids.find(item_set); + if (pair == lex_state_ids.end()) { + LexStateId state_id = lex_table.states.size(); + lex_table.states.push_back(LexState()); + lex_state_ids[item_set] = state_id; + add_accept_token_actions(item_set, state_id); + add_advance_actions(item_set, state_id); + return state_id; + } else { + return pair->second; + } + } + + void clear() { + lex_table.states.clear(); + lex_state_ids.clear(); + shadowed_token_indices.assign(grammar.variables.size(), false); + } + + private: + void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) { + for (const auto &pair : item_set.transitions()) { + const CharacterSet &characters = pair.first; + const LexItemSet::Transition &transition = pair.second; + + AdvanceAction action(-1, transition.precedence, transition.in_main_token); + auto current_action = lex_table.states[state_id].accept_action; + if (current_action.is_present()) { + bool prefer_advancing = conflict_manager.resolve(transition.destination, action, current_action); + bool matches_accepted_token = false; + for (const LexItem &item : transition.destination.entries) { + if (item.lhs == current_action.symbol) { + matches_accepted_token = true; + } else if (!transition.in_main_token && !item.lhs.is_built_in() && !prefer_advancing) { + shadowed_token_indices[item.lhs.index] = true; + } + } + + if (!matches_accepted_token && characters.intersects(first_separator_characters)) { + shadowed_token_indices[current_action.symbol.index] = true; + } + + if (!prefer_advancing) { + continue; + } + } + + 
action.state_index = add_lex_state(transition.destination); + lex_table.states[state_id].advance_actions[characters] = action; + } + } + + void add_accept_token_actions(const LexItemSet &item_set, LexStateId state_id) { + for (const LexItem &item : item_set.entries) { + LexItem::CompletionStatus completion_status = item.completion_status(); + if (completion_status.is_done) { + AcceptTokenAction action(item.lhs, completion_status.precedence.max, + item.lhs.is_built_in() || + grammar.variables[item.lhs.index].is_string); + + auto current_action = lex_table.states[state_id].accept_action; + if (current_action.is_present()) { + if (!conflict_manager.resolve(action, current_action)) { + continue; + } + } + + lex_table.states[state_id].accept_action = action; + } + } + } + + void mark_fragile_tokens(ParseTable *parse_table) { + for (ParseState &state : parse_table->states) { + for (auto &entry : state.terminal_entries) { + Symbol symbol = entry.first; + if (symbol.is_token()) { + auto homonyms = conflict_manager.possible_homonyms.find(symbol.index); + if (homonyms != conflict_manager.possible_homonyms.end()) + for (Symbol::Index homonym : homonyms->second) + if (state.terminal_entries.count(Symbol(homonym, Symbol::Terminal))) { + entry.second.reusable = false; + break; + } + + if (!entry.second.reusable) + continue; + + auto extensions = conflict_manager.possible_extensions.find(symbol.index); + if (extensions != conflict_manager.possible_extensions.end()) + for (Symbol::Index extension : extensions->second) + if (state.terminal_entries.count(Symbol(extension, Symbol::Terminal))) { + entry.second.depends_on_lookahead = true; + break; + } + } + } + } + } + + void remove_duplicate_lex_states(ParseTable *parse_table) { + for (LexState &state : lex_table.states) { + state.accept_action.is_string = false; + state.accept_action.precedence = 0; + } + + map replacements; + + while (true) { + map duplicates; + for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) 
{ + for (LexStateId j = 0; j < i; j++) { + if (!duplicates.count(j) && lex_table.states[j] == lex_table.states[i]) { + duplicates.insert({ i, j }); + break; + } + } + } + + if (duplicates.empty()) break; + + map new_replacements; + for (LexStateId i = 0, size = lex_table.states.size(); i < size; i++) { + LexStateId new_state_index = i; + auto duplicate = duplicates.find(i); + if (duplicate != duplicates.end()) { + new_state_index = duplicate->second; + } + + size_t prior_removed = 0; + for (const auto &duplicate : duplicates) { + if (duplicate.first >= new_state_index) break; + prior_removed++; + } + + new_state_index -= prior_removed; + new_replacements.insert({ i, new_state_index }); + replacements.insert({ i, new_state_index }); + for (auto &replacement : replacements) { + if (replacement.second == i) { + replacement.second = new_state_index; + } + } + } + + for (auto &state : lex_table.states) { + for (auto &entry : state.advance_actions) { + auto new_replacement = new_replacements.find(entry.second.state_index); + if (new_replacement != new_replacements.end()) { + entry.second.state_index = new_replacement->second; + } + } + } + + for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) { + lex_table.states.erase(lex_table.states.begin() + i->first); + } + } + + for (ParseState &parse_state : parse_table->states) { + auto replacement = replacements.find(parse_state.lex_state_id); + if (replacement != replacements.end()) { + parse_state.lex_state_id = replacement->second; + } + } + } + + LexItemSet item_set_for_terminals(const map &terminals) { + LexItemSet result; + for (const auto &pair : terminals) { + Symbol symbol = pair.first; + if (symbol.is_token()) { + for (const rule_ptr &rule : rules_for_symbol(symbol)) { + for (const rule_ptr &separator_rule : separator_rules) { + result.entries.insert(LexItem( + symbol, + Metadata::separator( + Seq::build({ + separator_rule, + Metadata::main_token(rule) })))); + } + } + } + } + return result; + } + + vector 
rules_for_symbol(const rules::Symbol &symbol) { + if (symbol == rules::END_OF_INPUT()) + return { CharacterSet().include(0).copy() }; + + rule_ptr rule = grammar.variables[symbol.index].rule; + + auto choice = rule->as(); + if (choice) + return choice->elements; + else + return { rule }; + } +}; + +unique_ptr LexTableBuilder::create(const LexicalGrammar &grammar) { + return unique_ptr(new LexTableBuilderImpl(grammar)); +} + +LexTable LexTableBuilder::build(ParseTable *parse_table) { + return static_cast(this)->build(parse_table); +} + +bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right) { + return static_cast(this)->detect_conflict(left, right); +} + +} // namespace build_tables +} // namespace tree_sitter diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h new file mode 100644 index 00000000..91f24f70 --- /dev/null +++ b/src/compiler/build_tables/lex_table_builder.h @@ -0,0 +1,26 @@ +#ifndef COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ +#define COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ + +#include +#include "compiler/lex_table.h" + +namespace tree_sitter { + +struct ParseTable; +struct LexicalGrammar; + +namespace build_tables { + +class LexTableBuilder { + public: + static std::unique_ptr create(const LexicalGrammar &); + LexTable build(ParseTable *); + bool detect_conflict(rules::Symbol::Index, rules::Symbol::Index); + protected: + LexTableBuilder() = default; +}; + +} // namespace build_tables +} // namespace tree_sitter + +#endif // COMPILER_BUILD_TABLES_LEX_TABLE_BUILDER_H_ diff --git a/src/compiler/build_tables/parse_item.h b/src/compiler/build_tables/parse_item.h index a3785638..fc3f0129 100644 --- a/src/compiler/build_tables/parse_item.h +++ b/src/compiler/build_tables/parse_item.h @@ -12,8 +12,7 @@ namespace tree_sitter { namespace build_tables { -class ParseItem { - public: +struct ParseItem { ParseItem(); ParseItem(const rules::Symbol &, const Production &, unsigned int); 
@@ -36,8 +35,7 @@ class ParseItem { unsigned int step_index; }; -class ParseItemSet { - public: +struct ParseItemSet { ParseItemSet(); explicit ParseItemSet(const std::map &); diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc deleted file mode 100644 index 84b175bc..00000000 --- a/src/compiler/build_tables/recovery_tokens.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "compiler/build_tables/recovery_tokens.h" -#include "compiler/lexical_grammar.h" -#include "compiler/rules/choice.h" -#include "compiler/rules/character_set.h" -#include "compiler/rules/repeat.h" -#include "compiler/rules/visitor.h" -#include "compiler/rules/seq.h" -#include "compiler/rules/metadata.h" - -namespace tree_sitter { -namespace build_tables { - -using rules::Symbol; -using std::set; - -template -class CharacterAggregator : public rules::RuleFn { - void apply_to(const rules::Seq *rule) { - if (left) - apply(rule->left); - if (right) - apply(rule->right); - } - - void apply_to(const rules::Choice *rule) { - for (const rule_ptr &element : rule->elements) - apply(element); - } - - void apply_to(const rules::Repeat *rule) { - apply(rule->content); - } - - void apply_to(const rules::Metadata *rule) { - apply(rule->rule); - } - - void apply_to(const rules::CharacterSet *rule) { - result.add_set(*rule); - } - - public: - rules::CharacterSet result; -}; - -class FirstCharacters : public CharacterAggregator {}; -class LastCharacters : public CharacterAggregator {}; -class AllCharacters : public CharacterAggregator {}; - -set recovery_tokens(const LexicalGrammar &grammar) { - set result; - - AllCharacters all_separator_characters; - for (const rule_ptr &separator : grammar.separators) - all_separator_characters.apply(separator); - - for (size_t i = 0; i < grammar.variables.size(); i++) { - const Variable &variable = grammar.variables[i]; - rule_ptr rule = variable.rule; - - FirstCharacters first_characters; - first_characters.apply(variable.rule); - 
- LastCharacters last_characters; - last_characters.apply(variable.rule); - - AllCharacters all_characters; - all_characters.apply(variable.rule); - - bool has_distinct_start = - !first_characters.result.includes_all && - !first_characters.result.intersects(all_separator_characters.result); - - bool has_distinct_end = - !last_characters.result.includes_all && - !last_characters.result.intersects(all_separator_characters.result); - - bool has_no_separators = - !all_characters.result.intersects(all_separator_characters.result); - - if ((has_distinct_start && has_distinct_end) || has_no_separators) - result.insert(Symbol(i, Symbol::Terminal)); - } - - return result; -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/recovery_tokens.h b/src/compiler/build_tables/recovery_tokens.h deleted file mode 100644 index c97a8cfd..00000000 --- a/src/compiler/build_tables/recovery_tokens.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ -#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ - -#include "compiler/rule.h" -#include "compiler/rules/symbol.h" -#include - -namespace tree_sitter { - -struct LexicalGrammar; - -namespace build_tables { - -std::set recovery_tokens(const LexicalGrammar &); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ diff --git a/src/compiler/build_tables/remove_duplicate_states.h b/src/compiler/build_tables/remove_duplicate_states.h deleted file mode 100644 index a154c05a..00000000 --- a/src/compiler/build_tables/remove_duplicate_states.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ -#define COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ - -#include -#include - -namespace tree_sitter { -namespace build_tables { - -template -std::map remove_duplicate_states(TableType *table) { - std::map replacements; - - while (true) { - std::map duplicates; - for (size_t 
i = 0, size = table->states.size(); i < size; i++) - for (size_t j = 0; j < i; j++) - if (!duplicates.count(j) && table->merge_state(j, i)) { - duplicates.insert({ i, j }); - break; - } - - if (duplicates.empty()) - break; - - std::map new_replacements; - for (size_t i = 0, size = table->states.size(); i < size; i++) { - size_t new_state_index = i; - auto duplicate = duplicates.find(i); - if (duplicate != duplicates.end()) - new_state_index = duplicate->second; - - size_t prior_removed = 0; - for (const auto &duplicate : duplicates) { - if (duplicate.first >= new_state_index) - break; - prior_removed++; - } - - new_state_index -= prior_removed; - new_replacements.insert({ i, new_state_index }); - replacements.insert({ i, new_state_index }); - for (auto &replacement : replacements) - if (replacement.second == i) - replacement.second = new_state_index; - } - - for (auto &state : table->states) - state.each_referenced_state([&new_replacements](int64_t *state_index) { - auto new_replacement = new_replacements.find(*state_index); - if (new_replacement != new_replacements.end()) - *state_index = new_replacement->second; - }); - - for (auto i = duplicates.rbegin(); i != duplicates.rend(); ++i) - table->states.erase(table->states.begin() + i->first); - } - - return replacements; -} - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_REMOVE_DUPLICATE_STATES_H_ diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index bc84e557..0bd01c29 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -26,8 +26,6 @@ using std::vector; using util::escape_char; using rules::Symbol; -static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr()); - static const map REPLACEMENTS({ { '~', "TILDE" }, { '`', "BQUOTE" }, @@ -561,7 +559,7 @@ class CCodeGenerator { return { variable.name, variable.type }; } case Symbol::Terminal: { - const Variable &variable = 
lexical_grammar.variables[symbol.index]; + const LexicalVariable &variable = lexical_grammar.variables[symbol.index]; return { variable.name, variable.type }; } case Symbol::External: diff --git a/src/compiler/generate_code/c_code.h b/src/compiler/generate_code/c_code.h index 4ecf4840..1e77ed0e 100644 --- a/src/compiler/generate_code/c_code.h +++ b/src/compiler/generate_code/c_code.h @@ -7,8 +7,8 @@ namespace tree_sitter { struct LexicalGrammar; struct SyntaxGrammar; -class LexTable; -class ParseTable; +struct LexTable; +struct ParseTable; namespace generate_code { diff --git a/src/compiler/lex_table.cc b/src/compiler/lex_table.cc index 8f8d2ded..74af0900 100644 --- a/src/compiler/lex_table.cc +++ b/src/compiler/lex_table.cc @@ -44,35 +44,10 @@ bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const { LexState::LexState() : is_token_start(false) {} -set LexState::expected_inputs() const { - set result; - for (auto &pair : advance_actions) - result.insert(pair.first); - return result; -} - bool LexState::operator==(const LexState &other) const { return advance_actions == other.advance_actions && accept_action == other.accept_action && is_token_start == other.is_token_start; } -void LexState::each_referenced_state(function fn) { - for (auto &entry : advance_actions) - fn(&entry.second.state_index); -} - -LexStateId LexTable::add_state() { - states.push_back(LexState()); - return states.size() - 1; -} - -LexState &LexTable::state(LexStateId id) { - return states[id]; -} - -bool LexTable::merge_state(size_t i, size_t j) { - return states[i] == states[j]; -} - } // namespace tree_sitter diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index ac7357a1..e669739e 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -13,17 +13,9 @@ namespace tree_sitter { typedef int64_t LexStateId; -typedef enum { - LexActionTypeError, - LexActionTypeAccept, - LexActionTypeAcceptFragile, - LexActionTypeAdvance -} LexActionType; - struct 
AdvanceAction { AdvanceAction(); AdvanceAction(size_t, PrecedenceRange, bool); - bool operator==(const AdvanceAction &other) const; LexStateId state_index; @@ -34,7 +26,6 @@ struct AdvanceAction { struct AcceptTokenAction { AcceptTokenAction(); AcceptTokenAction(rules::Symbol, int, bool); - bool is_present() const; bool operator==(const AcceptTokenAction &action) const; @@ -43,31 +34,17 @@ struct AcceptTokenAction { bool is_string; }; -} // namespace tree_sitter - -namespace std {} // namespace std - -namespace tree_sitter { - -class LexState { - public: +struct LexState { LexState(); - std::set expected_inputs() const; bool operator==(const LexState &) const; - void each_referenced_state(std::function); std::map advance_actions; AcceptTokenAction accept_action; bool is_token_start; }; -class LexTable { - public: - LexStateId add_state(); - LexState &state(LexStateId state_id); +struct LexTable { std::vector states; - - bool merge_state(size_t i, size_t j); }; } // namespace tree_sitter diff --git a/src/compiler/lexical_grammar.h b/src/compiler/lexical_grammar.h index 58aa54f3..456e2089 100644 --- a/src/compiler/lexical_grammar.h +++ b/src/compiler/lexical_grammar.h @@ -9,8 +9,15 @@ namespace tree_sitter { +struct LexicalVariable { + std::string name; + VariableType type; + rule_ptr rule; + bool is_string; +}; + struct LexicalGrammar { - std::vector variables; + std::vector variables; std::vector separators; }; diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index a04eec8c..57728e0f 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -148,13 +148,6 @@ bool ParseState::has_shift_action() const { return (!nonterminal_entries.empty()); } -set ParseState::expected_inputs() const { - set result; - for (auto &entry : terminal_entries) - result.insert(entry.first); - return result; -} - void ParseState::each_referenced_state(function fn) { for (auto &entry : terminal_entries) for (ParseAction &action : entry.second.actions) 
@@ -169,18 +162,6 @@ bool ParseState::operator==(const ParseState &other) const { nonterminal_entries == other.nonterminal_entries; } -set ParseTable::all_symbols() const { - set result; - for (auto &pair : symbols) - result.insert(pair.first); - return result; -} - -ParseStateId ParseTable::add_state() { - states.push_back(ParseState()); - return states.size() - 1; -} - ParseAction &ParseTable::add_terminal_action(ParseStateId state_id, Symbol lookahead, ParseAction action) { @@ -201,58 +182,4 @@ void ParseTable::set_nonterminal_action(ParseStateId state_id, states[state_id].nonterminal_entries[lookahead] = next_state_id; } -static bool has_entry(const ParseState &state, const ParseTableEntry &entry) { - for (const auto &pair : state.terminal_entries) - if (pair.second == entry) - return true; - return false; -} - -bool ParseTable::merge_state(size_t i, size_t j) { - ParseState &state = states[i]; - ParseState &other = states[j]; - - if (state.nonterminal_entries != other.nonterminal_entries) - return false; - - for (auto &entry : state.terminal_entries) { - Symbol lookahead = entry.first; - const vector &actions = entry.second.actions; - - const auto &other_entry = other.terminal_entries.find(lookahead); - if (other_entry == other.terminal_entries.end()) { - if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) - return false; - if (actions.back().type != ParseActionTypeReduce) - return false; - if (!has_entry(other, entry.second)) - return false; - } else if (entry.second != other_entry->second) { - return false; - } - } - - set symbols_to_merge; - - for (auto &entry : other.terminal_entries) { - Symbol lookahead = entry.first; - const vector &actions = entry.second.actions; - - if (!state.terminal_entries.count(lookahead)) { - if (mergeable_symbols.count(lookahead) == 0 && !lookahead.is_built_in()) - return false; - if (actions.back().type != ParseActionTypeReduce) - return false; - if (!has_entry(state, entry.second)) - return false; - 
symbols_to_merge.insert(lookahead); - } - } - - for (const Symbol &lookahead : symbols_to_merge) - state.terminal_entries[lookahead] = other.terminal_entries.find(lookahead)->second; - - return true; -} - } // namespace tree_sitter diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 79eec4fc..02501ebd 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -23,13 +23,11 @@ enum ParseActionType { ParseActionTypeRecover, }; -class ParseAction { +struct ParseAction { + ParseAction(); ParseAction(ParseActionType type, ParseStateId state_index, rules::Symbol symbol, size_t consumed_symbol_count, const Production *); - - public: - ParseAction(); static ParseAction Accept(); static ParseAction Error(); static ParseAction Shift(ParseStateId state_index); @@ -39,7 +37,6 @@ class ParseAction { static ParseAction ShiftExtra(); bool operator==(const ParseAction &) const; bool operator<(const ParseAction &) const; - rules::Associativity associativity() const; int precedence() const; @@ -47,30 +44,26 @@ class ParseAction { bool extra; bool fragile; ParseStateId state_index; - rules::Symbol symbol; size_t consumed_symbol_count; const Production *production; }; struct ParseTableEntry { - std::vector actions; - bool reusable; - bool depends_on_lookahead; - ParseTableEntry(); ParseTableEntry(const std::vector &, bool, bool); bool operator==(const ParseTableEntry &other) const; - inline bool operator!=(const ParseTableEntry &other) const { return !operator==(other); } + + std::vector actions; + bool reusable; + bool depends_on_lookahead; }; -class ParseState { - public: +struct ParseState { ParseState(); - std::set expected_inputs() const; bool operator==(const ParseState &) const; bool merge(const ParseState &); void each_referenced_state(std::function); @@ -87,18 +80,12 @@ struct ParseTableSymbolMetadata { bool structural; }; -class ParseTable { - public: - std::set all_symbols() const; - ParseStateId add_state(); +struct ParseTable { 
ParseAction &add_terminal_action(ParseStateId state_id, rules::Symbol, ParseAction); void set_nonterminal_action(ParseStateId, rules::Symbol::Index, ParseStateId); - bool merge_state(size_t i, size_t j); std::vector states; std::map symbols; - - std::set mergeable_symbols; }; } // namespace tree_sitter diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc index 331c9cea..d01bb7a0 100644 --- a/src/compiler/prepare_grammar/expand_repeats.cc +++ b/src/compiler/prepare_grammar/expand_repeats.cc @@ -41,10 +41,17 @@ class ExpandRepeats : public rules::IdentityRuleFn { string helper_rule_name = rule_name + "_repeat" + to_string(++repeat_count); Symbol repeat_symbol(offset + index, Symbol::NonTerminal); existing_repeats.push_back({ rule->copy(), repeat_symbol }); - aux_rules.push_back( - Variable(helper_rule_name, VariableTypeAuxiliary, - Choice::build({ Seq::build({ repeat_symbol.copy(), inner_rule }), - inner_rule }))); + aux_rules.push_back(Variable{ + helper_rule_name, + VariableTypeAuxiliary, + Choice::build({ + Seq::build({ + repeat_symbol.copy(), + inner_rule, + }), + inner_rule, + }) + }); return repeat_symbol.copy(); } diff --git a/src/compiler/prepare_grammar/expand_tokens.cc b/src/compiler/prepare_grammar/expand_tokens.cc index b024e27c..ff268782 100644 --- a/src/compiler/prepare_grammar/expand_tokens.cc +++ b/src/compiler/prepare_grammar/expand_tokens.cc @@ -67,11 +67,11 @@ pair expand_tokens(const LexicalGrammar &grammar) LexicalGrammar result; ExpandTokens expander; - for (const Variable &variable : grammar.variables) { + for (const LexicalVariable &variable : grammar.variables) { auto rule = expander.apply(variable.rule); if (expander.error.type) return { result, expander.error }; - result.variables.push_back(Variable(variable.name, variable.type, rule)); + result.variables.push_back({variable.name, variable.type, rule, variable.is_string}); } for (auto &sep : grammar.separators) { diff --git 
a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc index 9d161ca8..32b524e3 100644 --- a/src/compiler/prepare_grammar/extract_tokens.cc +++ b/src/compiler/prepare_grammar/extract_tokens.cc @@ -56,7 +56,7 @@ class SymbolReplacer : public rules::IdentityRuleFn { class TokenExtractor : public rules::IdentityRuleFn { using rules::IdentityRuleFn::apply_to; - rule_ptr apply_to_token(const Rule *input, VariableType entry_type) { + rule_ptr apply_to_token(const Rule *input, VariableType entry_type, bool is_string) { for (size_t i = 0; i < tokens.size(); i++) if (tokens[i].rule->operator==(*input)) { token_usage_counts[i]++; @@ -65,29 +65,30 @@ class TokenExtractor : public rules::IdentityRuleFn { rule_ptr rule = input->copy(); size_t index = tokens.size(); - tokens.push_back(Variable(token_description(rule), entry_type, rule)); + tokens.push_back({token_description(rule), entry_type, rule, is_string}); token_usage_counts.push_back(1); return make_shared(index, Symbol::Terminal); } rule_ptr apply_to(const rules::String *rule) { - return apply_to_token(rule, VariableTypeAnonymous); + return apply_to_token(rule, VariableTypeAnonymous, true); } rule_ptr apply_to(const rules::Pattern *rule) { - return apply_to_token(rule, VariableTypeAuxiliary); + return apply_to_token(rule, VariableTypeAuxiliary, false); } rule_ptr apply_to(const rules::Metadata *rule) { - if (rule->params.is_token) - return apply_to_token(rule->rule.get(), VariableTypeAuxiliary); - else + if (rule->params.is_token) { + return apply_to_token(rule->rule.get(), VariableTypeAuxiliary, false); + } else { return rules::IdentityRuleFn::apply_to(rule); + } } public: vector token_usage_counts; - vector tokens; + vector tokens; }; static CompileError extra_token_error(const string &message) { @@ -106,8 +107,11 @@ tuple extract_tokens( */ vector processed_variables; for (const Variable &variable : grammar.variables) - processed_variables.push_back( - Variable(variable.name, 
variable.type, extractor.apply(variable.rule))); + processed_variables.push_back(Variable{ + variable.name, + variable.type, + extractor.apply(variable.rule) + }); lexical_grammar.variables = extractor.tokens; /* @@ -139,8 +143,9 @@ tuple extract_tokens( for (const ConflictSet &conflict_set : grammar.expected_conflicts) { ConflictSet new_conflict_set; - for (const Symbol &symbol : conflict_set) + for (const Symbol &symbol : conflict_set) { new_conflict_set.insert(symbol_replacer.replace_symbol(symbol)); + } syntax_grammar.expected_conflicts.insert(new_conflict_set); } @@ -154,7 +159,7 @@ tuple extract_tokens( for (const rule_ptr &rule : grammar.extra_tokens) { int i = 0; bool used_elsewhere_in_grammar = false; - for (const Variable &variable : lexical_grammar.variables) { + for (const LexicalVariable &variable : lexical_grammar.variables) { if (variable.rule->operator==(*rule)) { syntax_grammar.extra_tokens.insert(Symbol(i, Symbol::Terminal)); used_elsewhere_in_grammar = true; @@ -171,9 +176,10 @@ tuple extract_tokens( } auto symbol = rule->as(); - if (!symbol) + if (!symbol) { return make_tuple(syntax_grammar, lexical_grammar, extra_token_error(rule->to_string())); + } Symbol new_symbol = symbol_replacer.replace_symbol(*symbol); if (new_symbol.is_non_terminal()) { diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc index 8ac0e33c..fe49c7a3 100644 --- a/src/compiler/prepare_grammar/flatten_grammar.cc +++ b/src/compiler/prepare_grammar/flatten_grammar.cc @@ -25,8 +25,11 @@ class FlattenRule : public rules::RuleFn { Production production; void apply_to(const rules::Symbol *sym) { - production.push_back(ProductionStep(*sym, precedence_stack.back(), - associativity_stack.back())); + production.push_back(ProductionStep{ + *sym, + precedence_stack.back(), + associativity_stack.back() + }); } void apply_to(const rules::Metadata *metadata) { @@ -85,7 +88,7 @@ SyntaxVariable flatten_rule(const Variable &variable) { 
} } - return SyntaxVariable(variable.name, variable.type, productions); + return SyntaxVariable{variable.name, variable.type, productions}; } pair flatten_grammar(const InitialSyntaxGrammar &grammar) { diff --git a/src/compiler/prepare_grammar/normalize_rules.cc b/src/compiler/prepare_grammar/normalize_rules.cc index 0e1da9fd..2e4dd205 100644 --- a/src/compiler/prepare_grammar/normalize_rules.cc +++ b/src/compiler/prepare_grammar/normalize_rules.cc @@ -8,7 +8,7 @@ namespace prepare_grammar { LexicalGrammar normalize_rules(const LexicalGrammar &input_grammar) { LexicalGrammar result(input_grammar); - for (Variable &variable : result.variables) { + for (LexicalVariable &variable : result.variables) { variable.rule = rules::Choice::build(extract_choices(variable.rule)); } diff --git a/src/compiler/syntax_grammar.cc b/src/compiler/syntax_grammar.cc index aa3074e8..254e1a34 100644 --- a/src/compiler/syntax_grammar.cc +++ b/src/compiler/syntax_grammar.cc @@ -7,20 +7,6 @@ namespace tree_sitter { -using std::string; -using std::to_string; -using std::pair; -using std::vector; -using std::set; - -SyntaxVariable::SyntaxVariable(const string &name, VariableType type, - const vector &productions) - : name(name), productions(productions), type(type) {} - -ProductionStep::ProductionStep(const rules::Symbol &symbol, int precedence, - rules::Associativity associativity) - : symbol(symbol), precedence(precedence), associativity(associativity) {} - bool ExternalToken::operator==(const ExternalToken &other) const { return name == other.name && type == other.type && corresponding_internal_token == other.corresponding_internal_token; diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h index e3af8f28..9d154884 100644 --- a/src/compiler/syntax_grammar.h +++ b/src/compiler/syntax_grammar.h @@ -11,15 +11,14 @@ namespace tree_sitter { struct ExternalToken { + bool operator==(const ExternalToken &) const; + std::string name; VariableType type; rules::Symbol 
corresponding_internal_token; - - bool operator==(const ExternalToken &) const; }; struct ProductionStep { - ProductionStep(const rules::Symbol &, int, rules::Associativity); bool operator==(const ProductionStep &) const; rules::Symbol symbol; @@ -30,12 +29,9 @@ struct ProductionStep { typedef std::vector Production; struct SyntaxVariable { - SyntaxVariable(const std::string &, VariableType, - const std::vector &); - std::string name; - std::vector productions; VariableType type; + std::vector productions; }; typedef std::set ConflictSet; diff --git a/src/compiler/variable.cc b/src/compiler/variable.cc deleted file mode 100644 index 313f1d21..00000000 --- a/src/compiler/variable.cc +++ /dev/null @@ -1,11 +0,0 @@ -#include "compiler/variable.h" -#include - -namespace tree_sitter { - -using std::string; - -Variable::Variable(const string &name, VariableType type, const rule_ptr &rule) - : name(name), rule(rule), type(type) {} - -} // namespace tree_sitter diff --git a/src/compiler/variable.h b/src/compiler/variable.h index 707619f0..823852ff 100644 --- a/src/compiler/variable.h +++ b/src/compiler/variable.h @@ -15,11 +15,9 @@ enum VariableType { }; struct Variable { - Variable(const std::string &, VariableType, const rule_ptr &); - std::string name; - rule_ptr rule; VariableType type; + rule_ptr rule; }; } // namespace tree_sitter