Replace does_match_any_line() with recovery_tokens()

Summary of what this patch does:

  * Removes build_tables::does_match_any_line(const rule_ptr &) together with
    its spec, and adds build_tables::recovery_tokens(const LexicalGrammar &),
    which returns a vector of Symbol(i, true) for the lexical variables that
    qualify.
  * A lexical variable qualifies when either
      - the characters it can start with and the characters it can end with
        are both explicit sets (not `includes_all`) that do not intersect the
        characters used by the grammar's separators, or
      - none of the characters appearing anywhere in its rule intersect the
        separator characters.
  * ParseTableBuilder::add_out_of_context_parse_states() now iterates over
    recovery_tokens(lexical_grammar) instead of filtering every lexical
    variable with does_match_any_line().
  * CharacterSet::add_set() now handles the cases where either operand has
    `includes_all` set, and CharacterSet::intersects() is added.

Review notes (not changed by this patch; confirm before acting on them):

  * NOTE(review): recovery_tokens.h keeps the include guard macro
    COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_ from the file it was renamed
    from; consider renaming the guard in a follow-up.
  * NOTE(review): the local `rule_ptr rule = variable.rule;` in
    recovery_tokens() is never read; the code uses `variable.rule` directly.
  * NOTE(review): for a Seq, FirstCharacters only visits the left operand and
    LastCharacters only the right one. This presumably assumes neither operand
    can match the empty string - TODO confirm.
  * NOTE(review): the new spec lives in distinctive_tokens_spec.cc and is
    labelled "recovery_tokens(rule)" although the function takes a
    LexicalGrammar.
  * NOTE(review): leading indentation and the position of blank context lines
    in the hunks below follow the project's usual 2-space style; verify them
    against the tree before applying with strict whitespace matching.

diff --git a/project.gyp b/project.gyp
index 5751ec8d..74327d98 100644
--- a/project.gyp
+++ b/project.gyp
@@ -14,7 +14,7 @@
         'src/compiler/build_tables/build_lex_table.cc',
         'src/compiler/build_tables/build_parse_table.cc',
         'src/compiler/build_tables/build_tables.cc',
-        'src/compiler/build_tables/does_match_any_line.cc',
+        'src/compiler/build_tables/recovery_tokens.cc',
         'src/compiler/build_tables/item_set_closure.cc',
         'src/compiler/build_tables/lex_item.cc',
         'src/compiler/build_tables/lex_item_transitions.cc',
diff --git a/spec/compiler/build_tables/distinctive_tokens_spec.cc b/spec/compiler/build_tables/distinctive_tokens_spec.cc
new file mode 100644
index 00000000..1c9d8794
--- /dev/null
+++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc
@@ -0,0 +1,36 @@
+#include "spec_helper.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/lexical_grammar.h"
+#include "helpers/rule_helpers.h"
+#include "helpers/stream_methods.h"
+#include "compiler/rules.h"
+
+using namespace rules;
+using namespace build_tables;
+
+START_TEST
+
+describe("recovery_tokens(rule)", []() {
+  it("includes rules that can only begin and end with an explicit set of characters", [&]() {
+    LexicalGrammar grammar;
+    grammar.separators = {
+      character({ ' ' }),
+    };
+
+    grammar.variables = {
+      Variable("var0", VariableTypeNamed, character({}, false)),
+      Variable("var1", VariableTypeNamed, seq({
+        character({ 'a', 'b' }),
+        character({}, false),
+        character({ 'c', 'd' }),
+      })),
+    };
+
+    AssertThat(recovery_tokens(grammar), Equals<vector<Symbol>>({
+      Symbol(1, true),
+    }));
+  });
+});
+
+END_TEST
diff --git a/spec/compiler/build_tables/does_match_any_line_spec.cc b/spec/compiler/build_tables/does_match_any_line_spec.cc
deleted file mode 100644
index 42793441..00000000
--- a/spec/compiler/build_tables/does_match_any_line_spec.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "spec_helper.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/build_tables/does_match_any_line.h"
-#include "helpers/rule_helpers.h"
-#include "compiler/rules.h"
-
-using namespace rules;
-using namespace build_tables;
-
-START_TEST
-
-describe("does_match_any_line(rule)", []() {
-  it("returns true for rules that match any sequence of characters on a line", [&]() {
-    rule_ptr rule = character({}, false);
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = repeat(character({}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = repeat(character({}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = choice({ repeat(character({}, false)), str("x") });
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = repeat(choice({ character({}, false), str("x") }));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = choice({ str("y"), str("x") });
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = seq({ repeat(character({}, false)), repeat(character({}, false)) });
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = seq({ repeat(character({}, false)), str("x") });
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = repeat(character({0, '\n'}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-  });
-});
-
-END_TEST
diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc
index ff3d945c..33ee970b 100644
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@@ -15,7 +15,7 @@
 #include "compiler/syntax_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/built_in_symbols.h"
-#include "compiler/build_tables/does_match_any_line.h"
+#include "compiler/build_tables/recovery_tokens.h"
 
 namespace tree_sitter {
 namespace build_tables {
@@ -108,10 +108,8 @@ class ParseTableBuilder {
 
   void add_out_of_context_parse_states() {
     auto symbols_by_first = symbols_by_first_symbol(grammar);
-    for (size_t i = 0; i < lexical_grammar.variables.size(); i++) {
-      Symbol symbol(i, true);
-      if (!does_match_any_line(lexical_grammar.variables[i].rule))
-        add_out_of_context_parse_state(symbol, symbols_by_first[symbol]);
+    for (const Symbol &symbol : recovery_tokens(lexical_grammar)) {
+      add_out_of_context_parse_state(symbol, symbols_by_first[symbol]);
     }
 
     for (size_t i = 0; i < grammar.variables.size(); i++) {
diff --git a/src/compiler/build_tables/does_match_any_line.cc b/src/compiler/build_tables/does_match_any_line.cc
deleted file mode 100644
index 53eb6251..00000000
--- a/src/compiler/build_tables/does_match_any_line.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-#include "compiler/build_tables/does_match_any_line.h"
-#include "compiler/rules/choice.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/rules/repeat.h"
-#include "compiler/rules/visitor.h"
-#include "compiler/rules/seq.h"
-#include "compiler/rules/metadata.h"
-
-namespace tree_sitter {
-namespace build_tables {
-
-class DoesTokenCatchAnyCharacter : public rules::RuleFn<bool> {
-  bool apply_to(const rules::Choice *rule) {
-    for (const rule_ptr &element : rule->elements)
-      if (apply(element))
-        return true;
-    return false;
-  }
-
-  bool apply_to(const rules::Metadata *rule) {
-    return apply(rule->rule);
-  }
-
-  bool apply_to(const rules::CharacterSet *rule) {
-    if (rule->includes_all) {
-      for (uint32_t character : rule->excluded_chars) {
-        if (character != 0 && character != '\n')
-          return false;
-      }
-      return true;
-    }
-    return false;
-  }
-};
-
-class DoesTokenCatchAll : public rules::RuleFn<bool> {
-  bool apply_to(const rules::Repeat *rule) {
-    return DoesTokenCatchAnyCharacter().apply(rule->content);
-  }
-
-  bool apply_to(const rules::Metadata *rule) {
-    return apply(rule->rule);
-  }
-
-  bool apply_to(const rules::Choice *rule) {
-    for (const rule_ptr &element : rule->elements)
-      if (apply(element))
-        return true;
-    return false;
-  }
-
-  bool apply_to(const rules::Seq *rule) {
-    return apply(rule->left) && apply(rule->right);
-  }
-};
-
-bool does_match_any_line(const rule_ptr &rule) {
-  return DoesTokenCatchAll().apply(rule);
-}
-
-}  // namespace build_tables
-}  // namespace tree_sitter
diff --git a/src/compiler/build_tables/recovery_tokens.cc b/src/compiler/build_tables/recovery_tokens.cc
new file mode 100644
index 00000000..0aacb7c3
--- /dev/null
+++ b/src/compiler/build_tables/recovery_tokens.cc
@@ -0,0 +1,89 @@
+#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/rules/choice.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/rules/repeat.h"
+#include "compiler/rules/visitor.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/metadata.h"
+
+namespace tree_sitter {
+namespace build_tables {
+
+using rules::Symbol;
+using std::vector;
+
+template <bool left, bool right>
+class CharacterAggregator : public rules::RuleFn<void> {
+  void apply_to(const rules::Seq *rule) {
+    if (left)
+      apply(rule->left);
+    if (right)
+      apply(rule->right);
+  }
+
+  void apply_to(const rules::Choice *rule) {
+    for (const rule_ptr &element : rule->elements)
+      apply(element);
+  }
+
+  void apply_to(const rules::Repeat *rule) {
+    apply(rule->content);
+  }
+
+  void apply_to(const rules::Metadata *rule) {
+    apply(rule->rule);
+  }
+
+  void apply_to(const rules::CharacterSet *rule) {
+    result.add_set(*rule);
+  }
+
+ public:
+  rules::CharacterSet result;
+};
+
+class FirstCharacters : public CharacterAggregator<true, false> {};
+class LastCharacters : public CharacterAggregator<false, true> {};
+class AllCharacters : public CharacterAggregator<true, true> {};
+
+vector<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
+  vector<Symbol> result;
+
+  AllCharacters all_separator_characters;
+  for (const rule_ptr &separator : grammar.separators)
+    all_separator_characters.apply(separator);
+
+  for (size_t i = 0; i < grammar.variables.size(); i++) {
+    const Variable &variable = grammar.variables[i];
+    rule_ptr rule = variable.rule;
+
+    FirstCharacters first_characters;
+    first_characters.apply(variable.rule);
+
+    LastCharacters last_characters;
+    last_characters.apply(variable.rule);
+
+    AllCharacters all_characters;
+    all_characters.apply(variable.rule);
+
+    bool has_distinct_start =
+      !first_characters.result.includes_all &&
+      !first_characters.result.intersects(all_separator_characters.result);
+
+    bool has_distinct_end =
+      !last_characters.result.includes_all &&
+      !last_characters.result.intersects(all_separator_characters.result);
+
+    bool has_no_separators =
+      !all_characters.result.intersects(all_separator_characters.result);
+
+    if ((has_distinct_start && has_distinct_end) || has_no_separators)
+      result.push_back(Symbol(i, true));
+  }
+
+  return result;
+}
+
+}  // namespace build_tables
+}  // namespace tree_sitter
diff --git a/src/compiler/build_tables/does_match_any_line.h b/src/compiler/build_tables/recovery_tokens.h
similarity index 67%
rename from src/compiler/build_tables/does_match_any_line.h
rename to src/compiler/build_tables/recovery_tokens.h
index cfc5ed8d..db477d76 100644
--- a/src/compiler/build_tables/does_match_any_line.h
+++ b/src/compiler/build_tables/recovery_tokens.h
@@ -2,11 +2,16 @@
 #define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
 
 #include "compiler/rule.h"
+#include "compiler/rules/symbol.h"
+#include <vector>
 
 namespace tree_sitter {
+
+struct LexicalGrammar;
+
 namespace build_tables {
 
-bool does_match_any_line(const rule_ptr &);
+std::vector<rules::Symbol> recovery_tokens(const LexicalGrammar &);
 
 }  // namespace build_tables
 }  // namespace tree_sitter
diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc
index f5618a07..9c273575 100644
--- a/src/compiler/rules/character_set.cc
+++ b/src/compiler/rules/character_set.cc
@@ -153,8 +153,24 @@ bool CharacterSet::is_empty() const {
 }
 
 void CharacterSet::add_set(const CharacterSet &other) {
-  for (uint32_t c : other.included_chars)
-    included_chars.insert(c);
+  if (includes_all) {
+    if (other.includes_all) {
+      excluded_chars = remove_chars(&excluded_chars, other.excluded_chars);
+    } else {
+      remove_chars(&excluded_chars, other.included_chars);
+    }
+  } else {
+    if (other.includes_all) {
+      includes_all = true;
+      for (uint32_t c : other.excluded_chars)
+        if (!included_chars.count(c))
+          excluded_chars.insert(c);
+      included_chars.clear();
+    } else {
+      for (uint32_t c : other.included_chars)
+        included_chars.insert(c);
+    }
+  }
 }
 
 CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
@@ -182,6 +198,11 @@ CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
   return result;
 }
 
+bool CharacterSet::intersects(const CharacterSet &other) const {
+  CharacterSet copy(*this);
+  return !copy.remove_set(other).is_empty();
+}
+
 vector<CharacterRange> CharacterSet::included_ranges() const {
   return consolidate_ranges(included_chars);
 }
diff --git a/src/compiler/rules/character_set.h b/src/compiler/rules/character_set.h
index f1469d72..ced343b0 100644
--- a/src/compiler/rules/character_set.h
+++ b/src/compiler/rules/character_set.h
@@ -31,6 +31,7 @@ class CharacterSet : public Rule {
 
   void add_set(const CharacterSet &other);
   CharacterSet remove_set(const CharacterSet &other);
+  bool intersects(const CharacterSet &other) const;
   bool is_empty() const;
 
   std::vector<CharacterRange> included_ranges() const;