Refine logic for which tokens to use in error recovery

2016-04-27 14:09:19 -07:00 · 2016-04-27 14:09:19 -07:00 · 5b74813a5c
commit 5b74813a5c
parent 31f6b2e24a
9 changed files with 159 additions and 114 deletions
--- a/project.gyp
+++ b/project.gyp
@ -14,7 +14,7 @@
        'src/compiler/build_tables/build_lex_table.cc',
        'src/compiler/build_tables/build_parse_table.cc',
        'src/compiler/build_tables/build_tables.cc',
-        'src/compiler/build_tables/does_match_any_line.cc',
+        'src/compiler/build_tables/recovery_tokens.cc',
        'src/compiler/build_tables/item_set_closure.cc',
        'src/compiler/build_tables/lex_item.cc',
        'src/compiler/build_tables/lex_item_transitions.cc',
--- a/spec/compiler/build_tables/distinctive_tokens_spec.cc
+++ b/spec/compiler/build_tables/distinctive_tokens_spec.cc
@ -0,0 +1,36 @@
+#include "spec_helper.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/lexical_grammar.h"
+#include "helpers/rule_helpers.h"
+#include "helpers/stream_methods.h"
+#include "compiler/rules.h"
+
+using namespace rules;
+using namespace build_tables;
+
+START_TEST
+
+// Specs for build_tables::recovery_tokens(), which inspects a whole
+// LexicalGrammar (its separators and its token variables) and returns the
+// symbols of the tokens it selects.
+describe("recovery_tokens(grammar)", []() {
+  it("includes rules that can only begin and end with an explicit set of characters", [&]() {
+    LexicalGrammar grammar;
+
+    // A single space is the only separator character in this grammar.
+    grammar.separators = {
+      character({ ' ' }),
+    };
+
+    grammar.variables = {
+      // Index 0. NOTE(review): character({}, false) presumably builds a
+      // negated empty set, i.e. "any character" -- confirm in rule_helpers.
+      // Such a token has no explicit start/end and can overlap a separator.
+      Variable("var0", VariableTypeNamed, character({}, false)),
+
+      // Index 1. Begins with an explicit set ('a'/'b') and ends with an
+      // explicit set ('c'/'d'); only the middle character is unrestricted.
+      Variable("var1", VariableTypeNamed, seq({
+        character({ 'a', 'b' }),
+        character({}, false),
+        character({ 'c', 'd' }),
+      })),
+    };
+
+    // Only var1 (lexical symbol 1) is expected to be selected.
+    AssertThat(recovery_tokens(grammar), Equals<vector<Symbol>>({
+      Symbol(1, true),
+    }));
+  });
+});
+
+END_TEST
--- a/spec/compiler/build_tables/does_match_any_line_spec.cc
+++ b/spec/compiler/build_tables/does_match_any_line_spec.cc
@ -1,43 +0,0 @@
-#include "spec_helper.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/build_tables/does_match_any_line.h"
-#include "helpers/rule_helpers.h"
-#include "compiler/rules.h"
-
-using namespace rules;
-using namespace build_tables;
-
-START_TEST
-
-describe("does_match_any_line(rule)", []() {
-  it("returns true for rules that match any sequence of characters on a line", [&]() {
-    rule_ptr rule = character({}, false);
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = repeat(character({}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = repeat(character({}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = choice({ repeat(character({}, false)), str("x") });
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = repeat(choice({ character({}, false), str("x") }));
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = choice({ str("y"), str("x") });
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = seq({ repeat(character({}, false)), repeat(character({}, false)) });
-    AssertThat(does_match_any_line(rule), IsTrue());
-
-    rule = seq({ repeat(character({}, false)), str("x") });
-    AssertThat(does_match_any_line(rule), IsFalse());
-
-    rule = repeat(character({0, '\n'}, false));
-    AssertThat(does_match_any_line(rule), IsTrue());
-  });
-});
-
-END_TEST
--- a/src/compiler/build_tables/build_parse_table.cc
+++ b/src/compiler/build_tables/build_parse_table.cc
@ -15,7 +15,7 @@
 #include "compiler/syntax_grammar.h"
 #include "compiler/rules/symbol.h"
 #include "compiler/rules/built_in_symbols.h"
-#include "compiler/build_tables/does_match_any_line.h"
+#include "compiler/build_tables/recovery_tokens.h"

 namespace tree_sitter {
 namespace build_tables {
@ -108,10 +108,8 @@ class ParseTableBuilder {
  void add_out_of_context_parse_states() {
    auto symbols_by_first = symbols_by_first_symbol(grammar);

-    for (size_t i = 0; i < lexical_grammar.variables.size(); i++) {
-      Symbol symbol(i, true);
-      if (!does_match_any_line(lexical_grammar.variables[i].rule))
-        add_out_of_context_parse_state(symbol, symbols_by_first[symbol]);
+    for (const Symbol &symbol : recovery_tokens(lexical_grammar)) {
+      add_out_of_context_parse_state(symbol, symbols_by_first[symbol]);
    }

    for (size_t i = 0; i < grammar.variables.size(); i++) {
--- a/src/compiler/build_tables/does_match_any_line.cc
+++ b/src/compiler/build_tables/does_match_any_line.cc
@ -1,62 +0,0 @@
-#include "compiler/build_tables/does_match_any_line.h"
-#include "compiler/rules/choice.h"
-#include "compiler/rules/character_set.h"
-#include "compiler/rules/repeat.h"
-#include "compiler/rules/visitor.h"
-#include "compiler/rules/seq.h"
-#include "compiler/rules/metadata.h"
-
-namespace tree_sitter {
-namespace build_tables {
-
-class DoesTokenCatchAnyCharacter : public rules::RuleFn<bool> {
-  bool apply_to(const rules::Choice *rule) {
-    for (const rule_ptr &element : rule->elements)
-      if (apply(element))
-        return true;
-    return false;
-  }
-
-  bool apply_to(const rules::Metadata *rule) {
-    return apply(rule->rule);
-  }
-
-  bool apply_to(const rules::CharacterSet *rule) {
-    if (rule->includes_all) {
-      for (uint32_t character : rule->excluded_chars) {
-        if (character != 0 && character != '\n')
-          return false;
-      }
-      return true;
-    }
-    return false;
-  }
-};
-
-class DoesTokenCatchAll : public rules::RuleFn<bool> {
-  bool apply_to(const rules::Repeat *rule) {
-    return DoesTokenCatchAnyCharacter().apply(rule->content);
-  }
-
-  bool apply_to(const rules::Metadata *rule) {
-    return apply(rule->rule);
-  }
-
-  bool apply_to(const rules::Choice *rule) {
-    for (const rule_ptr &element : rule->elements)
-      if (apply(element))
-        return true;
-    return false;
-  }
-
-  bool apply_to(const rules::Seq *rule) {
-    return apply(rule->left) && apply(rule->right);
-  }
-};
-
-bool does_match_any_line(const rule_ptr &rule) {
-  return DoesTokenCatchAll().apply(rule);
-}
-
-}  // namespace build_tables
-}  // namespace tree_sitter
--- a/src/compiler/build_tables/recovery_tokens.cc
+++ b/src/compiler/build_tables/recovery_tokens.cc
@ -0,0 +1,89 @@
+#include "compiler/build_tables/recovery_tokens.h"
+#include "compiler/lexical_grammar.h"
+#include "compiler/rules/choice.h"
+#include "compiler/rules/character_set.h"
+#include "compiler/rules/repeat.h"
+#include "compiler/rules/visitor.h"
+#include "compiler/rules/seq.h"
+#include "compiler/rules/metadata.h"
+
+namespace tree_sitter {
+namespace build_tables {
+
+using rules::Symbol;
+using std::vector;
+
+// Rule visitor that unions every CharacterSet it reaches into `result`.
+//
+// The template flags select which side(s) of a Seq are followed:
+//   left  -- descend into rule->left
+//   right -- descend into rule->right
+// Choice, Repeat and Metadata rules are always descended into completely.
+template <bool left, bool right>
+class CharacterAggregator : public rules::RuleFn<void> {
+ public:
+  // Union of all character sets visited so far.
+  rules::CharacterSet result;
+
+ private:
+  void apply_to(const rules::CharacterSet *character_set) {
+    result.add_set(*character_set);
+  }
+
+  void apply_to(const rules::Metadata *metadata) {
+    apply(metadata->rule);
+  }
+
+  void apply_to(const rules::Repeat *repeat) {
+    apply(repeat->content);
+  }
+
+  void apply_to(const rules::Choice *choice) {
+    for (const rule_ptr &alternative : choice->elements) {
+      apply(alternative);
+    }
+  }
+
+  void apply_to(const rules::Seq *sequence) {
+    if (left) {
+      apply(sequence->left);
+    }
+    if (right) {
+      apply(sequence->right);
+    }
+  }
+};
+
+// Named instantiations of CharacterAggregator:
+//   FirstCharacters -- follows only the left side of each Seq
+//   LastCharacters  -- follows only the right side of each Seq
+//   AllCharacters   -- follows both sides of each Seq
+using FirstCharacters = CharacterAggregator<true, false>;
+using LastCharacters = CharacterAggregator<false, true>;
+using AllCharacters = CharacterAggregator<true, true>;
+
+vector<Symbol> recovery_tokens(const LexicalGrammar &grammar) {
+  vector<Symbol> result;
+
+  AllCharacters all_separator_characters;
+  for (const rule_ptr &separator : grammar.separators)
+    all_separator_characters.apply(separator);
+
+  for (size_t i = 0; i < grammar.variables.size(); i++) {
+    const Variable &variable = grammar.variables[i];
+    rule_ptr rule = variable.rule;
+
+    FirstCharacters first_characters;
+    first_characters.apply(variable.rule);
+
+    LastCharacters last_characters;
+    last_characters.apply(variable.rule);
+
+    AllCharacters all_characters;
+    all_characters.apply(variable.rule);
+
+    bool has_distinct_start =
+      !first_characters.result.includes_all &&
+      !first_characters.result.intersects(all_separator_characters.result);
+
+    bool has_distinct_end =
+      !last_characters.result.includes_all &&
+      !last_characters.result.intersects(all_separator_characters.result);
+
+    bool has_no_separators =
+      !all_characters.result.intersects(all_separator_characters.result);
+
+    if ((has_distinct_start && has_distinct_end) || has_no_separators)
+      result.push_back(Symbol(i, true));
+  }
+
+  return result;
+}
+
+}  // namespace build_tables
+}  // namespace tree_sitter
--- a/src/compiler/build_tables/does_match_any_line.h
+++ b/src/compiler/build_tables/recovery_tokens.h
@ -2,11 +2,16 @@
 #define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_

 #include "compiler/rule.h"
+#include "compiler/rules/symbol.h"
+#include <vector>

 namespace tree_sitter {
+
+struct LexicalGrammar;
+
 namespace build_tables {

-bool does_match_any_line(const rule_ptr &);
+std::vector<rules::Symbol> recovery_tokens(const LexicalGrammar &);

 }  // namespace build_tables
 }  // namespace tree_sitter
--- a/src/compiler/rules/character_set.cc
+++ b/src/compiler/rules/character_set.cc
@ -153,8 +153,24 @@ bool CharacterSet::is_empty() const {
 }

+// Adds the characters of `other` to this set, in place.
+//
+// The branches below treat a set with `includes_all` as "every character
+// except `excluded_chars`" and a set without it as the explicit list
+// `included_chars`; each pairing of the two forms is handled separately.
 void CharacterSet::add_set(const CharacterSet &other) {
-  for (uint32_t c : other.included_chars)
-    included_chars.insert(c);
+  if (includes_all) {
+    if (other.includes_all) {
+      // Both sets are "all except ...". NOTE(review): this assumes
+      // remove_chars() returns the characters it actually removed, so that
+      // only characters excluded by BOTH sets stay excluded -- confirm
+      // against remove_chars()'s definition.
+      excluded_chars = remove_chars(&excluded_chars, other.excluded_chars);
+    } else {
+      // Characters explicitly listed by `other` must no longer be excluded.
+      remove_chars(&excluded_chars, other.included_chars);
+    }
+  } else {
+    if (other.includes_all) {
+      // Switch this set to the "all except ..." form. Of other's
+      // exclusions, keep only those this set did not explicitly include.
+      includes_all = true;
+      for (uint32_t c : other.excluded_chars)
+        if (!included_chars.count(c))
+          excluded_chars.insert(c);
+      // The explicit list is dropped once the set is in "all except" form.
+      included_chars.clear();
+    } else {
+      // Both sets are explicit lists: plain union.
+      for (uint32_t c : other.included_chars)
+        included_chars.insert(c);
+    }
+  }
 }

 CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
@ -182,6 +198,11 @@ CharacterSet CharacterSet::remove_set(const CharacterSet &other) {
  return result;
 }

+// Reports whether removing `other` from this set takes anything out.
+// remove_set() is not a const member, so it is run on a scratch copy and
+// this set is left untouched.
+// NOTE(review): assumes remove_set() returns the characters it removed,
+// which makes a non-empty result mean the two sets overlap -- confirm.
+bool CharacterSet::intersects(const CharacterSet &other) const {
+  CharacterSet scratch(*this);
+  const CharacterSet removed = scratch.remove_set(other);
+  return !removed.is_empty();
+}
+
 vector<CharacterRange> CharacterSet::included_ranges() const {
  return consolidate_ranges(included_chars);
 }
--- a/src/compiler/rules/character_set.h
+++ b/src/compiler/rules/character_set.h
@ -31,6 +31,7 @@ class CharacterSet : public Rule {

  void add_set(const CharacterSet &other);
  CharacterSet remove_set(const CharacterSet &other);
+  bool intersects(const CharacterSet &other) const;
  bool is_empty() const;

  std::vector<CharacterRange> included_ranges() const;