Make ts_compile_grammar take an optional log file, start logging to it

This commit is contained in:
Max Brunsfeld 2018-05-24 16:01:14 -07:00
parent 69d8c6f5e6
commit 6fca8f2f4d
13 changed files with 164 additions and 25 deletions

View file

@ -5,6 +5,8 @@
extern "C" {
#endif
#include <stdio.h>
typedef enum {
TSCompileErrorTypeNone,
TSCompileErrorTypeInvalidGrammar,
@ -25,7 +27,7 @@ typedef struct {
TSCompileErrorType error_type;
} TSCompileResult;
TSCompileResult ts_compile_grammar(const char *input);
TSCompileResult ts_compile_grammar(const char *input, FILE *log_file);
#ifdef __cplusplus
}

View file

@ -22,6 +22,7 @@
'src/compiler/compile.cc',
'src/compiler/generate_code/c_code.cc',
'src/compiler/lex_table.cc',
'src/compiler/log.cc',
'src/compiler/parse_grammar.cc',
'src/compiler/parse_table.cc',
'src/compiler/precedence_range.cc',

View file

@ -9,9 +9,11 @@
#include <vector>
#include "compiler/build_tables/lex_item.h"
#include "compiler/build_tables/lookahead_set.h"
#include "compiler/parse_table.h"
#include "compiler/lexical_grammar.h"
#include "compiler/log.h"
#include "compiler/parse_table.h"
#include "compiler/rule.h"
#include "utf8proc.h"
namespace tree_sitter {
namespace build_tables {
@ -78,6 +80,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
bool conflict_detection_mode;
LookaheadSet keyword_symbols;
Symbol keyword_capture_token;
char encoding_buffer[8];
public:
LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar,
@ -151,6 +154,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
// For each pair of tokens, generate a lex table for just those two tokens and record what
// conflicts arise.
LOG_START("detecting conflicts between tokens");
conflict_detection_mode = true;
for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
for (Symbol::Index j = 0; j < i; j++) {
@ -165,6 +169,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
}
}
LOG_END();
// Find a 'keyword capture token' that matches all of the indentified keywords.
for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
@ -304,9 +309,33 @@ class LexTableBuilderImpl : public LexTableBuilder {
if (prefer_advancing && !next_item_set_can_yield_this_token) {
auto advance_symbol = transition.destination.entries.begin()->lhs;
if (characters.intersects(following_characters_by_token[accept_action.symbol.index]) ||
characters.intersects(separator_start_characters)) {
record_conflict(accept_action.symbol, advance_symbol, MatchesLongerStringWithValidNextChar);
auto &following_chars = following_characters_by_token[accept_action.symbol.index];
CharacterSet conflicting_following_chars = characters.intersection(following_chars);
CharacterSet conflicting_sep_chars = characters.intersection(separator_start_characters);
if (!conflicting_following_chars.is_empty()) {
LOG(
"%s shadows %s followed by '%s'",
token_name(advance_symbol).c_str(),
token_name(accept_action.symbol).c_str(),
log_char(*conflicting_following_chars.included_chars.begin())
);
record_conflict(
accept_action.symbol,
advance_symbol,
MatchesLongerStringWithValidNextChar
);
} else if (!conflicting_sep_chars.is_empty()) {
LOG(
"%s shadows %s followed by '%s'",
token_name(advance_symbol).c_str(),
token_name(accept_action.symbol).c_str(),
log_char(*conflicting_sep_chars.included_chars.begin())
);
record_conflict(
accept_action.symbol,
advance_symbol,
MatchesLongerStringWithValidNextChar
);
} else {
record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString);
}
@ -508,8 +537,22 @@ class LexTableBuilderImpl : public LexTableBuilder {
main_lex_state_ids.clear();
}
const string &token_name(rules::Symbol &symbol) {
return grammar.variables[symbol.index].name;
// Human-readable name for a token, used in log messages: named tokens are
// shown as-is, while anonymous (literal) tokens are wrapped in single
// quotes — which is why this now returns a string by value rather than a
// reference into the grammar.
string token_name(rules::Symbol &symbol) {
  const LexicalVariable &variable = grammar.variables[symbol.index];
  if (variable.type == VariableTypeNamed) {
    return variable.name;
  } else {
    return "'" + variable.name + "'";
  }
}
// Encode a single code point as a NUL-terminated UTF-8 string for logging.
// Returns a pointer into the shared member `encoding_buffer`, so the result
// is only valid until the next call (not thread-safe across builders).
// NOTE(review): utf8proc_encode_char is assumed to write at most 4 bytes,
// which fits the 8-byte buffer plus terminator — confirm against utf8proc docs.
const char *log_char(int32_t character) {
  uint32_t count = utf8proc_encode_char(
    character,
    reinterpret_cast<utf8proc_uint8_t *>(encoding_buffer)
  );
  encoding_buffer[count] = 0;
  return encoding_buffer;
}
};

View file

@ -6,6 +6,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include "compiler/log.h"
#include "compiler/parse_table.h"
#include "compiler/build_tables/parse_item.h"
#include "compiler/build_tables/parse_item_set_builder.h"
@ -152,8 +153,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
parse_table.states[state_id].terminal_entries.clear();
// Add all the tokens that have no conflict with other tokens.
LookaheadSet non_conflicting_tokens;
// First, identify the conflict-free tokens.
LookaheadSet conflict_free_tokens;
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
bool conflicts_with_other_tokens = false;
@ -166,27 +167,41 @@ class ParseTableBuilderImpl : public ParseTableBuilder {
break;
}
}
if (!conflicts_with_other_tokens) non_conflicting_tokens.insert(token);
if (!conflicts_with_other_tokens) conflict_free_tokens.insert(token);
}
// Include in the error recover state all of the tokens that are either
// conflict-free themselves, or have no conflicts with any conflict-free
// tokens.
LOG_START("finding non-conflicting tokens for error recovery");
LookaheadSet tokens;
for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) {
Symbol token = Symbol::terminal(i);
bool conflicts_with_other_tokens = false;
if (!non_conflicting_tokens.contains(token)) {
non_conflicting_tokens.for_each([&](Symbol other_token) {
if (conflict_free_tokens.contains(token)) {
LOG("include %s", symbol_name(token).c_str());
parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
} else {
bool conflicts_with_other_tokens = false;
conflict_free_tokens.for_each([&](Symbol other_token) {
if (!coincident_tokens_by_token[token.index].contains(other_token) &&
(lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) {
LOG(
"exclude %s: conflicts with %s",
symbol_name(token).c_str(),
symbol_name(other_token).c_str()
);
conflicts_with_other_tokens = true;
return false;
}
return true;
});
}
if (!conflicts_with_other_tokens) {
parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
if (!conflicts_with_other_tokens) {
LOG("include %s", symbol_name(token).c_str());
parse_table.add_terminal_action(state_id, token, ParseAction::Recover());
}
}
}
LOG_END();
for (size_t i = 0; i < grammar.external_tokens.size(); i++) {
if (grammar.external_tokens[i].corresponding_internal_token == rules::NONE()) {

View file

@ -3,6 +3,7 @@
#include "compiler/build_tables/parse_table_builder.h"
#include "compiler/generate_code/c_code.h"
#include "compiler/syntax_grammar.h"
#include "compiler/log.h"
#include "compiler/lexical_grammar.h"
#include "compiler/parse_grammar.h"
#include "json.h"
@ -16,7 +17,9 @@ using std::vector;
using std::get;
using std::make_tuple;
extern "C" TSCompileResult ts_compile_grammar(const char *input) {
extern "C" TSCompileResult ts_compile_grammar(const char *input, FILE *log_file) {
set_log_file(log_file);
ParseGrammarResult parse_result = parse_grammar(string(input));
if (!parse_result.error_message.empty()) {
return { nullptr, strdup(parse_result.error_message.c_str()),
@ -48,8 +51,8 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input) {
move(lexical_grammar)
);
return {
strdup(code.c_str()), nullptr, TSCompileErrorTypeNone };
set_log_file(nullptr);
return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone };
}
} // namespace tree_sitter

31
src/compiler/log.cc Normal file
View file

@ -0,0 +1,31 @@
#include "compiler/log.h"

namespace tree_sitter {

// Per-thread logging state: current nesting depth and destination stream.
thread_local unsigned _indent_level = 0;
thread_local FILE *_log_file = nullptr;

// Set (or clear, with nullptr) the stream that LOG output is written to.
// Resets the indentation depth so each compilation starts unindented.
void set_log_file(FILE *file) {
  _log_file = file;
  _indent_level = 0;
}

FILE *get_log_file() {
  return _log_file;
}

// Increase the nesting depth (called by LOG_START).
void _indent_logs() {
  _indent_level++;
}

// Decrease the nesting depth (called by LOG_END). Guarded so an unmatched
// LOG_END cannot wrap the unsigned counter and produce an enormous indent.
void _outdent_logs() {
  if (_indent_level > 0) _indent_level--;
}

// Write two spaces per nesting level. Emitting the padding directly, rather
// than fwriting a slice of a fixed-size spaces literal, avoids reading past
// the end of that literal when the nesting is deep.
void _print_indent() {
  for (unsigned i = 0; i < _indent_level; i++) {
    fputs("  ", _log_file);
  }
}

}  // namespace tree_sitter

38
src/compiler/log.h Normal file
View file

@ -0,0 +1,38 @@
#ifndef COMPILER_LOG_H_
#define COMPILER_LOG_H_

// Lightweight, thread-local logging used while compiling a grammar.
// Output goes to the FILE* registered via set_log_file; when no file is
// set, the LOG macros are no-ops.

#include <stdio.h>

namespace tree_sitter {

// Set the stream that LOG output is written to; nullptr disables logging.
void set_log_file(FILE *);
FILE *get_log_file();

// Internal helpers used by the macros below; not intended for direct use.
void _indent_logs();
void _outdent_logs();
void _print_indent();

// Log a message, then indent all subsequent messages one level deeper
// until a matching LOG_END.
#define LOG_START(...) \
  do { \
    LOG(__VA_ARGS__); \
    _indent_logs(); \
  } while (0)

// Close a LOG_START section: outdent and emit a blank separator line.
// Arguments are accepted for symmetry with LOG_START but are ignored.
#define LOG_END(...) \
  do { \
    _outdent_logs(); \
    LOG(""); \
  } while (0)

// printf-style logging; does nothing unless a log file has been set.
// Wrapped in do/while so it behaves as a single statement after `if`.
#define LOG(...) \
  do { \
    FILE *f = get_log_file(); \
    if (f) { \
      _print_indent(); \
      fprintf(f, __VA_ARGS__); \
      fputs("\n", f); \
    } \
  } while (0)

}  // namespace tree_sitter

#endif  // COMPILER_LOG_H_

View file

@ -159,6 +159,11 @@ bool CharacterSet::intersects(const CharacterSet &other) const {
return !copy.remove_set(other).is_empty();
}
// Return the characters present in both sets. This relies on remove_set
// returning the set of characters that were actually removed (i.e. the
// overlap with `other`), the same property the intersects() implementation
// above depends on.
CharacterSet CharacterSet::intersection(const CharacterSet &other) const {
  CharacterSet copy(*this);
  return copy.remove_set(other);
}
vector<CharacterRange> CharacterSet::included_ranges() const {
return consolidate_ranges(included_chars);
}

View file

@ -35,6 +35,7 @@ struct CharacterSet {
void add_set(const CharacterSet &other);
CharacterSet remove_set(const CharacterSet &other);
CharacterSet intersection(const CharacterSet &other) const;
bool intersects(const CharacterSet &other) const;
bool is_empty() const;
@ -49,4 +50,4 @@ struct CharacterSet {
} // namespace rules
} // namespace tree_sitter
#endif // COMPILER_RULES_CHARACTER_SET_H_
#endif // COMPILER_RULES_CHARACTER_SET_H_

View file

@ -223,7 +223,7 @@ const TSLanguage *load_real_language(const string &language_name) {
printf("\n" "Regenerating the %s parser...\n", language_name.c_str());
string grammar_json = read_file(grammar_filename);
TSCompileResult result = ts_compile_grammar(grammar_json.c_str());
TSCompileResult result = ts_compile_grammar(grammar_json.c_str(), nullptr);
if (result.error_type != TSCompileErrorTypeNone) {
fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message);
return nullptr;

View file

@ -27,7 +27,7 @@ for (auto &language_name : test_languages) {
if (file_exists(expected_error_path)) {
it("fails with the correct error message", [&]() {
TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str());
TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr);
string expected_error = read_file(expected_error_path);
AssertThat((void *)compile_result.error_message, !Equals<void *>(nullptr));
AssertThat(compile_result.error_message, Equals(expected_error));
@ -43,7 +43,7 @@ for (auto &language_name : test_languages) {
string external_scanner_path = join_path({directory_path, "scanner.c"});
if (!file_exists(external_scanner_path)) external_scanner_path = "";
TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str());
TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr);
language = load_test_language(
language_name,

View file

@ -26,7 +26,7 @@ describe("Language", []() {
"value": "b"
}
}
})JSON");
})JSON", nullptr);
TSParser *parser = ts_parser_new();
const TSLanguage *language = load_test_language("aliased_rules", compile_result);

View file

@ -71,7 +71,7 @@ string grammar_with_aliases_and_extras = R"JSON({
const TSLanguage *language_with_aliases_and_extras = load_test_language(
"aliases_and_extras",
ts_compile_grammar(grammar_with_aliases_and_extras.c_str())
ts_compile_grammar(grammar_with_aliases_and_extras.c_str(), nullptr)
);
describe("Node", [&]() {