diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h index 30f4e99c..ca2a28f7 100644 --- a/include/tree_sitter/compiler.h +++ b/include/tree_sitter/compiler.h @@ -5,6 +5,8 @@ extern "C" { #endif +#include <stdio.h> + typedef enum { TSCompileErrorTypeNone, TSCompileErrorTypeInvalidGrammar, @@ -25,7 +27,7 @@ typedef struct { TSCompileErrorType error_type; } TSCompileResult; -TSCompileResult ts_compile_grammar(const char *input); +TSCompileResult ts_compile_grammar(const char *input, FILE *log_file); #ifdef __cplusplus } diff --git a/project.gyp b/project.gyp index b88a6c65..0006a18a 100644 --- a/project.gyp +++ b/project.gyp @@ -22,6 +22,7 @@ 'src/compiler/compile.cc', 'src/compiler/generate_code/c_code.cc', 'src/compiler/lex_table.cc', + 'src/compiler/log.cc', 'src/compiler/parse_grammar.cc', 'src/compiler/parse_table.cc', 'src/compiler/precedence_range.cc', diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index e0cab5e2..54957efe 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -9,9 +9,11 @@ #include <vector> #include "compiler/build_tables/lex_item.h" #include "compiler/build_tables/lookahead_set.h" -#include "compiler/parse_table.h" #include "compiler/lexical_grammar.h" +#include "compiler/log.h" +#include "compiler/parse_table.h" #include "compiler/rule.h" +#include "utf8proc.h" namespace tree_sitter { namespace build_tables { @@ -78,6 +80,7 @@ class LexTableBuilderImpl : public LexTableBuilder { bool conflict_detection_mode; LookaheadSet keyword_symbols; Symbol keyword_capture_token; + char encoding_buffer[8]; public: LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, @@ -151,6 +154,7 @@ class LexTableBuilderImpl : public LexTableBuilder { // For each pair of tokens, generate a lex table for just those two tokens and record what // conflicts arise.
+ LOG_START("detecting conflicts between tokens"); conflict_detection_mode = true; for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { for (Symbol::Index j = 0; j < i; j++) { @@ -165,6 +169,7 @@ class LexTableBuilderImpl : public LexTableBuilder { } } } + LOG_END(); // Find a 'keyword capture token' that matches all of the indentified keywords. for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { @@ -304,9 +309,33 @@ class LexTableBuilderImpl : public LexTableBuilder { if (prefer_advancing && !next_item_set_can_yield_this_token) { auto advance_symbol = transition.destination.entries.begin()->lhs; - if (characters.intersects(following_characters_by_token[accept_action.symbol.index]) || - characters.intersects(separator_start_characters)) { - record_conflict(accept_action.symbol, advance_symbol, MatchesLongerStringWithValidNextChar); + auto &following_chars = following_characters_by_token[accept_action.symbol.index]; + CharacterSet conflicting_following_chars = characters.intersection(following_chars); + CharacterSet conflicting_sep_chars = characters.intersection(separator_start_characters); + if (!conflicting_following_chars.is_empty()) { + LOG( + "%s shadows %s followed by '%s'", + token_name(advance_symbol).c_str(), + token_name(accept_action.symbol).c_str(), + log_char(*conflicting_following_chars.included_chars.begin()) + ); + record_conflict( + accept_action.symbol, + advance_symbol, + MatchesLongerStringWithValidNextChar + ); + } else if (!conflicting_sep_chars.is_empty()) { + LOG( + "%s shadows %s followed by '%s'", + token_name(advance_symbol).c_str(), + token_name(accept_action.symbol).c_str(), + log_char(*conflicting_sep_chars.included_chars.begin()) + ); + record_conflict( + accept_action.symbol, + advance_symbol, + MatchesLongerStringWithValidNextChar + ); } else { record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString); } @@ -508,8 +537,22 @@ class LexTableBuilderImpl : public LexTableBuilder { 
main_lex_state_ids.clear(); } - const string &token_name(rules::Symbol &symbol) { - return grammar.variables[symbol.index].name; + string token_name(rules::Symbol &symbol) { + const LexicalVariable &variable = grammar.variables[symbol.index]; + if (variable.type == VariableTypeNamed) { + return variable.name; + } else { + return "'" + variable.name + "'"; + } + } + + const char *log_char(int32_t character) { + uint32_t count = utf8proc_encode_char( + character, + reinterpret_cast<utf8proc_uint8_t *>(encoding_buffer) + ); + encoding_buffer[count] = 0; + return encoding_buffer; + } }; diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 6218fec6..c19d5a2b 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -6,6 +6,7 @@ #include <set> #include <string> #include <utility> +#include "compiler/log.h" #include "compiler/parse_table.h" #include "compiler/build_tables/parse_item.h" #include "compiler/build_tables/parse_item_set_builder.h" @@ -152,8 +153,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { parse_table.states[state_id].terminal_entries.clear(); - // Add all the tokens that have no conflict with other tokens. - LookaheadSet non_conflicting_tokens; + // First, identify the conflict-free tokens. + LookaheadSet conflict_free_tokens; for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { Symbol token = Symbol::terminal(i); bool conflicts_with_other_tokens = false; @@ -166,27 +167,41 @@ class ParseTableBuilderImpl : public ParseTableBuilder { break; } } - if (!conflicts_with_other_tokens) non_conflicting_tokens.insert(token); + if (!conflicts_with_other_tokens) conflict_free_tokens.insert(token); } + // Include in the error recovery state all of the tokens that are either + // conflict-free themselves, or have no conflicts with any conflict-free + // tokens.
+ LOG_START("finding non-conflicting tokens for error recovery"); LookaheadSet tokens; for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { Symbol token = Symbol::terminal(i); - bool conflicts_with_other_tokens = false; - if (!non_conflicting_tokens.contains(token)) { - non_conflicting_tokens.for_each([&](Symbol other_token) { + if (conflict_free_tokens.contains(token)) { + LOG("include %s", symbol_name(token).c_str()); + parse_table.add_terminal_action(state_id, token, ParseAction::Recover()); + } else { + bool conflicts_with_other_tokens = false; + conflict_free_tokens.for_each([&](Symbol other_token) { if (!coincident_tokens_by_token[token.index].contains(other_token) && (lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) { + LOG( + "exclude %s: conflicts with %s", + symbol_name(token).c_str(), + symbol_name(other_token).c_str() + ); conflicts_with_other_tokens = true; return false; } return true; }); - } - if (!conflicts_with_other_tokens) { - parse_table.add_terminal_action(state_id, token, ParseAction::Recover()); + if (!conflicts_with_other_tokens) { + LOG("include %s", symbol_name(token).c_str()); + parse_table.add_terminal_action(state_id, token, ParseAction::Recover()); + } } } + LOG_END(); for (size_t i = 0; i < grammar.external_tokens.size(); i++) { if (grammar.external_tokens[i].corresponding_internal_token == rules::NONE()) { diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc index 56af3aed..49a3468b 100644 --- a/src/compiler/compile.cc +++ b/src/compiler/compile.cc @@ -3,6 +3,7 @@ #include "compiler/build_tables/parse_table_builder.h" #include "compiler/generate_code/c_code.h" #include "compiler/syntax_grammar.h" +#include "compiler/log.h" #include "compiler/lexical_grammar.h" #include "compiler/parse_grammar.h" #include "json.h" @@ -16,7 +17,9 @@ using std::vector; using std::get; using std::make_tuple; -extern "C" TSCompileResult ts_compile_grammar(const char *input) { +extern "C" TSCompileResult 
ts_compile_grammar(const char *input, FILE *log_file) { + set_log_file(log_file); + ParseGrammarResult parse_result = parse_grammar(string(input)); if (!parse_result.error_message.empty()) { return { nullptr, strdup(parse_result.error_message.c_str()), @@ -48,8 +51,8 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input) { move(lexical_grammar) ); - return { - strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; + set_log_file(nullptr); + return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; } } // namespace tree_sitter diff --git a/src/compiler/log.cc b/src/compiler/log.cc new file mode 100644 index 00000000..5502c7d6 --- /dev/null +++ b/src/compiler/log.cc @@ -0,0 +1,31 @@ +#include "compiler/log.h" + +static const char *SPACES = "                                                                "; + +namespace tree_sitter { + +thread_local unsigned _indent_level = 0; +thread_local FILE *_log_file = nullptr; + +void set_log_file(FILE *file) { + _log_file = file; + _indent_level = 0; +} + +FILE *get_log_file() { + return _log_file; +} + +void _indent_logs() { + _indent_level++; +} + +void _outdent_logs() { + _indent_level--; +} + +void _print_indent() { + fwrite(SPACES, 1, _indent_level * 2, _log_file); +} + +} diff --git a/src/compiler/log.h b/src/compiler/log.h new file mode 100644 index 00000000..c848f970 --- /dev/null +++ b/src/compiler/log.h @@ -0,0 +1,38 @@ +#ifndef COMPILER_LOG_H_ +#define COMPILER_LOG_H_ + +#include <stdio.h> + +namespace tree_sitter { + +void set_log_file(FILE *); +FILE *get_log_file(); +void _indent_logs(); +void _outdent_logs(); +void _print_indent(); + +#define LOG_START(...) \ + do { \ + LOG(__VA_ARGS__); \ + _indent_logs(); \ + } while (0) + +#define LOG_END(...) \ + do { \ + _outdent_logs(); \ + LOG(""); \ + } while (0) + +#define LOG(...)
\ + do { \ + FILE *f = get_log_file(); \ + if (f) { \ + _print_indent(); \ + fprintf(f, __VA_ARGS__); \ + fputs("\n", f); \ + } \ + } while (0) + +} // namespace tree_sitter + +#endif // COMPILER_LOG_H_ diff --git a/src/compiler/rules/character_set.cc b/src/compiler/rules/character_set.cc index c199368d..b0064cbb 100644 --- a/src/compiler/rules/character_set.cc +++ b/src/compiler/rules/character_set.cc @@ -159,6 +159,11 @@ bool CharacterSet::intersects(const CharacterSet &other) const { return !copy.remove_set(other).is_empty(); } +CharacterSet CharacterSet::intersection(const CharacterSet &other) const { + CharacterSet copy(*this); + return copy.remove_set(other); +} + vector<CharacterRange> CharacterSet::included_ranges() const { return consolidate_ranges(included_chars); } diff --git a/src/compiler/rules/character_set.h b/src/compiler/rules/character_set.h index 0c991c43..c49b0d1d 100644 --- a/src/compiler/rules/character_set.h +++ b/src/compiler/rules/character_set.h @@ -35,6 +35,7 @@ struct CharacterSet { void add_set(const CharacterSet &other); CharacterSet remove_set(const CharacterSet &other); + CharacterSet intersection(const CharacterSet &other) const; bool intersects(const CharacterSet &other) const; bool is_empty() const; @@ -49,4 +50,4 @@ struct CharacterSet { } // namespace rules } // namespace tree_sitter -#endif // COMPILER_RULES_CHARACTER_SET_H_ \ No newline at end of file +#endif // COMPILER_RULES_CHARACTER_SET_H_ diff --git a/test/helpers/load_language.cc b/test/helpers/load_language.cc index f01a2184..efec4371 100644 --- a/test/helpers/load_language.cc +++ b/test/helpers/load_language.cc @@ -223,7 +223,7 @@ const TSLanguage *load_real_language(const string &language_name) { printf("\n" "Regenerating the %s parser...\n", language_name.c_str()); string grammar_json = read_file(grammar_filename); - TSCompileResult result = ts_compile_grammar(grammar_json.c_str()); + TSCompileResult result = ts_compile_grammar(grammar_json.c_str(), nullptr); if (result.error_type
!= TSCompileErrorTypeNone) { fprintf(stderr, "Failed to compile %s grammar: %s\n", language_name.c_str(), result.error_message); return nullptr; diff --git a/test/integration/test_grammars.cc b/test/integration/test_grammars.cc index 62174855..3741a3c9 100644 --- a/test/integration/test_grammars.cc +++ b/test/integration/test_grammars.cc @@ -27,7 +27,7 @@ for (auto &language_name : test_languages) { if (file_exists(expected_error_path)) { it("fails with the correct error message", [&]() { - TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str()); + TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr); string expected_error = read_file(expected_error_path); AssertThat((void *)compile_result.error_message, !Equals(nullptr)); AssertThat(compile_result.error_message, Equals(expected_error)); @@ -43,7 +43,7 @@ for (auto &language_name : test_languages) { string external_scanner_path = join_path({directory_path, "scanner.c"}); if (!file_exists(external_scanner_path)) external_scanner_path = ""; - TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str()); + TSCompileResult compile_result = ts_compile_grammar(grammar_json.c_str(), nullptr); language = load_test_language( language_name, diff --git a/test/runtime/language_test.cc b/test/runtime/language_test.cc index 747327c0..1c460347 100644 --- a/test/runtime/language_test.cc +++ b/test/runtime/language_test.cc @@ -26,7 +26,7 @@ describe("Language", []() { "value": "b" } } - })JSON"); + })JSON", nullptr); TSParser *parser = ts_parser_new(); const TSLanguage *language = load_test_language("aliased_rules", compile_result); diff --git a/test/runtime/node_test.cc b/test/runtime/node_test.cc index 313bec40..8683a9ef 100644 --- a/test/runtime/node_test.cc +++ b/test/runtime/node_test.cc @@ -71,7 +71,7 @@ string grammar_with_aliases_and_extras = R"JSON({ const TSLanguage *language_with_aliases_and_extras = load_test_language( "aliases_and_extras", - 
ts_compile_grammar(grammar_with_aliases_and_extras.c_str()) + ts_compile_grammar(grammar_with_aliases_and_extras.c_str(), nullptr) ); describe("Node", [&]() {