From f8649824fa2890849b71c3b78bd813dc1b893a91 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 31 Aug 2017 15:30:28 -0700 Subject: [PATCH 01/12] Remove unused function --- src/compiler/compile.cc | 21 --------------------- src/compiler/compile.h | 16 ---------------- 2 files changed, 37 deletions(-) delete mode 100644 src/compiler/compile.h diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc index 3edcf141..a887c4fc 100644 --- a/src/compiler/compile.cc +++ b/src/compiler/compile.cc @@ -45,25 +45,4 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input) { return { strdup(code.c_str()), nullptr, TSCompileErrorTypeNone }; } -pair compile(const InputGrammar &grammar, - std::string name) { - auto prepare_grammar_result = prepare_grammar::prepare_grammar(grammar); - const SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result); - const LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result); - CompileError error = get<2>(prepare_grammar_result); - if (error.type) return { "", error }; - - auto table_build_result = - build_tables::build_tables(syntax_grammar, lexical_grammar); - const ParseTable &parse_table = get<0>(table_build_result); - const LexTable &lex_table = get<1>(table_build_result); - error = get<2>(table_build_result); - if (error.type) return { "", error }; - - string code = generate_code::c_code(name, parse_table, lex_table, - syntax_grammar, lexical_grammar); - - return { code, CompileError::none() }; -} - } // namespace tree_sitter diff --git a/src/compiler/compile.h b/src/compiler/compile.h deleted file mode 100644 index 5f182bc0..00000000 --- a/src/compiler/compile.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef COMPILER_COMPILE_H_ -#define COMPILER_COMPILE_H_ - -#include -#include -#include "compiler/compile_error.h" - -namespace tree_sitter { - -struct InputGrammar; - -std::pair compile(const InputGrammar &, std::string); - -} // namespace tree_sitter - -#endif // COMPILER_COMPILE_H_ From 9d668c5004cc6569b26ace4e4ac1b7df307f9fc8 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 31 Aug 2017 15:40:43 -0700 Subject: [PATCH 02/12] Move incompatible token map into LexTableBuilder --- project.gyp | 1 - src/compiler/build_tables/build_tables.cc | 35 ----- src/compiler/build_tables/build_tables.h | 24 ---- .../build_tables/lex_table_builder.cc | 51 ++++++-- src/compiler/build_tables/lex_table_builder.h | 12 +- .../build_tables/parse_table_builder.cc | 68 ++++------ .../build_tables/parse_table_builder.h | 12 +- src/compiler/compile.cc | 6 +- .../build_tables/lex_table_builder_test.cc | 122 ------------------ tests.gyp | 1 - 10 files changed, 78 insertions(+), 254 deletions(-) delete mode 100644 src/compiler/build_tables/build_tables.cc delete mode 100644 src/compiler/build_tables/build_tables.h delete mode 100644 test/compiler/build_tables/lex_table_builder_test.cc diff --git a/project.gyp b/project.gyp index bbb88438..56f742b3 100644 --- a/project.gyp +++ b/project.gyp @@ -11,7 +11,6 @@ 'externals/json-parser', ], 'sources': [ - 'src/compiler/build_tables/build_tables.cc', 'src/compiler/build_tables/lex_item.cc', 'src/compiler/build_tables/lex_item_transitions.cc', 'src/compiler/build_tables/lex_conflict_manager.cc', diff --git a/src/compiler/build_tables/build_tables.cc b/src/compiler/build_tables/build_tables.cc deleted file mode 100644 index a15aede3..00000000 --- a/src/compiler/build_tables/build_tables.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "compiler/build_tables/build_tables.h" -#include -#include "compiler/build_tables/lex_table_builder.h" -#include "compiler/build_tables/parse_table_builder.h" -#include "compiler/syntax_grammar.h" -#include "compiler/lexical_grammar.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { -namespace build_tables { - -using std::tuple; -using std::make_tuple; - -tuple build_tables( - const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar -) { - auto lex_table_builder = LexTableBuilder::create(lexical_grammar); - auto parse_table_builder = ParseTableBuilder::create( - syntax_grammar, - lexical_grammar, - lex_table_builder.get() - ); - - auto parse_table_result = parse_table_builder->build(); - ParseTable parse_table = parse_table_result.first; - const CompileError error = parse_table_result.second; - - LexTable lex_table = lex_table_builder->build(&parse_table); - return make_tuple(parse_table, lex_table, error); -} - -} // namespace build_tables -} // namespace tree_sitter diff --git a/src/compiler/build_tables/build_tables.h b/src/compiler/build_tables/build_tables.h deleted file mode 100644 index ed1f4770..00000000 --- a/src/compiler/build_tables/build_tables.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef COMPILER_BUILD_TABLES_BUILD_TABLES_H_ -#define COMPILER_BUILD_TABLES_BUILD_TABLES_H_ - -#include -#include "compiler/parse_table.h" -#include "compiler/lex_table.h" -#include "compiler/compile_error.h" - -namespace tree_sitter { - -struct SyntaxGrammar; -struct LexicalGrammar; - -namespace build_tables { - -std::tuple build_tables( - const SyntaxGrammar &, - const LexicalGrammar & -); - -} // namespace build_tables -} // namespace tree_sitter - -#endif // COMPILER_BUILD_TABLES_BUILD_TABLES_H_ diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index b50182f3..8e8cff8a 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -9,6 +9,7 @@ #include #include "compiler/build_tables/lex_conflict_manager.h" #include "compiler/build_tables/lex_item.h" +#include "compiler/build_tables/lookahead_set.h" #include "compiler/parse_table.h" #include "compiler/lexical_grammar.h" #include "compiler/rule.h" @@ -76,13 +77,18 @@ class LexTableBuilderImpl : public LexTableBuilder { unordered_map lex_state_ids; map following_characters_by_token_index; + vector> incompatible_tokens_by_token_index; CharacterSet separator_start_characters; CharacterSet current_conflict_detection_following_characters; Symbol::Index current_conflict_detection_token_index; bool current_conflict_value; public: - LexTableBuilderImpl(const LexicalGrammar &grammar) : grammar(grammar) { + LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, + const LexicalGrammar &lexical_grammar, + const vector> &following_tokens_by_token_index) : + grammar(lexical_grammar), + incompatible_tokens_by_token_index(lexical_grammar.variables.size()) { StartingCharacterAggregator separator_character_aggregator; for (const auto &rule : grammar.separators) { separator_rules.push_back(Repeat{rule}); @@ -91,6 +97,26 @@ class LexTableBuilderImpl : public LexTableBuilder { separator_rules.push_back(Blank{}); separator_start_characters = separator_character_aggregator.result; clear(); + + for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) { + Symbol token = Symbol::terminal(i); + auto &incompatible_indices = incompatible_tokens_by_token_index[i]; + + for (unsigned j = 0; j < n; j++) { + if (i == j) continue; + if (detect_conflict(i, j, following_tokens_by_token_index)) { + incompatible_indices.insert(Symbol::terminal(j)); + } + } + + for (const ExternalToken &external_token : syntax_grammar.external_tokens) { + if (external_token.corresponding_internal_token == token) { + for (unsigned j = 0; j < syntax_grammar.external_tokens.size(); j++) { + incompatible_indices.insert(Symbol::external(j)); + } + } + } + } } LexTable build(ParseTable *parse_table) { @@ -104,8 +130,12 @@ class LexTableBuilderImpl : public LexTableBuilder { return lex_table; } + const set &get_incompatible_tokens(Symbol::Index index) const { + return incompatible_tokens_by_token_index[index]; + } + bool detect_conflict(Symbol::Index left, Symbol::Index right, - const vector> &following_terminals_by_terminal_index) { + const vector> &following_tokens_by_token_index) { StartingCharacterAggregator left_starting_characters; StartingCharacterAggregator right_starting_characters; left_starting_characters.apply(grammar.variables[left].rule); @@ -119,7 +149,7 @@ class LexTableBuilderImpl : public LexTableBuilder { auto following_characters_entry = following_characters_by_token_index.find(right); if (following_characters_entry == following_characters_by_token_index.end()) { StartingCharacterAggregator aggregator; - for (auto following_token_index : following_terminals_by_terminal_index[right]) { + for (auto following_token_index : following_tokens_by_token_index[right]) { aggregator.apply(grammar.variables[following_token_index].rule); } following_characters_entry = @@ -369,17 +399,22 @@ class LexTableBuilderImpl : public LexTableBuilder { } }; -unique_ptr LexTableBuilder::create(const LexicalGrammar &grammar) { - return unique_ptr(new LexTableBuilderImpl(grammar)); +unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_grammar, + const LexicalGrammar &lexical_grammar, + const vector> &following_tokens) { + return unique_ptr(new LexTableBuilderImpl( + syntax_grammar, + lexical_grammar, + following_tokens + )); } LexTable LexTableBuilder::build(ParseTable *parse_table) { return static_cast(this)->build(parse_table); } -bool LexTableBuilder::detect_conflict(Symbol::Index left, Symbol::Index right, - const vector> &following_terminals) { - return static_cast(this)->detect_conflict(left, right, following_terminals); +const set &LexTableBuilder::get_incompatible_tokens(Symbol::Index token) const { + return static_cast(this)->get_incompatible_tokens(token); } } // namespace build_tables diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 3b896bb7..2bb7a56a 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -9,19 +9,19 @@ namespace tree_sitter { struct ParseTable; +struct SyntaxGrammar; struct LexicalGrammar; namespace build_tables { class LexTableBuilder { public: - static std::unique_ptr create(const LexicalGrammar &); + static std::unique_ptr create(const SyntaxGrammar &, + const LexicalGrammar &, + const std::vector> &); LexTable build(ParseTable *); - bool detect_conflict( - rules::Symbol::Index, - rules::Symbol::Index, - const std::vector> &following_terminals_by_terminal_index - ); + const std::set &get_incompatible_tokens(rules::Symbol::Index) const; + protected: LexTableBuilder() = default; }; diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 7e67b650..3b59c8ae 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -19,9 +19,10 @@ namespace build_tables { using std::deque; using std::find; -using std::pair; using std::vector; using std::set; +using std::tuple; +using std::make_tuple; using std::map; using std::move; using std::string; @@ -49,26 +50,20 @@ class ParseTableBuilderImpl : public ParseTableBuilder { deque parse_state_queue; ParseTable parse_table; ParseItemSetBuilder item_set_builder; - LexTableBuilder *lex_table_builder; + unique_ptr lex_table_builder; set fragile_reductions; - vector> incompatible_tokens_by_token_index; vector> following_tokens_by_token_index; bool processing_recovery_states; public: - ParseTableBuilderImpl( - const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar, - LexTableBuilder *lex_table_builder - ) : grammar(syntax_grammar), + ParseTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar) + : grammar(syntax_grammar), lexical_grammar(lexical_grammar), item_set_builder(syntax_grammar, lexical_grammar), - lex_table_builder(lex_table_builder), - incompatible_tokens_by_token_index(lexical_grammar.variables.size()), following_tokens_by_token_index(lexical_grammar.variables.size()), processing_recovery_states(false) {} - pair build() { + tuple build() { // Ensure that the empty rename sequence has index 0. parse_table.alias_sequences.push_back({}); @@ -90,9 +85,13 @@ class ParseTableBuilderImpl : public ParseTableBuilder { }}); CompileError error = process_part_state_queue(); - if (error) return {parse_table, error}; + if (error) return make_tuple(parse_table, LexTable(), error); - compute_unmergable_token_pairs(); + lex_table_builder = LexTableBuilder::create( + grammar, + lexical_grammar, + following_tokens_by_token_index + ); processing_recovery_states = true; build_error_parse_state(error_state_id); @@ -100,7 +99,9 @@ class ParseTableBuilderImpl : public ParseTableBuilder { mark_fragile_actions(); remove_duplicate_parse_states(); - return {parse_table, CompileError::none()}; + + auto lex_table = lex_table_builder->build(&parse_table); + return make_tuple(parse_table, lex_table, CompileError::none()); } private: @@ -131,9 +132,9 @@ class ParseTableBuilderImpl : public ParseTableBuilder { Symbol token = Symbol::terminal(i); bool has_non_reciprocal_conflict = false; - for (Symbol incompatible_token : incompatible_tokens_by_token_index[i]) { + for (Symbol incompatible_token : lex_table_builder->get_incompatible_tokens(i)) { if (incompatible_token.is_terminal() && - !incompatible_tokens_by_token_index[incompatible_token.index].count(token)) { + !lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token)) { has_non_reciprocal_conflict = true; break; } @@ -355,28 +356,6 @@ class ParseTableBuilderImpl : public ParseTableBuilder { return false; } - void compute_unmergable_token_pairs() { - for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - Symbol token = Symbol::terminal(i); - auto &incompatible_indices = incompatible_tokens_by_token_index[i]; - - for (unsigned j = 0; j < n; j++) { - if (i == j) continue; - if (lex_table_builder->detect_conflict(i, j, following_tokens_by_token_index)) { - incompatible_indices.insert(Symbol::terminal(j)); - } - } - - for (const ExternalToken &external_token : grammar.external_tokens) { - if (external_token.corresponding_internal_token == token) { - for (unsigned j = 0; j < grammar.external_tokens.size(); j++) { - incompatible_indices.insert(Symbol::external(j)); - } - } - } - } - } - void remove_duplicate_parse_states() { unordered_map> state_indices_by_signature; @@ -474,7 +453,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { if (left_entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_actions(right_state, left_entry.second)) return false; if (!lookahead.is_built_in()) { - for (const Symbol &incompatible_token : incompatible_tokens_by_token_index[lookahead.index]) { + for (const Symbol &incompatible_token : lex_table_builder->get_incompatible_tokens(lookahead.index)) { if (right_state.terminal_entries.count(incompatible_token)) return false; } } @@ -492,7 +471,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { if (right_entry.second.actions.back().type != ParseActionTypeReduce) return false; if (!has_actions(left_state, right_entry.second)) return false; if (!lookahead.is_built_in()) { - for (const Symbol &incompatible_token : incompatible_tokens_by_token_index[lookahead.index]) { + for (const Symbol &incompatible_token : lex_table_builder->get_incompatible_tokens(lookahead.index)) { if (left_state.terminal_entries.count(incompatible_token)) return false; } } @@ -805,15 +784,12 @@ class ParseTableBuilderImpl : public ParseTableBuilder { unique_ptr ParseTableBuilder::create( const SyntaxGrammar &syntax_grammar, - const LexicalGrammar &lexical_grammar, - LexTableBuilder *lex_table_builder + const LexicalGrammar &lexical_grammar ) { - return unique_ptr( - new ParseTableBuilderImpl(syntax_grammar, lexical_grammar, lex_table_builder) - ); + return unique_ptr(new ParseTableBuilderImpl(syntax_grammar, lexical_grammar)); } -pair ParseTableBuilder::build() { +tuple ParseTableBuilder::build() { return static_cast(this)->build(); } diff --git a/src/compiler/build_tables/parse_table_builder.h b/src/compiler/build_tables/parse_table_builder.h index bab96243..1cbecb49 100644 --- a/src/compiler/build_tables/parse_table_builder.h +++ b/src/compiler/build_tables/parse_table_builder.h @@ -8,21 +8,17 @@ namespace tree_sitter { struct ParseTable; +struct LexTable; struct SyntaxGrammar; struct LexicalGrammar; namespace build_tables { -class LexTableBuilder; - class ParseTableBuilder { public: - static std::unique_ptr create( - const SyntaxGrammar &, - const LexicalGrammar &, - LexTableBuilder * - ); - std::pair build(); + static std::unique_ptr create(const SyntaxGrammar &, const LexicalGrammar &); + std::tuple build(); + protected: ParseTableBuilder() = default; }; diff --git a/src/compiler/compile.cc b/src/compiler/compile.cc index a887c4fc..ad3a64cb 100644 --- a/src/compiler/compile.cc +++ b/src/compiler/compile.cc @@ -1,6 +1,6 @@ #include "tree_sitter/compiler.h" #include "compiler/prepare_grammar/prepare_grammar.h" -#include "compiler/build_tables/build_tables.h" +#include "compiler/build_tables/parse_table_builder.h" #include "compiler/generate_code/c_code.h" #include "compiler/syntax_grammar.h" #include "compiler/lexical_grammar.h" @@ -30,8 +30,8 @@ extern "C" TSCompileResult ts_compile_grammar(const char *input) { return { nullptr, strdup(error.message.c_str()), error.type }; } - auto table_build_result = - build_tables::build_tables(syntax_grammar, lexical_grammar); + auto builder = build_tables::ParseTableBuilder::create(syntax_grammar, lexical_grammar); + auto table_build_result = builder->build(); const ParseTable &parse_table = get<0>(table_build_result); const LexTable &lex_table = get<1>(table_build_result); error = get<2>(table_build_result); diff --git a/test/compiler/build_tables/lex_table_builder_test.cc b/test/compiler/build_tables/lex_table_builder_test.cc deleted file mode 100644 index e9f70aee..00000000 --- a/test/compiler/build_tables/lex_table_builder_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -#include "test_helper.h" -#include "compiler/lexical_grammar.h" -#include "compiler/build_tables/lex_table_builder.h" - -using namespace build_tables; -using namespace rules; - -START_TEST - -describe("LexTableBuilder::detect_conflict", []() { - vector separators({ - CharacterSet({ ' ', '\t' }), - }); - - it("returns false for tokens that don't match the same string", [&]() { - auto builder = LexTableBuilder::create(LexicalGrammar{ - { - LexicalVariable{ - "token_0", - VariableTypeNamed, - Rule::seq({ - CharacterSet({ 'a' }), - CharacterSet({ 'b' }), - CharacterSet({ 'c' }), - }), - false - }, - LexicalVariable{ - "token_1", - VariableTypeNamed, - Rule::seq({ - CharacterSet({ 'b' }), - CharacterSet({ 'c' }), - CharacterSet({ 'd' }), - }), - false - }, - }, - separators - }); - - AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsFalse()); - AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); - }); - - it("returns true when the left token can match a string that the right token matches, " - "plus a separator character", [&]() { - LexicalGrammar grammar{ - { - LexicalVariable{ - "token_0", - VariableTypeNamed, - Rule::repeat(CharacterSet().include_all().exclude('\n')), // regex: /.+/ - false - }, - LexicalVariable{ - "token_1", - VariableTypeNamed, - Rule::seq({ CharacterSet({ 'a' }), CharacterSet({ 'b' }), CharacterSet({ 'c' }) }), // string: 'abc' - true - }, - }, - separators - }; - - auto builder = LexTableBuilder::create(grammar); - AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); - AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); - - grammar.variables[1].is_string = false; - AssertThat(builder->detect_conflict(0, 1, {{}, {}}), IsTrue()); - AssertThat(builder->detect_conflict(1, 0, {{}, {}}), IsFalse()); - }); - - it("returns true when the left token matches a string that the right token matches, " - "plus the first character of some token that can follow the right token", [&]() { - LexicalGrammar grammar{ - { - LexicalVariable{ - "token_0", - VariableTypeNamed, - Rule::seq({ - CharacterSet({ '>' }), - CharacterSet({ '=' }), - }), - true - }, - LexicalVariable{ - "token_1", - VariableTypeNamed, - Rule::seq({ - CharacterSet({ '>' }), - }), - true - }, - LexicalVariable{ - "token_2", - VariableTypeNamed, - Rule::seq({ - CharacterSet({ '=' }), - }), - true - }, - }, - separators - }; - - // If no tokens can follow token_1, then there's no conflict - auto builder = LexTableBuilder::create(grammar); - vector> following_tokens_by_token_index(3); - AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsFalse()); - AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); - - // If token_2 can follow token_1, then token_0 conflicts with token_1 - builder = LexTableBuilder::create(grammar); - following_tokens_by_token_index[1].insert(2); - AssertThat(builder->detect_conflict(0, 1, following_tokens_by_token_index), IsTrue()); - AssertThat(builder->detect_conflict(1, 0, following_tokens_by_token_index), IsFalse()); - }); -}); - -END_TEST diff --git a/tests.gyp b/tests.gyp index af11b878..02012a0a 100644 --- a/tests.gyp +++ b/tests.gyp @@ -39,7 +39,6 @@ 'sources': [ 'test/compiler/build_tables/lex_conflict_manager_test.cc', 'test/compiler/build_tables/lex_item_test.cc', - 'test/compiler/build_tables/lex_table_builder_test.cc', 'test/compiler/build_tables/parse_item_set_builder_test.cc', 'test/compiler/build_tables/rule_can_be_blank_test.cc', 'test/compiler/prepare_grammar/expand_repeats_test.cc', From 4c9c05806a224fcdcb6af9d78e5ebfe66dd44026 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 1 Sep 2017 14:22:50 -0700 Subject: [PATCH 03/12] Merge compatible starting token states before constructing lex table --- .../build_tables/lex_table_builder.cc | 294 ++++++++++-------- src/compiler/build_tables/lex_table_builder.h | 5 +- .../build_tables/parse_table_builder.cc | 52 +++- src/compiler/lex_table.h | 4 +- src/runtime/parser.c | 5 +- 5 files changed, 219 insertions(+), 141 deletions(-) diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 8e8cff8a..662f156c 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -34,12 +34,13 @@ using rules::Symbol; using rules::Metadata; using rules::Seq; -class StartingCharacterAggregator { +template +class StartOrEndCharacterAggregator { public: void apply(const Rule &rule) { rule.match( [this](const Seq &sequence) { - apply(*sequence.left); + apply(is_start ? *sequence.left : *sequence.right); }, [this](const rules::Choice &rule) { @@ -48,20 +49,9 @@ class StartingCharacterAggregator { } }, - [this](const rules::Repeat &rule) { - apply(*rule.rule); - }, - - [this](const rules::Metadata &rule) { - apply(*rule.rule); - }, - - [this](const rules::CharacterSet &rule) { - result.add_set(rule); - }, - - [this](const rules::Blank) {}, - + [this](const rules::Repeat &rule) { apply(*rule.rule); }, + [this](const rules::Metadata &rule) { apply(*rule.rule); }, + [this](const rules::CharacterSet &rule) { result.add_set(rule); }, [](auto) {} ); } @@ -69,26 +59,37 @@ class StartingCharacterAggregator { CharacterSet result; }; +using StartingCharacterAggregator = StartOrEndCharacterAggregator; +using EndingCharacterAggregator = StartOrEndCharacterAggregator; + class LexTableBuilderImpl : public LexTableBuilder { LexTable lex_table; const LexicalGrammar grammar; vector separator_rules; LexConflictManager conflict_manager; unordered_map lex_state_ids; - - map following_characters_by_token_index; - vector> incompatible_tokens_by_token_index; CharacterSet separator_start_characters; - CharacterSet current_conflict_detection_following_characters; - Symbol::Index current_conflict_detection_token_index; - bool current_conflict_value; + vector starting_characters_by_token; + vector following_characters_by_token; + vector> shadowed_tokens_by_token; + const vector &coincident_tokens_by_token; + vector conflict_status_by_token; + bool conflict_detection_mode; public: LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar, - const vector> &following_tokens_by_token_index) : - grammar(lexical_grammar), - incompatible_tokens_by_token_index(lexical_grammar.variables.size()) { + const vector &following_tokens_by_token, + const vector &coincident_tokens) + : grammar(lexical_grammar), + starting_characters_by_token(lexical_grammar.variables.size()), + following_characters_by_token(lexical_grammar.variables.size()), + shadowed_tokens_by_token(lexical_grammar.variables.size()), + coincident_tokens_by_token(coincident_tokens), + conflict_detection_mode(false) { + + // Compute the possible separator rules and the set of separator characters that can occur + // immediately after any token. StartingCharacterAggregator separator_character_aggregator; for (const auto &rule : grammar.separators) { separator_rules.push_back(Repeat{rule}); @@ -96,34 +97,84 @@ class LexTableBuilderImpl : public LexTableBuilder { } separator_rules.push_back(Blank{}); separator_start_characters = separator_character_aggregator.result; - clear(); + // Compute the set of characters that each token can start with and the set of non-separator + // characters that can follow each token. for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol token = Symbol::terminal(i); - auto &incompatible_indices = incompatible_tokens_by_token_index[i]; + StartingCharacterAggregator starting_character_aggregator; + starting_character_aggregator.apply(grammar.variables[i].rule); + starting_characters_by_token[i] = starting_character_aggregator.result; - for (unsigned j = 0; j < n; j++) { - if (i == j) continue; - if (detect_conflict(i, j, following_tokens_by_token_index)) { - incompatible_indices.insert(Symbol::terminal(j)); - } + StartingCharacterAggregator following_character_aggregator; + following_tokens_by_token[i].for_each([&](Symbol following_token) { + following_character_aggregator.apply(grammar.variables[following_token.index].rule); + }); + + // TODO - Refactor this. In general, a keyword token cannot be followed immediately by + // another alphanumeric character. But this requirement is currently not expressed anywhere in + // the grammar. So without this hack, we would be overly conservative about merging parse + // states because we would often consider `identifier` tokens to *conflict* with keyword + // tokens. + if (is_keyword(grammar.variables[i])) { + following_character_aggregator.result + .exclude('a', 'z') + .exclude('A', 'Z') + .exclude('0', '9') + .exclude('_') + .exclude('$'); } - for (const ExternalToken &external_token : syntax_grammar.external_tokens) { - if (external_token.corresponding_internal_token == token) { - for (unsigned j = 0; j < syntax_grammar.external_tokens.size(); j++) { - incompatible_indices.insert(Symbol::external(j)); - } + following_characters_by_token[i] = following_character_aggregator.result; + } + + // For each pair of tokens, generate a lex table for just those two tokens and record what + // conflicts arise. + conflict_detection_mode = true; + for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { + for (Symbol::Index j = 0; j < i; j++) { + if (starting_characters_by_token[i].intersects(starting_characters_by_token[j]) || + starting_characters_by_token[i].intersects(separator_start_characters) || + starting_characters_by_token[j].intersects(separator_start_characters)) { + clear(); + add_lex_state(item_set_for_terminals(LookaheadSet({ + Symbol::terminal(i), + Symbol::terminal(j) + }))); + if (conflict_status_by_token[i]) shadowed_tokens_by_token[j].insert(Symbol::terminal(i)); + if (conflict_status_by_token[j]) shadowed_tokens_by_token[i].insert(Symbol::terminal(j)); } } } } LexTable build(ParseTable *parse_table) { + clear(); + conflict_detection_mode = false; + vector>> starting_token_sets; + for (ParseState &parse_state : parse_table->states) { - parse_state.lex_state_id = add_lex_state( - item_set_for_terminals(parse_state.terminal_entries) - ); + LookaheadSet token_set; + for (auto &entry : parse_state.terminal_entries) { + token_set.insert(entry.first); + } + + bool did_merge = false; + for (auto &pair : starting_token_sets) { + if (merge_token_set(&pair.first, token_set)) { + did_merge = true; + pair.second.push_back(&parse_state); + break; + } + } + + if (!did_merge) starting_token_sets.push_back({token_set, {&parse_state}}); + } + + for (auto &pair : starting_token_sets) { + LexStateId state_id = add_lex_state(item_set_for_terminals(pair.first)); + for (ParseState *parse_state : pair.second) { + parse_state->lex_state_id = state_id; + } } mark_fragile_tokens(parse_table); remove_duplicate_lex_states(parse_table); @@ -131,64 +182,17 @@ class LexTableBuilderImpl : public LexTableBuilder { } const set &get_incompatible_tokens(Symbol::Index index) const { - return incompatible_tokens_by_token_index[index]; - } - - bool detect_conflict(Symbol::Index left, Symbol::Index right, - const vector> &following_tokens_by_token_index) { - StartingCharacterAggregator left_starting_characters; - StartingCharacterAggregator right_starting_characters; - left_starting_characters.apply(grammar.variables[left].rule); - right_starting_characters.apply(grammar.variables[right].rule); - if (!left_starting_characters.result.intersects(right_starting_characters.result) && - !left_starting_characters.result.intersects(separator_start_characters) && - !right_starting_characters.result.intersects(separator_start_characters)) { - return false; - } - - auto following_characters_entry = following_characters_by_token_index.find(right); - if (following_characters_entry == following_characters_by_token_index.end()) { - StartingCharacterAggregator aggregator; - for (auto following_token_index : following_tokens_by_token_index[right]) { - aggregator.apply(grammar.variables[following_token_index].rule); - } - following_characters_entry = - following_characters_by_token_index.insert({right, aggregator.result}).first; - - // TODO - Refactor this. In general, a keyword token cannot be followed immediately by - // another alphanumeric character. But this requirement is currently not expressed anywhere in - // the grammar. So without this hack, we would be overly conservative about merging parse - // states because we would often consider `identifier` tokens to *conflict* with keyword - // tokens. - if (is_keyword(grammar.variables[right])) { - following_characters_entry->second - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_') - .exclude('$'); - } - } - - current_conflict_detection_token_index = right; - current_conflict_detection_following_characters = following_characters_entry->second; - add_lex_state(item_set_for_terminals({{Symbol::terminal(left), {}}, {Symbol::terminal(right), {}}})); - bool result = current_conflict_value; - clear(); - return result; + return shadowed_tokens_by_token[index]; } + private: bool is_keyword(const LexicalVariable &variable) { - return variable.is_string && iswalpha(get_last_character(variable.rule)); - } - - static uint32_t get_last_character(const Rule &rule) { - return rule.match( - [](const Seq &sequence) { return get_last_character(*sequence.right); }, - [](const rules::CharacterSet &rule) { return *rule.included_chars.begin(); }, - [](const rules::Metadata &rule) { return get_last_character(*rule.rule); }, - [](auto) { return 0; } - ); + EndingCharacterAggregator aggregator; + aggregator.apply(variable.rule); + return + !aggregator.result.includes_all && + aggregator.result.included_chars.size() == 1 && + iswalpha(*aggregator.result.included_chars.begin()); } LexStateId add_lex_state(const LexItemSet &item_set) { @@ -208,11 +212,9 @@ class LexTableBuilderImpl : public LexTableBuilder { void clear() { lex_table.states.clear(); lex_state_ids.clear(); - current_conflict_detection_following_characters = CharacterSet(); - current_conflict_value = false; + conflict_status_by_token = vector(grammar.variables.size(), false); } - private: void add_advance_actions(const LexItemSet &item_set, LexStateId state_id) { for (const auto &pair : item_set.transitions()) { const CharacterSet &characters = pair.first; @@ -221,23 +223,28 @@ class LexTableBuilderImpl : public LexTableBuilder { AdvanceAction action(-1, transition.precedence, transition.in_main_token); AcceptTokenAction &accept_action = lex_table.states[state_id].accept_action; if (accept_action.is_present()) { - bool prefer_advancing = conflict_manager.resolve(transition.destination, action, accept_action); - bool can_advance_for_accepted_token = false; - for (const LexItem &item : transition.destination.entries) { - if (item.lhs == accept_action.symbol) { - can_advance_for_accepted_token = true; - } else if (item.lhs.index == current_conflict_detection_token_index && - !prefer_advancing && !transition.in_main_token) { - current_conflict_value = true; - } - } + bool prefer_advancing = conflict_manager.resolve( + transition.destination, + action, + accept_action + ); - if (accept_action.symbol.index == current_conflict_detection_token_index && - !can_advance_for_accepted_token && - (characters.intersects(separator_start_characters) || - (characters.intersects(current_conflict_detection_following_characters) && - grammar.variables[accept_action.symbol.index].is_string))) { - current_conflict_value = true; + if (conflict_detection_mode) { + bool next_item_set_can_yield_this_token = false; + for (const LexItem &item : transition.destination.entries) { + if (item.lhs == accept_action.symbol) { + next_item_set_can_yield_this_token = true; + } else if (!prefer_advancing && !transition.in_main_token) { + conflict_status_by_token[item.lhs.index] = true; + } + } + + if (prefer_advancing && + !next_item_set_can_yield_this_token && + (characters.intersects(following_characters_by_token[accept_action.symbol.index]) || + characters.intersects(separator_start_characters))) { + conflict_status_by_token[accept_action.symbol.index] = true; + } } if (!prefer_advancing) continue; @@ -256,10 +263,15 @@ class LexTableBuilderImpl : public LexTableBuilder { item.lhs.is_built_in() || grammar.variables[item.lhs.index].is_string); AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action; - if (!existing_action.is_present() || - conflict_manager.resolve(action, existing_action)) { - lex_table.states[state_id].accept_action = action; + if (existing_action.is_present()) { + if (conflict_manager.resolve(action, existing_action)) { + conflict_status_by_token[existing_action.symbol.index] = true; + } else { + conflict_status_by_token[action.symbol.index] = true; + continue; + } } + lex_table.states[state_id].accept_action = action; } } } @@ -292,6 +304,39 @@ class LexTableBuilderImpl : public LexTableBuilder { } } + bool merge_token_set(LookaheadSet *left, const LookaheadSet &right) const { + bool is_compatible = true; + + left->for_each([&](Symbol left_symbol) { + if (left_symbol.is_terminal() && !left_symbol.is_built_in() && !right.contains(left_symbol)) { + right.for_each([&](Symbol right_symbol) { + if (shadowed_tokens_by_token[left_symbol.index].count(right_symbol) || + !coincident_tokens_by_token[left_symbol.index].contains(right_symbol)) { + is_compatible = false; + return; + } + }); + } + if (!is_compatible) return; + }); + + right.for_each([&](Symbol right_symbol) { + if (right_symbol.is_terminal() && !right_symbol.is_built_in() && !left->contains(right_symbol)) { + left->for_each([&](Symbol left_symbol) { + if (shadowed_tokens_by_token[right_symbol.index].count(left_symbol) || + !coincident_tokens_by_token[right_symbol.index].contains(left_symbol)) { + is_compatible = false; + return; + } + }); + } + if (!is_compatible) return; + }); + + if (is_compatible) left->insert_all(right); + return is_compatible; + } + void remove_duplicate_lex_states(ParseTable *parse_table) { for (LexState &state : lex_table.states) { state.accept_action.is_string = false; @@ -359,10 +404,9 @@ class LexTableBuilderImpl : public LexTableBuilder { } } - LexItemSet item_set_for_terminals(const map &terminals) { + LexItemSet item_set_for_terminals(const LookaheadSet &terminals) { LexItemSet result; - for (const auto &pair : terminals) { - Symbol symbol = pair.first; + terminals.for_each([&](Symbol symbol) { if (symbol.is_terminal()) { for (const auto &rule : rules_for_symbol(symbol)) { for (const auto &separator_rule : separator_rules) { @@ -378,7 +422,7 @@ class LexTableBuilderImpl : public LexTableBuilder { } } } - } + }); return result; } @@ -401,11 +445,13 @@ class LexTableBuilderImpl : public LexTableBuilder { unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar, - const vector> &following_tokens) { + const vector &following_tokens, + const vector &coincident_tokens) { return unique_ptr(new LexTableBuilderImpl( syntax_grammar, lexical_grammar, - following_tokens + following_tokens, + coincident_tokens )); } diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 2bb7a56a..af36c1a2 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -14,11 +14,14 @@ struct LexicalGrammar; namespace build_tables { +class LookaheadSet; + class LexTableBuilder { public: static std::unique_ptr create(const SyntaxGrammar &, const LexicalGrammar &, - const std::vector> &); + const std::vector &, + const std::vector &); LexTable build(ParseTable *); const std::set &get_incompatible_tokens(rules::Symbol::Index) const; diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 3b59c8ae..4b983fa5 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -52,7 +52,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { ParseItemSetBuilder item_set_builder; unique_ptr lex_table_builder; set fragile_reductions; - vector> following_tokens_by_token_index; + vector following_tokens_by_token; + vector coincident_tokens_by_token; bool processing_recovery_states; public: @@ -60,8 +61,22 @@ class ParseTableBuilderImpl : public ParseTableBuilder { : grammar(syntax_grammar), lexical_grammar(lexical_grammar), item_set_builder(syntax_grammar, lexical_grammar), - following_tokens_by_token_index(lexical_grammar.variables.size()), - processing_recovery_states(false) {} + following_tokens_by_token(lexical_grammar.variables.size()), + coincident_tokens_by_token(lexical_grammar.variables.size()), + processing_recovery_states(false) { + + for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { + coincident_tokens_by_token[i].insert(rules::END_OF_INPUT()); + if (lexical_grammar.variables[i].is_string) { + for (unsigned j = 0; j < i; j++) { + if (lexical_grammar.variables[j].is_string) { + coincident_tokens_by_token[i].insert(Symbol::terminal(j)); + coincident_tokens_by_token[j].insert(Symbol::terminal(i)); + } + } + } + } + } tuple build() { // Ensure that the empty rename sequence has index 0. @@ -90,7 +105,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { lex_table_builder = LexTableBuilder::create( grammar, lexical_grammar, - following_tokens_by_token_index + following_tokens_by_token, + coincident_tokens_by_token ); processing_recovery_states = true; @@ -130,17 +146,18 @@ class ParseTableBuilderImpl : public ParseTableBuilder { for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { Symbol token = Symbol::terminal(i); - bool has_non_reciprocal_conflict = false; + const LexicalVariable &variable = lexical_grammar.variables[i]; + bool exclude_from_recovery_state = false; for (Symbol incompatible_token : lex_table_builder->get_incompatible_tokens(i)) { - if (incompatible_token.is_terminal() && - !lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token)) { - has_non_reciprocal_conflict = true; + if (!coincident_tokens_by_token[i].contains(incompatible_token) && + ((lexical_grammar.variables[incompatible_token.index].is_string && !variable.is_string) || + !lex_table_builder->get_incompatible_tokens(incompatible_token.index).count(token))) { + exclude_from_recovery_state = true; break; } } - - if (!has_non_reciprocal_conflict) { + if (!exclude_from_recovery_state) { add_out_of_context_parse_state(&error_state, Symbol::terminal(i)); } } @@ -163,8 +180,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { parse_table.states[state_id] = error_state; } - void add_out_of_context_parse_state(ParseState *error_state, - const rules::Symbol &symbol) { + void add_out_of_context_parse_state(ParseState *error_state, const rules::Symbol &symbol) { const ParseItemSet &item_set = recovery_item_sets_by_lookahead[symbol]; if (!item_set.entries.empty()) { ParseStateId state = add_parse_state({}, item_set); @@ -300,6 +316,16 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } } + auto &terminals = state.terminal_entries; + for (auto iter = terminals.begin(), end = terminals.end(); iter != end; ++iter) { + if (iter->first.is_built_in() || iter->first.is_external()) continue; + for (auto other_iter = terminals.begin(); other_iter != iter; ++other_iter) { + if (other_iter->first.is_built_in() || other_iter->first.is_external()) continue; + coincident_tokens_by_token[iter->first.index].insert(other_iter->first); + coincident_tokens_by_token[other_iter->first.index].insert(iter->first); + } + } + return ""; } @@ -767,7 +793,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { if (left_symbol.is_terminal() && !left_symbol.is_built_in()) { right_tokens.for_each([&](Symbol right_symbol) { if (right_symbol.is_terminal() && !right_symbol.is_built_in()) { - following_tokens_by_token_index[left_symbol.index].insert(right_symbol.index); + following_tokens_by_token[left_symbol.index].insert(right_symbol); } }); } diff --git a/src/compiler/lex_table.h b/src/compiler/lex_table.h index 9317c818..6de0792d 100644 --- a/src/compiler/lex_table.h +++ b/src/compiler/lex_table.h @@ -16,6 +16,7 @@ struct AdvanceAction { AdvanceAction(); AdvanceAction(size_t, PrecedenceRange, bool); bool operator==(const AdvanceAction &other) const; + inline bool operator!=(const AdvanceAction &other) const { return !operator==(other); } LexStateId state_index; PrecedenceRange precedence_range; @@ -26,7 +27,8 @@ struct AcceptTokenAction { AcceptTokenAction(); AcceptTokenAction(rules::Symbol, int, bool); bool is_present() const; - bool operator==(const AcceptTokenAction &action) const; + bool operator==(const AcceptTokenAction &other) const; + inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); } rules::Symbol symbol; int precedence; diff --git a/src/runtime/parser.c b/src/runtime/parser.c index a989989d..0429da3b 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -234,6 +234,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { bool found_external_token = false; bool skipped_error = false; + bool error_mode = parse_state == ERROR_STATE; int32_t first_error_character = 0; Length error_start_position, error_end_position; uint32_t last_byte_scanned = start_position.bytes; @@ -260,8 +261,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { self->lexer.token_end_position = self->lexer.current_position; } - if (lex_mode.lex_state == ERROR_STATE && - self->lexer.token_end_position.bytes <= current_position.bytes) { + if (error_mode && self->lexer.token_end_position.bytes <= current_position.bytes) { LOG("disregard_empty_token"); } else { found_external_token = true; @@ -291,6 +291,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { if (lex_mode.lex_state != self->language->lex_modes[ERROR_STATE].lex_state) { LOG("retry_in_error_mode"); + error_mode = true; lex_mode = self->language->lex_modes[ERROR_STATE]; valid_external_tokens = ts_language_enabled_external_tokens( self->language, From 8b3941764fb422c9fdcd16922075de51d7cdbf03 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 7 Sep 2017 17:48:44 -0700 Subject: [PATCH 04/12] Make outstanding_allocation_indices return a vector, not a set --- test/helpers/record_alloc.cc | 14 ++++++-------- test/helpers/record_alloc.h | 4 ++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/test/helpers/record_alloc.cc b/test/helpers/record_alloc.cc index a3f0b49f..43e11abe 100644 --- a/test/helpers/record_alloc.cc +++ b/test/helpers/record_alloc.cc @@ -1,9 +1,9 @@ #include #include -#include +#include using std::map; -using std::set; +using std::vector; static bool _enabled = false; static size_t _allocation_count = 0; @@ -21,10 +21,10 @@ void stop() { _enabled = false; } -set outstanding_allocation_indices() { - set result; +vector outstanding_allocation_indices() { + vector result; for (const auto &entry : _outstanding_allocations) { - result.insert(entry.second); + result.push_back(entry.second); } return result; } @@ -38,9 +38,7 @@ size_t allocation_count() { extern "C" { static void *record_allocation(void *result) { - if (!_enabled) - return result; - + if (!_enabled) return result; _outstanding_allocations[result] = _allocation_count; _allocation_count++; return result; diff --git a/test/helpers/record_alloc.h b/test/helpers/record_alloc.h index 50cd62ad..1f5968ac 100644 --- a/test/helpers/record_alloc.h +++ b/test/helpers/record_alloc.h @@ -1,14 +1,14 @@ #ifndef HELPERS_RECORD_ALLOC_H_ #define HELPERS_RECORD_ALLOC_H_ -#include +#include namespace record_alloc { void start(); void stop(); void fail_at_allocation_index(size_t failure_index); -std::set outstanding_allocation_indices(); +std::vector outstanding_allocation_indices(); size_t allocation_count(); } // namespace record_alloc From 99d048e016d52de89f6fdaa1f98bd22fdf6e5601 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 11 Sep 2017 15:22:52 -0700 Subject: [PATCH 05/12] Simplify error recovery; eliminate recovery states The previous approach to error recovery relied on special error-recovery states in the parse table. For each token T, there was an error recovery state in which the parser looked for *any* token that could follow T. Unfortunately, sometimes the set of tokens that could follow T contained conflicts. For example, in JS, the token '}' can be followed by the open-ended 'template_chars' token, but also by ordinary tokens like 'identifier'. So with the old algorithm, when recovering from an unexpected '}' token, the lexer had no way to distinguish identifiers from template_chars. This commit drops the error recovery states. Instead, when we encounter an unexpected token T, we recover from the error by finding a previous state S in the stack in which T would be valid, popping all of the nodes after S, and wrapping them in an error. This way, the lexer is always invoked in a normal parse state, in which it is looking for a non-conflicting set of tokens. Eliminating the error recovery states also shrinks the lex state machine significantly. Signed-off-by: Rick Winfrey --- include/tree_sitter/parser.h | 9 +- .../build_tables/parse_table_builder.cc | 52 +- src/compiler/generate_code/c_code.cc | 2 +- src/compiler/parse_table.cc | 5 +- src/compiler/parse_table.h | 2 +- src/runtime/error_costs.c | 22 +- src/runtime/error_costs.h | 2 +- src/runtime/language.c | 31 +- src/runtime/parser.c | 464 ++++-------------- src/runtime/stack.c | 239 +++++---- src/runtime/stack.h | 16 +- test/fixtures/error_corpus/c_errors.txt | 39 +- .../error_corpus/javascript_errors.txt | 34 +- test/runtime/parser_test.cc | 4 +- test/runtime/stack_test.cc | 45 -- 15 files changed, 327 insertions(+), 639 deletions(-) diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index e4b27d12..d9a8e197 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -139,12 +139,9 @@ typedef struct TSLanguage { } \ } -#define RECOVER(state_value) \ - { \ - { \ - .type = TSParseActionTypeRecover, \ - .params = {.state = state_value} \ - } \ +#define RECOVER() \ + { \ + { .type = TSParseActionTypeRecover } \ } #define SHIFT_EXTRA() \ diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 4b983fa5..8666ce1d 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -44,7 +44,6 @@ struct ParseStateQueueEntry { class ParseTableBuilderImpl : public ParseTableBuilder { const SyntaxGrammar grammar; const LexicalGrammar lexical_grammar; - unordered_map recovery_item_sets_by_lookahead; unordered_map state_ids_by_item_set; vector item_sets_by_state_id; deque parse_state_queue; @@ -54,7 +53,6 @@ class ParseTableBuilderImpl : public ParseTableBuilder { set fragile_reductions; vector following_tokens_by_token; vector coincident_tokens_by_token; - bool processing_recovery_states; public: ParseTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar) @@ -62,8 +60,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { lexical_grammar(lexical_grammar), item_set_builder(syntax_grammar, lexical_grammar), following_tokens_by_token(lexical_grammar.variables.size()), - coincident_tokens_by_token(lexical_grammar.variables.size()), - processing_recovery_states(false) { + coincident_tokens_by_token(lexical_grammar.variables.size()) { for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { coincident_tokens_by_token[i].insert(rules::END_OF_INPUT()); @@ -109,10 +106,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { coincident_tokens_by_token ); - processing_recovery_states = true; build_error_parse_state(error_state_id); - process_part_state_queue(); - mark_fragile_actions(); remove_duplicate_parse_states(); @@ -142,8 +136,6 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } void build_error_parse_state(ParseStateId state_id) { - ParseState error_state; - for (unsigned i = 0; i < lexical_grammar.variables.size(); i++) { Symbol token = Symbol::terminal(i); const LexicalVariable &variable = lexical_grammar.variables[i]; @@ -158,38 +150,21 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } } if (!exclude_from_recovery_state) { - add_out_of_context_parse_state(&error_state, Symbol::terminal(i)); + parse_table.add_terminal_action(state_id, Symbol::terminal(i), ParseAction::Recover()); } } for (const Symbol &symbol : grammar.extra_tokens) { - if (!error_state.terminal_entries.count(symbol)) { - error_state.terminal_entries[symbol].actions.push_back(ParseAction::ShiftExtra()); + if (!parse_table.states[state_id].terminal_entries.count(symbol)) { + parse_table.add_terminal_action(state_id, symbol, ParseAction::ShiftExtra()); } } for (size_t i = 0; i < grammar.external_tokens.size(); i++) { - add_out_of_context_parse_state(&error_state, Symbol::external(i)); + parse_table.states[state_id].terminal_entries[Symbol::external(i)].actions.push_back(ParseAction::Recover()); } - for (size_t i = 0; i < grammar.variables.size(); i++) { - add_out_of_context_parse_state(&error_state, Symbol::non_terminal(i)); - } - - error_state.terminal_entries[END_OF_INPUT()].actions.push_back(ParseAction::Recover(0)); - parse_table.states[state_id] = error_state; - } - - void add_out_of_context_parse_state(ParseState *error_state, const rules::Symbol &symbol) { - const ParseItemSet &item_set = recovery_item_sets_by_lookahead[symbol]; - if (!item_set.entries.empty()) { - ParseStateId state = add_parse_state({}, item_set); - if (symbol.is_non_terminal()) { - error_state->nonterminal_entries[symbol.index] = state; - } else { - error_state->terminal_entries[symbol].actions.assign({ ParseAction::Recover(state) }); - } - } + parse_table.add_terminal_action(state_id, END_OF_INPUT(), ParseAction::Recover()); } ParseStateId add_parse_state(SymbolSequence &&preceding_symbols, const ParseItemSet &item_set) { @@ -241,7 +216,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { parse_table.add_terminal_action(state_id, lookahead, action); } else { ParseAction &existing_action = entry.actions[0]; - if (existing_action.type == ParseActionTypeAccept || processing_recovery_states) { + if (existing_action.type == ParseActionTypeAccept) { entry.actions.push_back(action); } else { if (action.precedence > existing_action.precedence) { @@ -281,11 +256,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { ParseItemSet &next_item_set = pair.second; ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); - if (!processing_recovery_states) { - recovery_item_sets_by_lookahead[lookahead].add(next_item_set); - if (!parse_table.states[state_id].terminal_entries[lookahead].actions.empty()) { - lookaheads_with_conflicts.insert(lookahead); - } + if (!parse_table.states[state_id].terminal_entries[lookahead].actions.empty()) { + lookaheads_with_conflicts.insert(lookahead); } parse_table.add_terminal_action(state_id, lookahead, ParseAction::Shift(next_state_id)); @@ -297,9 +269,6 @@ class ParseTableBuilderImpl : public ParseTableBuilder { ParseItemSet &next_item_set = pair.second; ParseStateId next_state_id = add_parse_state(append_symbol(sequence, lookahead), next_item_set); parse_table.set_nonterminal_action(state_id, lookahead.index, next_state_id); - if (!processing_recovery_states) { - recovery_item_sets_by_lookahead[lookahead].add(next_item_set); - } } for (Symbol lookahead : lookaheads_with_conflicts) { @@ -310,8 +279,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { ParseAction shift_extra = ParseAction::ShiftExtra(); ParseState &state = parse_table.states[state_id]; for (const Symbol &extra_symbol : grammar.extra_tokens) { - if (!state.terminal_entries.count(extra_symbol) || - state.has_shift_action() || processing_recovery_states) { + if (!state.terminal_entries.count(extra_symbol) || state.has_shift_action()) { parse_table.add_terminal_action(state_id, extra_symbol, shift_extra); } } diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index cbc656a8..d11b3084 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -656,7 +656,7 @@ class CCodeGenerator { add(")"); break; case ParseActionTypeRecover: - add("RECOVER(" + to_string(action.state_index) + ")"); + add("RECOVER()"); break; default: {} } diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index e79e02dd..4d10907c 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -40,10 +40,9 @@ ParseAction ParseAction::Shift(ParseStateId state_index) { return result; } -ParseAction ParseAction::Recover(ParseStateId state_index) { +ParseAction ParseAction::Recover() { ParseAction result; result.type = ParseActionTypeRecover; - result.state_index = state_index; return result; } @@ -133,7 +132,7 @@ bool ParseState::has_shift_action() const { void ParseState::each_referenced_state(function fn) { for (auto &entry : terminal_entries) for (ParseAction &action : entry.second.actions) - if ((action.type == ParseActionTypeShift && !action.extra) || action.type == ParseActionTypeRecover) + if (action.type == ParseActionTypeShift && !action.extra) fn(&action.state_index); for (auto &entry : nonterminal_entries) fn(&entry.second); diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 40a44dfe..39e0080b 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -27,7 +27,7 @@ struct ParseAction { static ParseAction Accept(); static ParseAction Error(); static ParseAction Shift(ParseStateId state_index); - static ParseAction Recover(ParseStateId state_index); + static ParseAction Recover(); static ParseAction Reduce(rules::Symbol symbol, size_t child_count, int precedence, int dynamic_precedence, rules::Associativity, unsigned alias_sequence_id); diff --git a/src/runtime/error_costs.c b/src/runtime/error_costs.c index ac055f45..d165572e 100644 --- a/src/runtime/error_costs.c +++ b/src/runtime/error_costs.c @@ -4,25 +4,21 @@ static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; static const unsigned MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE = 24; ErrorComparison error_status_compare(ErrorStatus a, ErrorStatus b, bool are_mergeable) { - if (a.count < b.count) { - if (are_mergeable || - a.cost <= b.cost || - a.count + 1 < b.count || - b.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { + ErrorComparison result = ErrorComparisonNone; + + if (!a.recovering && b.recovering) { + if (a.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { return ErrorComparisonTakeLeft; } else { - return ErrorComparisonPreferLeft; + result = ErrorComparisonPreferLeft; } } - if (b.count < a.count) { - if (are_mergeable || - b.cost <= a.cost || - b.count + 1 < a.count || - a.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { + if (!b.recovering && a.recovering) { + if (b.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { return ErrorComparisonTakeRight; } else { - return ErrorComparisonPreferRight; + result = ErrorComparisonPreferRight; } } @@ -42,5 +38,5 @@ ErrorComparison error_status_compare(ErrorStatus a, ErrorStatus b, bool are_merg } } - return ErrorComparisonNone; + return result; } diff --git a/src/runtime/error_costs.h b/src/runtime/error_costs.h index f65b9c93..380b96bd 100644 --- a/src/runtime/error_costs.h +++ b/src/runtime/error_costs.h @@ -13,10 +13,10 @@ extern "C" { #define ERROR_COST_PER_SKIPPED_CHAR 1 typedef struct { - unsigned count; unsigned cost; unsigned push_count; unsigned depth; + bool recovering; } ErrorStatus; typedef enum { diff --git a/src/runtime/language.c b/src/runtime/language.c index 8f6c37ac..cb4e7383 100644 --- a/src/runtime/language.c +++ b/src/runtime/language.c @@ -2,33 +2,22 @@ #include "runtime/tree.h" #include "runtime/error_costs.h" -static const TSParseAction SHIFT_ERROR = { - .type = TSParseActionTypeShift, - .params.state = ERROR_STATE, -}; - void ts_language_table_entry(const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result) { - uint32_t action_index; if (symbol == ts_builtin_sym_error) { - if (state == ERROR_STATE) { - result->action_count = 1; - result->is_reusable = false; - result->depends_on_lookahead = false; - result->actions = &SHIFT_ERROR; - return; - } - action_index = 0; + result->action_count = 0; + result->is_reusable = false; + result->actions = NULL; + return; } else { assert(symbol < self->token_count); - action_index = self->parse_table[state * self->symbol_count + symbol]; + uint32_t action_index = self->parse_table[state * self->symbol_count + symbol]; + const TSParseActionEntry *entry = &self->parse_actions[action_index]; + result->action_count = entry->count; + result->is_reusable = entry->reusable; + result->depends_on_lookahead = entry->depends_on_lookahead; + result->actions = (const TSParseAction *)(entry + 1); } - - const TSParseActionEntry *entry = &self->parse_actions[action_index]; - result->action_count = entry->count; - result->is_reusable = entry->reusable; - result->depends_on_lookahead = entry->depends_on_lookahead; - result->actions = (const TSParseAction *)(entry + 1); } uint32_t ts_language_symbol_count(const TSLanguage *language) { diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 0429da3b..60aa7da4 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -35,23 +35,6 @@ #define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) static const uint32_t MAX_VERSION_COUNT = 10; -static const uint32_t MAX_PRECEDING_TREES_TO_SKIP = 32; - -typedef struct { - Parser *parser; - TSSymbol lookahead_symbol; - TreeArray *trees_above_error; - uint32_t tree_count_above_error; - bool found_repair; - ReduceAction best_repair; - TSStateId best_repair_next_state; - uint32_t best_repair_skip_count; -} ErrorRepairSession; - -typedef struct { - Parser *parser; - TSSymbol lookahead_symbol; -} SkipPrecedingTreesSession; static void parser__log(Parser *self) { if (self->lexer.logger.log) { @@ -110,8 +93,8 @@ static bool parser__breakdown_top_of_stack(Parser *self, StackVersion version) { LOG("breakdown_top_of_stack tree:%s", SYM_NAME(parent->symbol)); LOG_STACK(); - ts_stack_decrease_push_count(self->stack, slice.version, - parent->child_count + 1); + ts_stack_decrease_push_count(self->stack, slice.version, parent->child_count + 1); + ts_tree_release(parent); array_delete(&slice.trees); } @@ -148,7 +131,7 @@ static bool parser__condense_stack(Parser *self) { } ErrorStatus right_error_status = ts_stack_error_status(self->stack, i); - if (right_error_status.count == 0) all_versions_have_error = false; + if (!right_error_status.recovering) all_versions_have_error = false; for (StackVersion j = 0; j < i; j++) { bool can_merge = ts_stack_can_merge(self->stack, i, j); @@ -210,7 +193,6 @@ static bool parser__condense_stack(Parser *self) { } static void parser__restore_external_scanner(Parser *self, Tree *external_token) { - LOG("restore_external_scanner"); if (external_token) { self->language->external_scanner.deserialize( self->external_scanner_payload, @@ -222,8 +204,7 @@ static void parser__restore_external_scanner(Parser *self, Tree *external_token) } } -static Tree *parser__lex(Parser *self, StackVersion version) { - TSStateId parse_state = ts_stack_top_state(self->stack, version); +static Tree *parser__lex(Parser *self, StackVersion version, TSStateId parse_state) { Length start_position = ts_stack_top_position(self->stack, version); Tree *external_token = ts_stack_last_external_token(self->stack, version); TSLexMode lex_mode = self->language->lex_modes[parse_state]; @@ -289,7 +270,7 @@ static Tree *parser__lex(Parser *self, StackVersion version) { break; } - if (lex_mode.lex_state != self->language->lex_modes[ERROR_STATE].lex_state) { + if (!error_mode) { LOG("retry_in_error_mode"); error_mode = true; lex_mode = self->language->lex_modes[ERROR_STATE]; @@ -463,7 +444,7 @@ static Tree *parser__get_lookahead(Parser *self, StackVersion version, TSStateId } } - result = parser__lex(self, version); + result = parser__lex(self, version, *state); parser__set_cached_token(self, position.bytes, last_external_token, result); ts_language_table_entry(self->language, *state, result->symbol, table_entry); return result; @@ -580,14 +561,12 @@ static bool parser__replace_children(Parser *self, Tree *tree, Tree **children, } } -static StackPopResult parser__reduce(Parser *self, StackVersion version, - TSSymbol symbol, uint32_t count, - int dynamic_precedence, uint16_t alias_sequence_id, - bool fragile, bool allow_skipping) { +static StackPopResult parser__reduce(Parser *self, StackVersion version, TSSymbol symbol, + uint32_t count, int dynamic_precedence, + uint16_t alias_sequence_id, bool fragile) { uint32_t initial_version_count = ts_stack_version_count(self->stack); StackPopResult pop = ts_stack_pop_count(self->stack, version, count); - if (pop.stopped_at_error) return pop; for (uint32_t i = 0; i < pop.slices.size; i++) { StackSlice slice = pop.slices.contents[i]; @@ -639,24 +618,6 @@ static StackPopResult parser__reduce(Parser *self, StackVersion version, parent->parse_state = state; } - // If this pop operation terminated at the end of an error region, then - // create two stack versions: one in which the parent node is interpreted - // normally, and one in which the parent node is skipped. - if (state == ERROR_STATE && allow_skipping && child_count > 1) { - StackVersion other_version = ts_stack_copy_version(self->stack, slice.version); - - ts_stack_push(self->stack, other_version, parent, false, ERROR_STATE); - for (uint32_t j = parent->child_count; j < slice.trees.size; j++) { - Tree *tree = slice.trees.contents[j]; - ts_stack_push(self->stack, other_version, tree, false, ERROR_STATE); - } - - ErrorStatus error_status = ts_stack_error_status(self->stack, other_version); - if (parser__better_version_exists(self, version, error_status)) { - ts_stack_remove_version(self->stack, other_version); - } - } - // Push the parent node onto the stack, along with any extra tokens that // were previously on top of the stack. ts_stack_push(self->stack, slice.version, parent, false, next_state); @@ -680,211 +641,6 @@ static StackPopResult parser__reduce(Parser *self, StackVersion version, return pop; } -static const TSParseAction *parser__reductions_after_sequence(Parser *self, - TSStateId start_state, - const TreeArray *trees_below, - uint32_t tree_count_below, - const TreeArray *trees_above, - TSSymbol lookahead_symbol, - uint32_t *count) { - TSStateId state = start_state; - uint32_t child_count = 0; - *count = 0; - - for (uint32_t i = 0; i < trees_below->size; i++) { - if (child_count == tree_count_below) - break; - Tree *tree = trees_below->contents[trees_below->size - 1 - i]; - if (tree->extra) continue; - TSStateId next_state = ts_language_next_state(self->language, state, tree->symbol); - if (next_state == ERROR_STATE) - return NULL; - if (next_state != state) { - child_count++; - state = next_state; - } - } - - for (uint32_t i = 0; i < trees_above->size; i++) { - Tree *tree = trees_above->contents[i]; - if (tree->extra) continue; - TSStateId next_state = ts_language_next_state(self->language, state, tree->symbol); - if (next_state == ERROR_STATE) - return NULL; - if (next_state != state) { - child_count++; - state = next_state; - } - } - - const TSParseAction *actions = - ts_language_actions(self->language, state, lookahead_symbol, count); - - if (*count > 0 && actions[*count - 1].type != TSParseActionTypeReduce) { - (*count)--; - } - - while (*count > 0 && actions[0].params.child_count < child_count) { - actions++; - (*count)--; - } - - while (*count > 0 && actions[*count - 1].params.child_count > child_count) { - (*count)--; - } - - return actions; -} - -static StackIterateAction parser__repair_error_callback(void *payload, TSStateId state, - const TreeArray *trees, - uint32_t tree_count) { - ErrorRepairSession *session = payload; - Parser *self = session->parser; - TSSymbol lookahead_symbol = session->lookahead_symbol; - ReduceActionSet *repairs = &self->reduce_actions; - TreeArray *trees_above_error = session->trees_above_error; - uint32_t tree_count_above_error = session->tree_count_above_error; - - StackIterateAction result = StackIterateNone; - - uint32_t last_repair_count = -1; - uint32_t repair_reduction_count = 0; - const TSParseAction *repair_reductions = NULL; - - for (uint32_t i = 0; i < repairs->size; i++) { - ReduceAction *repair = &repairs->contents[i]; - uint32_t count_needed_below_error = repair->count - tree_count_above_error; - if (count_needed_below_error > tree_count) - break; - - uint32_t skip_count = tree_count - count_needed_below_error; - if (session->found_repair && skip_count >= session->best_repair_skip_count) { - array_erase(repairs, i--); - continue; - } - - TSStateId state_after_repair = ts_language_next_state(self->language, state, repair->symbol); - if (state == ERROR_STATE || state_after_repair == ERROR_STATE) - continue; - - uint32_t action_count; - ts_language_actions(self->language, state_after_repair, lookahead_symbol, &action_count); - if (action_count == 0) - continue; - - if (count_needed_below_error != last_repair_count) { - last_repair_count = count_needed_below_error; - repair_reductions = parser__reductions_after_sequence( - self, state, trees, count_needed_below_error, trees_above_error, - lookahead_symbol, &repair_reduction_count); - } - - for (uint32_t j = 0; j < repair_reduction_count; j++) { - if (repair_reductions[j].params.symbol == repair->symbol) { - result |= StackIteratePop; - session->found_repair = true; - session->best_repair = *repair; - session->best_repair_skip_count = skip_count; - session->best_repair_next_state = state_after_repair; - array_erase(repairs, i--); - break; - } - } - } - - if (repairs->size == 0) - result |= StackIterateStop; - - return result; -} - -static bool parser__repair_error(Parser *self, StackSlice slice, - TSSymbol lookahead_symbol, TableEntry entry) { - LOG("repair_error"); - ErrorRepairSession session = { - .parser = self, - .lookahead_symbol = lookahead_symbol, - .found_repair = false, - .trees_above_error = &slice.trees, - .tree_count_above_error = ts_tree_array_essential_count(&slice.trees), - }; - - array_clear(&self->reduce_actions); - for (uint32_t i = 0; i < entry.action_count; i++) { - TSParseAction action = entry.actions[i]; - if (action.type == TSParseActionTypeReduce) { - TSSymbol symbol = action.params.symbol; - uint32_t child_count = action.params.child_count; - if ((child_count > session.tree_count_above_error) || - (child_count == session.tree_count_above_error && - !ts_language_symbol_metadata(self->language, symbol).visible)) - array_push(&self->reduce_actions, ((ReduceAction){ - .symbol = symbol, - .count = child_count, - .alias_sequence_id = action.params.alias_sequence_id, - })); - } - } - - StackPopResult pop = ts_stack_iterate( - self->stack, slice.version, parser__repair_error_callback, &session); - - if (!session.found_repair) { - LOG("no_repair_found"); - ts_stack_remove_version(self->stack, slice.version); - ts_tree_array_delete(&slice.trees); - return false; - } - - ReduceAction repair = session.best_repair; - TSStateId next_state = session.best_repair_next_state; - uint32_t skip_count = session.best_repair_skip_count; - - StackSlice new_slice = array_pop(&pop.slices); - TreeArray children = new_slice.trees; - ts_stack_renumber_version(self->stack, new_slice.version, slice.version); - - for (uint32_t i = pop.slices.size - 1; i + 1 > 0; i--) { - StackSlice other_slice = pop.slices.contents[i]; - ts_tree_array_delete(&other_slice.trees); - if (other_slice.version != pop.slices.contents[i + 1].version) - ts_stack_remove_version(self->stack, other_slice.version); - } - - TreeArray skipped_children = ts_tree_array_remove_last_n(&children, skip_count); - TreeArray trailing_extras = ts_tree_array_remove_trailing_extras(&skipped_children); - Tree *error = ts_tree_make_error_node(&skipped_children, self->language); - error->extra = true; - array_push(&children, error); - array_push_all(&children, &trailing_extras); - trailing_extras.size = 0; - array_delete(&trailing_extras); - - for (uint32_t i = 0; i < slice.trees.size; i++) - array_push(&children, slice.trees.contents[i]); - array_delete(&slice.trees); - - Tree *parent = ts_tree_make_node( - repair.symbol, children.size, children.contents, - repair.alias_sequence_id, self->language - ); - ts_stack_push(self->stack, slice.version, parent, false, next_state); - ts_tree_release(parent); - ts_stack_decrease_push_count(self->stack, slice.version, error->child_count); - - ErrorStatus error_status = ts_stack_error_status(self->stack, slice.version); - if (parser__better_version_exists(self, slice.version, error_status)) { - LOG("no_better_repair_found"); - ts_stack_halt(self->stack, slice.version); - return false; - } else { - LOG("repair_found sym:%s, child_count:%u, cost:%u", SYM_NAME(repair.symbol), - repair.count, parent->error_cost); - return true; - } -} - static void parser__start(Parser *self, TSInput input, Tree *previous_tree) { if (previous_tree) { LOG("parse_after_edit"); @@ -986,18 +742,12 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) bool did_reduce = false; for (uint32_t i = 0; i < self->reduce_actions.size; i++) { ReduceAction action = self->reduce_actions.contents[i]; - StackPopResult reduction = parser__reduce( + parser__reduce( self, version, action.symbol, action.count, action.dynamic_precedence, action.alias_sequence_id, - true, false + true ); - if (reduction.stopped_at_error) { - ts_tree_array_delete(&reduction.slices.contents[0].trees); - ts_stack_remove_version(self->stack, reduction.slices.contents[0].version); - continue; - } else { - did_reduce = true; - } + did_reduce = true; } if (did_reduce) { @@ -1012,59 +762,12 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) } } -static StackIterateAction parser__skip_preceding_trees_callback( - void *payload, TSStateId state, const TreeArray *trees, uint32_t tree_count) { - if (trees->size > MAX_PRECEDING_TREES_TO_SKIP) return StackIterateStop; - if (tree_count > 0 && state != ERROR_STATE) { - uint32_t bytes_skipped = 0; - for (uint32_t i = 0; i < trees->size; i++) { - bytes_skipped += ts_tree_total_bytes(trees->contents[i]); - } - if (bytes_skipped == 0) return StackIterateNone; - SkipPrecedingTreesSession *session = payload; - Parser *self = session->parser; - TSSymbol lookahead_symbol = session->lookahead_symbol; - uint32_t action_count; - const TSParseAction *actions = - ts_language_actions(self->language, state, lookahead_symbol, &action_count); - if (action_count > 0 && actions[0].type == TSParseActionTypeReduce) { - return StackIteratePop | StackIterateStop; - } - } - return StackIterateNone; -} - -static bool parser__skip_preceding_trees(Parser *self, StackVersion version, - TSSymbol lookahead_symbol) { - SkipPrecedingTreesSession session = { self, lookahead_symbol }; - StackPopResult pop = ts_stack_iterate( - self->stack, version, parser__skip_preceding_trees_callback, &session); - - StackVersion previous_version = STACK_VERSION_NONE; - for (uint32_t i = 0; i < pop.slices.size; i++) { - StackSlice slice = pop.slices.contents[i]; - if (slice.version == previous_version) { - ts_tree_array_delete(&slice.trees); - continue; - } - - previous_version = slice.version; - Tree *error = ts_tree_make_error_node(&slice.trees, self->language); - error->extra = true; - TSStateId state = ts_stack_top_state(self->stack, slice.version); - ts_stack_push(self->stack, slice.version, error, false, state); - ts_tree_release(error); - } - - return pop.slices.size > 0; -} - -static void parser__handle_error(Parser *self, StackVersion version, - TSSymbol lookahead_symbol) { +static void parser__handle_error(Parser *self, StackVersion version, TSSymbol lookahead_symbol) { // If there are other stack versions that are clearly better than this one, // just halt this version. ErrorStatus error_status = ts_stack_error_status(self->stack, version); - error_status.count++; + error_status.recovering = true; + error_status.cost += ERROR_COST_PER_SKIPPED_TREE; if (parser__better_version_exists(self, version, error_status)) { ts_stack_halt(self->stack, version); LOG("bail_on_error"); @@ -1073,16 +776,6 @@ static void parser__handle_error(Parser *self, StackVersion version, LOG("handle_error"); - // If the current lookahead symbol would have been valid in some previous - // state on the stack, create one stack version that repairs the error - // immediately by simply skipping all of the trees that came after that state. - if (ts_stack_version_count(self->stack) < MAX_VERSION_COUNT) { - if (parser__skip_preceding_trees(self, version, lookahead_symbol)) { - LOG("skip_preceding_trees"); - LOG_STACK(); - } - } - // Perform any reductions that could have happened in this state, regardless // of the lookahead. uint32_t previous_version_count = ts_stack_version_count(self->stack); @@ -1103,6 +796,9 @@ static void parser__handle_error(Parser *self, StackVersion version, ts_stack_push(self->stack, previous_version_count, NULL, false, ERROR_STATE); ts_stack_force_merge(self->stack, version, previous_version_count); } + + ts_stack_record_summary(self->stack, version); + LOG_STACK(); } static void parser__halt_parse(Parser *self) { @@ -1130,8 +826,84 @@ static void parser__halt_parse(Parser *self) { ts_tree_release(eof); } -static void parser__recover(Parser *self, StackVersion version, TSStateId state, - Tree *lookahead) { +static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) { + unsigned previous_version_count = ts_stack_version_count(self->stack); + StackSummary *summary = ts_stack_get_summary(self->stack, version); + for (unsigned i = 0; i < summary->size; i++) { + StackSummaryEntry entry = summary->contents[i]; + if (entry.state == ERROR_STATE) continue; + unsigned depth = entry.depth + ts_stack_depth_since_error(self->stack, version); + + unsigned count = 0; + if (ts_language_actions(self->language, entry.state, lookahead->symbol, &count) && count > 0) { + LOG("recover state:%u, depth:%u", entry.state, depth); + StackPopResult pop = ts_stack_pop_count(self->stack, version, depth); + StackVersion previous_version = STACK_VERSION_NONE; + for (unsigned j = 0; j < pop.slices.size; j++) { + StackSlice slice = pop.slices.contents[j]; + if (slice.version == previous_version) { + ts_tree_array_delete(&slice.trees); + continue; + } + + if (ts_stack_top_state(self->stack, slice.version) != entry.state) { + ts_tree_array_delete(&slice.trees); + ts_stack_halt(self->stack, slice.version); + continue; + } + + StackPopResult error_pop = ts_stack_pop_error(self->stack, slice.version); + if (error_pop.slices.size > 0) { + StackSlice error_slice = error_pop.slices.contents[0]; + array_push_all(&error_slice.trees, &slice.trees); + array_delete(&slice.trees); + slice.trees = error_slice.trees; + ts_stack_renumber_version(self->stack, error_slice.version, slice.version); + } + + TreeArray trailing_extras = ts_tree_array_remove_trailing_extras(&slice.trees); + if (slice.trees.size > 0) { + Tree *error = ts_tree_make_error_node(&slice.trees, self->language); + error->extra = true; + ts_stack_push(self->stack, slice.version, error, false, entry.state); + ts_tree_release(error); + } else { + array_delete(&slice.trees); + } + previous_version = slice.version; + + for (unsigned k = 0; k < trailing_extras.size; k++) { + Tree *tree = trailing_extras.contents[k]; + ts_stack_push(self->stack, slice.version, tree, false, entry.state); + ts_tree_release(tree); + } + + array_delete(&trailing_extras); + } + break; + } + } + + for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { + if (ts_stack_is_halted(self->stack, i)) { + ts_stack_remove_version(self->stack, i); + i--; + } else { + for (unsigned j = 0; j < i; j++) { + if (ts_stack_can_merge(self->stack, j, i)) { + ts_stack_remove_version(self->stack, i); + i--; + break; + } + } + } + } + + if (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { + ts_stack_halt(self->stack, version); + return; + } + if (lookahead->symbol == ts_builtin_sym_end) { LOG("recover_eof"); TreeArray children = array_new(); @@ -1142,20 +914,14 @@ static void parser__recover(Parser *self, StackVersion version, TSStateId state, return; } - LOG("recover state:%u", state); + LOG("skip_token symbol:%s", SYM_NAME(lookahead->symbol)); + bool can_be_extra = ts_language_symbol_metadata(self->language, lookahead->symbol).extra; + parser__shift(self, version, ERROR_STATE, lookahead, can_be_extra); - if (ts_stack_version_count(self->stack) < MAX_VERSION_COUNT) { - StackVersion new_version = ts_stack_copy_version(self->stack, version); - bool can_be_extra = ts_language_symbol_metadata(self->language, lookahead->symbol).extra; - parser__shift(self, new_version, ERROR_STATE, lookahead, can_be_extra); - - ErrorStatus error_status = ts_stack_error_status(self->stack, new_version); - if (parser__better_version_exists(self, version, error_status)) { - ts_stack_remove_version(self->stack, new_version); - } + ErrorStatus error_status = ts_stack_error_status(self->stack, version); + if (parser__better_version_exists(self, version, error_status)) { + ts_stack_halt(self->stack, version); } - - parser__shift(self, version, state, lookahead, false); } static void parser__advance(Parser *self, StackVersion version, ReusableNode *reusable_node) { @@ -1164,7 +930,6 @@ static void parser__advance(Parser *self, StackVersion version, ReusableNode *re Tree *lookahead = parser__get_lookahead(self, version, &state, reusable_node, &table_entry); for (;;) { - bool reduction_stopped_at_error = false; StackVersion last_reduction_version = STACK_VERSION_NONE; for (uint32_t i = 0; i < table_entry.action_count; i++) { @@ -1193,26 +958,18 @@ static void parser__advance(Parser *self, StackVersion version, ReusableNode *re } case TSParseActionTypeReduce: { - if (reduction_stopped_at_error) continue; LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count); StackPopResult reduction = parser__reduce( self, version, action.params.symbol, action.params.child_count, action.params.dynamic_precedence, action.params.alias_sequence_id, - action.params.fragile, true + action.params.fragile ); StackSlice slice = *array_front(&reduction.slices); - if (reduction.stopped_at_error) { - reduction_stopped_at_error = true; - if (!parser__repair_error(self, slice, lookahead->first_leaf.symbol, table_entry)) { - break; - } - } last_reduction_version = slice.version; break; } case TSParseActionTypeAccept: { - if (ts_stack_error_status(self->stack, version).count > 0) continue; LOG("accept"); parser__accept(self, version, lookahead); ts_tree_release(lookahead); @@ -1221,13 +978,9 @@ static void parser__advance(Parser *self, StackVersion version, ReusableNode *re case TSParseActionTypeRecover: { while (lookahead->child_count > 0) { - reusable_node_breakdown(reusable_node); - ts_tree_release(lookahead); - lookahead = reusable_node->tree; - ts_tree_retain(lookahead); + parser__breakdown_lookahead(self, &lookahead, state, reusable_node); } - - parser__recover(self, version, action.params.state, lookahead); + parser__recover(self, version, lookahead); if (lookahead == reusable_node->tree) reusable_node_pop(reusable_node); ts_tree_release(lookahead); return; @@ -1307,12 +1060,13 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree, bool halt_on_err do { for (version = 0; version < ts_stack_version_count(self->stack); version++) { reusable_node = self->reusable_node; - last_position = position; while (!ts_stack_is_halted(self->stack, version)) { - position = ts_stack_top_position(self->stack, version).chars; - if (position > last_position || (version > 0 && position == last_position)) + position = ts_stack_top_position(self->stack, version).bytes; + if (position > last_position || (version > 0 && position == last_position)) { + last_position = position; break; + } LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", version, ts_stack_version_count(self->stack), diff --git a/src/runtime/stack.c b/src/runtime/stack.c index ce5f9a6d..f3907ffd 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -21,8 +21,6 @@ typedef struct StackNode StackNode; typedef struct { StackNode *node; Tree *tree; - uint32_t push_count; - uint32_t depth; bool is_pending; } StackLink; @@ -33,24 +31,16 @@ struct StackNode { short unsigned int link_count; uint32_t ref_count; unsigned error_cost; - unsigned error_count; + unsigned depth; }; typedef struct { StackNode *node; TreeArray trees; uint32_t tree_count; - uint32_t push_count; - uint32_t depth; bool is_pending; } Iterator; -typedef struct { - uint32_t goal_tree_count; - bool found_error; - bool found_valid_path; -} StackPopSession; - typedef struct { void *payload; StackIterateCallback callback; @@ -62,8 +52,8 @@ typedef struct { StackNode *node; Tree *last_external_token; uint32_t push_count; - uint32_t depth; bool is_halted; + StackSummary *summary; } StackHead; struct Stack { @@ -117,7 +107,7 @@ static StackNode *stack_node_new(StackNode *previous_node, Tree *tree, bool is_p StackNode *node = pool->size > 0 ? array_pop(pool) : ts_malloc(sizeof(StackNode)); - *node = (StackNode){.ref_count = 1, .link_count = 0, .state = state}; + *node = (StackNode){.ref_count = 1, .link_count = 0, .state = state, .depth = 0}; if (previous_node) { stack_node_retain(previous_node); @@ -127,30 +117,31 @@ static StackNode *stack_node_new(StackNode *previous_node, Tree *tree, bool is_p .node = previous_node, .tree = tree, .is_pending = is_pending, - .push_count = 0, - .depth = 0, }; node->position = previous_node->position; - node->error_count = previous_node->error_count; node->error_cost = previous_node->error_cost; if (tree) { + node->depth = previous_node->depth; + if (!tree->extra) node->depth++; ts_tree_retain(tree); node->error_cost += tree->error_cost; node->position = length_add(node->position, ts_tree_total_size(tree)); if (state == ERROR_STATE && !tree->extra) { node->error_cost += - ERROR_COST_PER_SKIPPED_TREE * (tree->visible ? 1 : tree->visible_child_count) + - ERROR_COST_PER_SKIPPED_CHAR * (tree->padding.chars + tree->size.chars) + - ERROR_COST_PER_SKIPPED_LINE * (tree->padding.extent.row + tree->size.extent.row); + ERROR_COST_PER_SKIPPED_TREE * ((tree->visible || tree->child_count == 0) ? 1 : tree->visible_child_count) + + ERROR_COST_PER_SKIPPED_CHAR * tree->size.chars + + ERROR_COST_PER_SKIPPED_LINE * tree->size.extent.row; + if (previous_node->links[0].tree) { + node->error_cost += + ERROR_COST_PER_SKIPPED_CHAR * tree->padding.chars + + ERROR_COST_PER_SKIPPED_LINE * tree->padding.extent.row; + } } - } else { - node->error_count++; } } else { node->position = length_zero(); - node->error_count = 0; node->error_cost = 0; } @@ -195,17 +186,19 @@ static void stack_head_delete(StackHead *self, StackNodeArray *pool) { if (self->last_external_token) { ts_tree_release(self->last_external_token); } + if (self->summary) { + array_delete(self->summary); + ts_free(self->summary); + } stack_node_release(self->node, pool); } } -static StackVersion ts_stack__add_version(Stack *self, StackNode *node, - uint32_t push_count, uint32_t depth, - Tree *last_external_token) { +static StackVersion ts_stack__add_version(Stack *self, StackVersion original_version, + StackNode *node, Tree *last_external_token) { StackHead head = { .node = node, - .depth = depth, - .push_count = push_count, + .push_count = self->heads.contents[original_version].push_count, .last_external_token = last_external_token, .is_halted = false, }; @@ -215,38 +208,35 @@ static StackVersion ts_stack__add_version(Stack *self, StackNode *node, return (StackVersion)(self->heads.size - 1); } -static void ts_stack__add_slice(Stack *self, StackNode *node, TreeArray *trees, - uint32_t push_count, uint32_t depth, - Tree *last_external_token) { +static void ts_stack__add_slice(Stack *self, StackVersion original_version, StackNode *node, + TreeArray *trees, Tree *last_external_token) { for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { StackVersion version = self->slices.contents[i].version; if (self->heads.contents[version].node == node) { - StackSlice slice = { *trees, version }; + StackSlice slice = {*trees, version}; array_insert(&self->slices, i + 1, slice); return; } } - StackVersion version = ts_stack__add_version(self, node, push_count, depth, last_external_token); + StackVersion version = ts_stack__add_version(self, original_version, node, last_external_token); StackSlice slice = { *trees, version }; array_push(&self->slices, slice); } inline StackPopResult stack__iter(Stack *self, StackVersion version, - StackIterateInternalCallback callback, void *payload) { + StackIterateInternalCallback callback, void *payload, + bool include_trees) { array_clear(&self->slices); array_clear(&self->iterators); StackHead *head = array_get(&self->heads, version); - uint32_t starting_push_count = head->push_count; Tree *last_external_token = head->last_external_token; Iterator iterator = { .node = head->node, .trees = array_new(), .tree_count = 0, .is_pending = true, - .push_count = 0, - .depth = head->depth, }; array_push(&self->iterators, iterator); @@ -266,10 +256,9 @@ inline StackPopResult stack__iter(Stack *self, StackVersion version, ts_tree_array_reverse(&trees); ts_stack__add_slice( self, + version, node, &trees, - starting_push_count + iterator->push_count, - iterator->depth, last_external_token ); } @@ -298,28 +287,27 @@ inline StackPopResult stack__iter(Stack *self, StackVersion version, } next_iterator->node = link.node; - next_iterator->push_count += link.push_count; - if (link.depth > 0) { - next_iterator->depth = link.depth; - } if (link.tree) { + if (include_trees) { + array_push(&next_iterator->trees, link.tree); + ts_tree_retain(link.tree); + } + if (!link.tree->extra) { next_iterator->tree_count++; - next_iterator->depth--; if (!link.is_pending) { next_iterator->is_pending = false; } } - array_push(&next_iterator->trees, link.tree); - ts_tree_retain(link.tree); } else { + next_iterator->tree_count++; next_iterator->is_pending = false; } } } } - return (StackPopResult){ false, self->slices }; + return (StackPopResult){self->slices}; } Stack *ts_stack_new() { @@ -375,8 +363,7 @@ unsigned ts_stack_push_count(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->push_count; } -void ts_stack_decrease_push_count(Stack *self, StackVersion version, - unsigned decrement) { +void ts_stack_decrease_push_count(Stack *self, StackVersion version, unsigned decrement) { array_get(&self->heads, version)->push_count -= decrement; } @@ -395,29 +382,18 @@ ErrorStatus ts_stack_error_status(const Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); return (ErrorStatus){ .cost = head->node->error_cost, - .count = head->node->error_count, + .recovering = head->node->state == ERROR_STATE, .push_count = head->push_count, - .depth = head->depth, }; } -unsigned ts_stack_error_count(const Stack *self, StackVersion version) { - StackNode *node = array_get(&self->heads, version)->node; - return node->error_count; -} - -void ts_stack_push(Stack *self, StackVersion version, Tree *tree, - bool is_pending, TSStateId state) { +void ts_stack_push(Stack *self, StackVersion version, Tree *tree, bool pending, TSStateId state) { StackHead *head = array_get(&self->heads, version); - StackNode *new_node = stack_node_new(head->node, tree, is_pending, state, &self->node_pool); + StackNode *new_node = stack_node_new(head->node, tree, pending, state, &self->node_pool); if (state == ERROR_STATE) { - new_node->links[0].push_count = head->push_count; - new_node->links[0].depth = head->depth; head->push_count = 0; - head->depth = 0; - } else { + } else if (!tree->extra) { head->push_count++; - if (!tree->extra) head->depth++; } stack_node_release(head->node, &self->node_pool); head->node = new_node; @@ -431,55 +407,20 @@ inline StackIterateAction iterate_callback(void *payload, const Iterator *iterat StackPopResult ts_stack_iterate(Stack *self, StackVersion version, StackIterateCallback callback, void *payload) { StackIterateSession session = {payload, callback}; - return stack__iter(self, version, iterate_callback, &session); + return stack__iter(self, version, iterate_callback, &session, true); } inline StackIterateAction pop_count_callback(void *payload, const Iterator *iterator) { - StackPopSession *pop_session = (StackPopSession *)payload; - - if (iterator->tree_count == pop_session->goal_tree_count) { - pop_session->found_valid_path = true; + unsigned *goal_tree_count = payload; + if (iterator->tree_count == *goal_tree_count) { return StackIteratePop | StackIterateStop; + } else { + return StackIterateNone; } - - if (iterator->node->state == ERROR_STATE) { - if (pop_session->found_valid_path || pop_session->found_error) { - return StackIterateStop; - } else { - pop_session->found_error = true; - return StackIteratePop | StackIterateStop; - } - } - return StackIterateNone; } -StackPopResult ts_stack_pop_count(Stack *self, StackVersion version, - uint32_t count) { - StackPopSession session = { - .goal_tree_count = count, - .found_error = false, - .found_valid_path = false, - }; - - StackPopResult pop = stack__iter(self, version, pop_count_callback, &session); - - if (session.found_error) { - if (session.found_valid_path) { - StackSlice error_slice = pop.slices.contents[0]; - ts_tree_array_delete(&error_slice.trees); - array_erase(&pop.slices, 0); - if (array_front(&pop.slices)->version != error_slice.version) { - ts_stack_remove_version(self, error_slice.version); - for (StackVersion i = 0; i < pop.slices.size; i++) { - pop.slices.contents[i].version--; - } - } - } else { - pop.stopped_at_error = true; - } - } - - return pop; +StackPopResult ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) { + return stack__iter(self, version, pop_count_callback, &count, true); } inline StackIterateAction pop_pending_callback(void *payload, const Iterator *iterator) { @@ -495,7 +436,7 @@ inline StackIterateAction pop_pending_callback(void *payload, const Iterator *it } StackPopResult ts_stack_pop_pending(Stack *self, StackVersion version) { - StackPopResult pop = stack__iter(self, version, pop_pending_callback, NULL); + StackPopResult pop = stack__iter(self, version, pop_pending_callback, NULL, true); if (pop.slices.size > 0) { ts_stack_renumber_version(self, pop.slices.contents[0].version, version); pop.slices.contents[0].version = version; @@ -503,12 +444,71 @@ StackPopResult ts_stack_pop_pending(Stack *self, StackVersion version) { return pop; } +inline StackIterateAction pop_error_callback(void *payload, const Iterator *iterator) { + if (iterator->trees.size > 0) { + bool *found_error = payload; + if (!*found_error && iterator->trees.contents[0]->symbol == ts_builtin_sym_error) { + *found_error = true; + return StackIteratePop | StackIterateStop; + } else { + return StackIterateStop; + } + } else { + return StackIterateNone; + } +} + +StackPopResult ts_stack_pop_error(Stack *self, StackVersion version) { + StackNode *node = array_get(&self->heads, version)->node; + for (unsigned i = 0; i < node->link_count; i++) { + if (node->links[i].tree && node->links[i].tree->symbol == ts_builtin_sym_error) { + bool found_error = false; + return stack__iter(self, version, pop_error_callback, &found_error, true); + } + } + return (StackPopResult){.slices = array_new()}; +} + inline StackIterateAction pop_all_callback(void *payload, const Iterator *iterator) { return iterator->node->link_count == 0 ? StackIteratePop : StackIterateNone; } StackPopResult ts_stack_pop_all(Stack *self, StackVersion version) { - return stack__iter(self, version, pop_all_callback, NULL); + return stack__iter(self, version, pop_all_callback, NULL, true); +} + +inline StackIterateAction summarize_stack_callback(void *payload, const Iterator *iterator) { + StackSummary *summary = payload; + TSStateId state = iterator->node->state; + unsigned depth = iterator->tree_count; + for (unsigned i = summary->size - 1; i + 1 > 0; i--) { + StackSummaryEntry entry = summary->contents[i]; + if (entry.depth < depth) break; + if (entry.depth == depth && entry.state == state) return StackIterateNone; + } + array_push(summary, ((StackSummaryEntry){.depth = depth, .state = state})); + return StackIterateNone; +} + +void ts_stack_record_summary(Stack *self, StackVersion version) { + StackSummary *result = ts_malloc(sizeof(StackSummary)); + array_init(result); + stack__iter(self, version, summarize_stack_callback, result, false); + self->heads.contents[version].summary = result; +} + +StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { + return array_get(&self->heads, version)->summary; +} + +unsigned ts_stack_depth_since_error(Stack *self, StackVersion version) { + unsigned result = 0; + StackNode *node = array_get(&self->heads, version)->node; + while (node->state == 0) { + result++; + node = node->links[0].node; + } + return result - 1; } void ts_stack_remove_version(Stack *self, StackVersion version) { @@ -536,6 +536,7 @@ StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { StackHead *head = array_back(&self->heads); stack_node_retain(head->node); if (head->last_external_token) ts_tree_retain(head->last_external_token); + head->summary = NULL; return self->heads.size - 1; } @@ -554,9 +555,8 @@ bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version return head1->node->state == head2->node->state && head1->node->position.chars == head2->node->position.chars && - ts_tree_external_token_state_eq(head1->last_external_token, head2->last_external_token) && - ((head1->node->error_count == 0 && head2->node->error_count == 0) || - (head1->depth == head2->depth)); + head1->node->depth == head2->node->depth && + ts_tree_external_token_state_eq(head1->last_external_token, head2->last_external_token); } void ts_stack_force_merge(Stack *self, StackVersion version1, StackVersion version2) { @@ -565,8 +565,6 @@ void ts_stack_force_merge(Stack *self, StackVersion version1, StackVersion versi for (uint32_t i = 0; i < head2->node->link_count; i++) { stack_node_add_link(head1->node, head2->node->links[i]); } - if (head2->push_count > head1->push_count) head1->push_count = head2->push_count; - if (head2->depth > head1->depth) head1->depth = head2->depth; ts_stack_remove_version(self, version2); } @@ -587,8 +585,6 @@ void ts_stack_clear(Stack *self) { array_push(&self->heads, ((StackHead){ .node = self->base_node, .last_external_token = NULL, - .depth = 0, - .push_count = 0, .is_halted = false, })); } @@ -612,8 +608,8 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { fprintf( f, "node_head_%u -> node_%p [label=%u, fontcolor=blue, weight=10000, " - "labeltooltip=\"push_count: %u\ndepth: %u", - i, head->node, i, head->push_count, head->depth); + "labeltooltip=\"push_count: %u\ndepth: %u", i, head->node, i, head->push_count, head->node->depth + ); if (head->last_external_token) { TSExternalTokenState *state = &head->last_external_token->external_token_state; @@ -654,10 +650,11 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { else fprintf(f, "label=\"%d\"", node->state); - fprintf(f, - " tooltip=\"position: %u,%u\nerror_count: %u\nerror_cost: %u\"];\n", - node->position.extent.row, node->position.extent.column, node->error_count, - node->error_cost); + fprintf( + f, + " tooltip=\"position: %u,%u\nerror_cost: %u\"];\n", + node->position.extent.row, node->position.extent.column, node->error_cost + ); for (int j = 0; j < node->link_count; j++) { StackLink link = node->links[j]; @@ -668,7 +665,7 @@ bool ts_stack_print_dot_graph(Stack *self, const char **symbol_names, FILE *f) { fprintf(f, "fontcolor=gray "); if (!link.tree) { - fprintf(f, "color=red, tooltip=\"push_count: %u, depth: %u\"", link.push_count, link.depth); + fprintf(f, "color=red"); } else if (link.tree->symbol == ts_builtin_sym_error) { fprintf(f, "label=\"ERROR\""); } else { diff --git a/src/runtime/stack.h b/src/runtime/stack.h index 1eeb5cd0..36e41c50 100644 --- a/src/runtime/stack.h +++ b/src/runtime/stack.h @@ -23,7 +23,6 @@ typedef struct { typedef Array(StackSlice) StackSliceArray; typedef struct { - bool stopped_at_error; StackSliceArray slices; } StackPopResult; @@ -34,6 +33,13 @@ enum { StackIteratePop = 2, }; +typedef struct { + unsigned depth; + TSStateId state; +} StackSummaryEntry; + +typedef Array(StackSummaryEntry) StackSummary; + typedef StackIterateAction (*StackIterateCallback)(void *, TSStateId state, const TreeArray *trees, uint32_t tree_count); @@ -89,10 +95,18 @@ StackPopResult ts_stack_pop_count(Stack *, StackVersion, uint32_t count); StackPopResult ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *); +StackPopResult ts_stack_pop_error(Stack *, StackVersion); + StackPopResult ts_stack_pop_pending(Stack *, StackVersion); StackPopResult ts_stack_pop_all(Stack *, StackVersion); +unsigned ts_stack_depth_since_error(Stack *, StackVersion); + +void ts_stack_record_summary(Stack *, StackVersion); + +StackSummary *ts_stack_get_summary(Stack *, StackVersion); + ErrorStatus ts_stack_error_status(const Stack *, StackVersion); bool ts_stack_merge(Stack *, StackVersion, StackVersion); diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt index f9280ce3..194fa795 100644 --- a/test/fixtures/error_corpus/c_errors.txt +++ b/test/fixtures/error_corpus/c_errors.txt @@ -9,9 +9,11 @@ int x // no semicolon int a; #ifdef __cplusplus -extern "C" +extern "C" { #endif +int c() { return 5; } + int b; #ifdef __cplusplus @@ -23,20 +25,23 @@ int c; --- (translation_unit - (preproc_ifdef (identifier) + (preproc_ifdef + (identifier) (ERROR (type_identifier) (identifier)) (comment)) - (declaration (type_identifier) (identifier)) - - (preproc_ifdef (identifier) - (ERROR (string_literal))) - - (declaration (type_identifier) (identifier)) - - (preproc_ifdef (identifier) - (ERROR)) - + (preproc_ifdef + (identifier) + (linkage_specification + (string_literal) + (declaration_list + (ERROR) + (function_definition + (type_identifier) + (function_declarator (identifier) (parameter_list)) + (compound_statement (return_statement (number_literal)))) + (declaration (type_identifier) (identifier)) + (ERROR (identifier))))) (declaration (type_identifier) (identifier))) ======================================== @@ -76,8 +81,8 @@ int main() { (declaration (type_identifier) (init_declarator (identifier) (parenthesized_expression - (ERROR (number_literal)) - (number_literal))))))) + (number_literal) + (ERROR (number_literal)))))))) ======================================== Errors in declarations @@ -124,13 +129,15 @@ int b() { (compound_statement (declaration (type_identifier) + (ERROR (identifier)) (init_declarator (identifier) - (ERROR (identifier) (identifier)) + (ERROR (identifier)) (number_literal))) (declaration (type_identifier) + (ERROR (identifier)) (init_declarator (identifier) - (ERROR (identifier) (identifier)) + (ERROR (identifier)) (number_literal)))))) diff --git a/test/fixtures/error_corpus/javascript_errors.txt b/test/fixtures/error_corpus/javascript_errors.txt index 45eb1c33..ef67f85c 100644 --- a/test/fixtures/error_corpus/javascript_errors.txt +++ b/test/fixtures/error_corpus/javascript_errors.txt @@ -12,12 +12,13 @@ e f; (program (if_statement (parenthesized_expression - (ERROR (identifier)) - (identifier)) + (identifier) + (ERROR (identifier))) (statement_block (ERROR (identifier)) (expression_statement (identifier)))) - (expression_statement (ERROR (identifier)) (identifier))) + (ERROR (identifier)) + (expression_statement (identifier))) ======================================================= multiple invalid tokens right after the viable prefix @@ -33,16 +34,13 @@ h i j k; (program (if_statement (parenthesized_expression - (ERROR (identifier)) (identifier) - (ERROR (identifier))) + (ERROR (identifier) (identifier))) (statement_block - (expression_statement - (identifier) - (ERROR (jsx_attribute (property_identifier)) (jsx_attribute (property_identifier)) (identifier))))) - (expression_statement - (identifier) - (ERROR (jsx_attribute (property_identifier)) (jsx_attribute (property_identifier)) (identifier)))) + (ERROR (identifier) (identifier) (identifier)) + (expression_statement (identifier)))) + (ERROR (identifier) (identifier) (identifier)) + (expression_statement (identifier))) =================================================== one invalid subtree right after the viable prefix @@ -136,3 +134,17 @@ var x = !!! (function (identifier) (formal_parameters) (statement_block)) (function (identifier) (formal_parameters) (statement_block)) (ERROR (identifier))) + +========================================================= +Errors inside of a template string substitution +========================================================= + +const a = `b c ${d +} f g` + +--- + +(program + (lexical_declaration + (variable_declarator + (identifier) + (template_string (template_substitution (identifier) (ERROR)))))) diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index 2f60c0f0..c5ddce41 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -166,7 +166,7 @@ describe("Parser", [&]() { ts_document_set_language(document, load_real_language("javascript")); set_text("a; ' this string never ends"); assert_root_node( - "(ERROR (program (expression_statement (identifier))) (UNEXPECTED EOF))"); + "(program (expression_statement (identifier)) (ERROR (UNEXPECTED EOF)))"); }); }); @@ -198,7 +198,7 @@ describe("Parser", [&]() { free(string); - assert_root_node("(ERROR (UNEXPECTED INVALID))"); + assert_root_node("(program (ERROR (UNEXPECTED INVALID)))"); }); }); diff --git a/test/runtime/stack_test.cc b/test/runtime/stack_test.cc index e8a258ed..cb5abf2d 100644 --- a/test/runtime/stack_test.cc +++ b/test/runtime/stack_test.cc @@ -124,12 +124,6 @@ describe("Stack", [&]() { {1, 3}, }))); }); - - it("increments the version's push count", [&]() { - AssertThat(ts_stack_push_count(stack, 0), Equals(0)); - ts_stack_push(stack, 0, trees[0], false, stateA); - AssertThat(ts_stack_push_count(stack, 0), Equals(1)); - }); }); describe("merge()", [&]() { @@ -221,7 +215,6 @@ describe("Stack", [&]() { // ↑ // └─* StackPopResult pop = ts_stack_pop_count(stack, 0, 2); - AssertThat(pop.stopped_at_error, Equals(false)); AssertThat(pop.slices.size, Equals(1)); AssertThat(ts_stack_version_count(stack), Equals(2)); @@ -240,7 +233,6 @@ describe("Stack", [&]() { // ↑ // └─* StackPopResult pop = ts_stack_pop_count(stack, 0, 2); - AssertThat(pop.stopped_at_error, Equals(false)); AssertThat(pop.slices.size, Equals(1)); StackSlice slice = pop.slices.contents[0]; @@ -250,40 +242,6 @@ describe("Stack", [&]() { free_slice_array(&pop.slices); }); - it("stops popping entries early if it reaches an error tree", [&]() { - // . <──0── A <──1── B <──2── C <──3── ERROR <──4── D* - ts_stack_push(stack, 0, trees[3], false, ERROR_STATE); - ts_stack_push(stack, 0, trees[4], false, stateD); - - // . <──0── A <──1── B <──2── C <──3── ERROR <──4── D* - // ↑ - // └─* - StackPopResult pop = ts_stack_pop_count(stack, 0, 3); - AssertThat(pop.stopped_at_error, Equals(true)); - - AssertThat(ts_stack_version_count(stack), Equals(2)); - AssertThat(ts_stack_top_state(stack, 1), Equals(ERROR_STATE)); - - AssertThat(pop.slices.size, Equals(1)); - StackSlice slice = pop.slices.contents[0]; - AssertThat(slice.version, Equals(1)); - AssertThat(slice.trees, Equals(vector({ trees[4] }))); - - free_slice_array(&pop.slices); - }); - - it("preserves the push count of the popped version", [&]() { - // . <──0── A <──1── B <──2── C* - // ↑ - // └─* - StackPopResult pop = ts_stack_pop_count(stack, 0, 2); - - AssertThat(ts_stack_push_count(stack, 0), Equals(3)); - AssertThat(ts_stack_push_count(stack, 1), Equals(3)); - - free_slice_array(&pop.slices); - }); - describe("when the version has been merged", [&]() { before_each([&]() { // . <──0── A <──1── B <──2── C <──3── D <──10── I* @@ -475,7 +433,6 @@ describe("Stack", [&]() { ts_stack_push(stack, 0, trees[1], true, stateB); StackPopResult pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.stopped_at_error, Equals(false)); AssertThat(pop.slices.size, Equals(1)); AssertThat(get_stack_entries(stack, 0), Equals(vector({ @@ -496,7 +453,6 @@ describe("Stack", [&]() { ts_stack_push(stack, 0, trees[3], false, stateB); StackPopResult pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.stopped_at_error, Equals(false)); AssertThat(pop.slices.size, Equals(1)); AssertThat(pop.slices.contents[0].trees, Equals(vector({ trees[1], trees[2], trees[3] }))); @@ -513,7 +469,6 @@ describe("Stack", [&]() { ts_stack_push(stack, 0, trees[1], false, stateB); StackPopResult pop = ts_stack_pop_pending(stack, 0); - AssertThat(pop.stopped_at_error, Equals(false)); AssertThat(pop.slices.size, Equals(0)); AssertThat(get_stack_entries(stack, 0), Equals(vector({ From 819235bac3990795f25e292910a3d76c0cda4e00 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Sep 2017 12:00:00 -0700 Subject: [PATCH 06/12] Limit the number of stack nodes that are included in a summary --- src/runtime/parser.c | 5 +++-- src/runtime/stack.c | 35 +++++++++++++++++++---------------- src/runtime/stack.h | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 60aa7da4..548becc3 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -34,7 +34,8 @@ #define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) -static const uint32_t MAX_VERSION_COUNT = 10; +static const unsigned MAX_VERSION_COUNT = 10; +static const unsigned MAX_SUMMARY_DEPTH = 16; static void parser__log(Parser *self) { if (self->lexer.logger.log) { @@ -797,7 +798,7 @@ static void parser__handle_error(Parser *self, StackVersion version, TSSymbol lo ts_stack_force_merge(self->stack, version, previous_version_count); } - ts_stack_record_summary(self->stack, version); + ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); LOG_STACK(); } diff --git a/src/runtime/stack.c b/src/runtime/stack.c index f3907ffd..b4d3feb6 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -477,24 +477,33 @@ StackPopResult ts_stack_pop_all(Stack *self, StackVersion version) { return stack__iter(self, version, pop_all_callback, NULL, true); } +typedef struct { + StackSummary *summary; + unsigned max_depth; +} SummarizeStackSession; + inline StackIterateAction summarize_stack_callback(void *payload, const Iterator *iterator) { - StackSummary *summary = payload; + SummarizeStackSession *session = payload; TSStateId state = iterator->node->state; unsigned depth = iterator->tree_count; - for (unsigned i = summary->size - 1; i + 1 > 0; i--) { - StackSummaryEntry entry = summary->contents[i]; + if (depth > session->max_depth) return StackIterateStop; + for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { + StackSummaryEntry entry = session->summary->contents[i]; if (entry.depth < depth) break; if (entry.depth == depth && entry.state == state) return StackIterateNone; } - array_push(summary, ((StackSummaryEntry){.depth = depth, .state = state})); + array_push(session->summary, ((StackSummaryEntry){.depth = depth, .state = state})); return StackIterateNone; } -void ts_stack_record_summary(Stack *self, StackVersion version) { - StackSummary *result = ts_malloc(sizeof(StackSummary)); - array_init(result); - stack__iter(self, version, summarize_stack_callback, result, false); - self->heads.contents[version].summary = result; +void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) { + SummarizeStackSession session = { + .summary = ts_malloc(sizeof(StackSummary)), + .max_depth = max_depth + }; + array_init(session.summary); + stack__iter(self, version, summarize_stack_callback, &session, false); + self->heads.contents[version].summary = session.summary; } StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { @@ -502,13 +511,7 @@ StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { } unsigned ts_stack_depth_since_error(Stack *self, StackVersion version) { - unsigned result = 0; - StackNode *node = array_get(&self->heads, version)->node; - while (node->state == 0) { - result++; - node = node->links[0].node; - } - return result - 1; + return array_get(&self->heads, version)->node->depth; } void ts_stack_remove_version(Stack *self, StackVersion version) { diff --git a/src/runtime/stack.h b/src/runtime/stack.h index 36e41c50..ce0a30ed 100644 --- a/src/runtime/stack.h +++ b/src/runtime/stack.h @@ -103,7 +103,7 @@ StackPopResult ts_stack_pop_all(Stack *, StackVersion); unsigned ts_stack_depth_since_error(Stack *, StackVersion); -void ts_stack_record_summary(Stack *, StackVersion); +void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); StackSummary *ts_stack_get_summary(Stack *, StackVersion); From ee2906ac2e7a4578eb018a09c6ebe304ca8bce4b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Sep 2017 16:19:28 -0700 Subject: [PATCH 07/12] Don't merge stack versions that are halted --- src/runtime/stack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/stack.c b/src/runtime/stack.c index b4d3feb6..ae1e175a 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -556,6 +556,7 @@ bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version StackHead *head1 = &self->heads.contents[version1]; StackHead *head2 = &self->heads.contents[version2]; return + !head1->is_halted && !head2->is_halted && head1->node->state == head2->node->state && head1->node->position.chars == head2->node->position.chars && head1->node->depth == head2->node->depth && From 47669e6015f80ced81281f0040a36c62b6c31989 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Sep 2017 16:20:06 -0700 Subject: [PATCH 08/12] Avoid halting the only non-halted entry in recover --- src/runtime/parser.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 548becc3..452320b4 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -34,7 +34,7 @@ #define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) -static const unsigned MAX_VERSION_COUNT = 10; +static const unsigned MAX_VERSION_COUNT = 6; static const unsigned MAX_SUMMARY_DEPTH = 16; static void parser__log(Parser *self) { @@ -828,6 +828,7 @@ static void parser__halt_parse(Parser *self) { } static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) { + bool did_recover = false; unsigned previous_version_count = ts_stack_version_count(self->stack); StackSummary *summary = ts_stack_get_summary(self->stack, version); for (unsigned i = 0; i < summary->size; i++) { @@ -880,6 +881,7 @@ static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) } array_delete(&trailing_extras); + did_recover = true; } break; } @@ -900,7 +902,7 @@ static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) } } - if (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { + if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { ts_stack_halt(self->stack, version); return; } From 65ed4281d41f5b02a9a2f89bf540f34c6be678e0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Sep 2017 16:30:38 -0700 Subject: [PATCH 09/12] Exclude zeros from speeds reported in benchmarks --- test/benchmarks.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmarks.cc b/test/benchmarks.cc index b070ec5e..616d2de6 100644 --- a/test/benchmarks.cc +++ b/test/benchmarks.cc @@ -83,7 +83,7 @@ int main(int argc, char *arg[]) { assert(!ts_node_has_error(ts_document_root_node(document))); size_t speed = static_cast(example.input.size()) / duration; printf(" %-30s\t%u ms\t\t%lu bytes/ms\n", example.file_name.c_str(), duration, speed); - non_error_speeds.push_back(speed); + if (speed != 0) non_error_speeds.push_back(speed); } for (auto &other_language_name : language_names) { @@ -102,7 +102,7 @@ int main(int argc, char *arg[]) { unsigned duration = (end_time - start_time) * 1000 / CLOCKS_PER_SEC; size_t speed = static_cast(example.input.size()) / duration; printf(" %-30s\t%u ms\t\t%lu bytes/ms\n", example.file_name.c_str(), duration, speed); - error_speeds.push_back(speed); + if (speed != 0) error_speeds.push_back(speed); } } From 07fb3ab0e669d04adae489784833eebd22239d16 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 13 Sep 2017 09:56:51 -0700 Subject: [PATCH 10/12] Abort recoveries before popping if better versions already exist --- src/runtime/parser.c | 11 +++++++++++ src/runtime/stack.c | 6 +++++- src/runtime/stack.h | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 452320b4..a36cfdee 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -830,12 +830,23 @@ static void parser__halt_parse(Parser *self) { static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) { bool did_recover = false; unsigned previous_version_count = ts_stack_version_count(self->stack); + Length position = ts_stack_top_position(self->stack, version); StackSummary *summary = ts_stack_get_summary(self->stack, version); for (unsigned i = 0; i < summary->size; i++) { StackSummaryEntry entry = summary->contents[i]; if (entry.state == ERROR_STATE) continue; unsigned depth = entry.depth + ts_stack_depth_since_error(self->stack, version); + ErrorStatus status = { + .recovering = false, + .push_count = 0, + .cost = + depth * ERROR_COST_PER_SKIPPED_TREE + + (position.chars - entry.position.chars) * ERROR_COST_PER_SKIPPED_CHAR + + (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE + }; + if (parser__better_version_exists(self, version, status)) break; + unsigned count = 0; if (ts_language_actions(self->language, entry.state, lookahead->symbol, &count) && count > 0) { LOG("recover state:%u, depth:%u", entry.state, depth); diff --git a/src/runtime/stack.c b/src/runtime/stack.c index ae1e175a..a4204dce 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -492,7 +492,11 @@ inline StackIterateAction summarize_stack_callback(void *payload, const Iterator if (entry.depth < depth) break; if (entry.depth == depth && entry.state == state) return StackIterateNone; } - array_push(session->summary, ((StackSummaryEntry){.depth = depth, .state = state})); + array_push(session->summary, ((StackSummaryEntry){ + .position = iterator->node->position, + .depth = depth, + .state = state, + })); return StackIterateNone; } diff --git a/src/runtime/stack.h b/src/runtime/stack.h index ce0a30ed..9fd925e5 100644 --- a/src/runtime/stack.h +++ b/src/runtime/stack.h @@ -34,6 +34,7 @@ enum { }; typedef struct { + Length position; unsigned depth; TSStateId state; } StackSummaryEntry; From 71595ffde6b96b3503f04043489ff8acaf2c6ec4 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 13 Sep 2017 10:05:31 -0700 Subject: [PATCH 11/12] Only allow one stack link with a given type containing errors --- src/runtime/stack.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/runtime/stack.c b/src/runtime/stack.c index a4204dce..48922257 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -149,15 +149,17 @@ static StackNode *stack_node_new(StackNode *previous_node, Tree *tree, bool is_p } static bool stack__tree_is_equivalent(const Tree *left, const Tree *right) { - return left == right || ( - left && - right && - left->child_count == 0 && right->child_count == 0 && - left->symbol == right->symbol && - left->padding.bytes == right->padding.bytes && - left->size.bytes == right->size.bytes && - left->extra == right->extra && - ts_tree_external_token_state_eq(left, right)); + return + left == right || + (left && + right && + left->symbol == right->symbol && + ((left->error_cost > 0 && right->error_cost > 0) || + (left->child_count == 0 && right->child_count == 0 && + left->padding.bytes == right->padding.bytes && + left->size.bytes == right->size.bytes && + left->extra == right->extra && + ts_tree_external_token_state_eq(left, right)))); } static void stack_node_add_link(StackNode *self, StackLink link) { From d291af9a313ea139c06cba8cecee6b33ac1e1ac0 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 13 Sep 2017 16:38:15 -0700 Subject: [PATCH 12/12] Refactor error comparisons * Deal with mergeability outside of error comparison function * Make `better_version_exists` function pure (don't halt other versions as a side effect). * Tweak error comparison logic Signed-off-by: Rick Winfrey --- project.gyp | 1 - src/runtime/error_costs.c | 42 ---- src/runtime/error_costs.h | 27 --- src/runtime/parser.c | 199 ++++++++++++------ src/runtime/stack.c | 11 +- src/runtime/stack.h | 2 +- test/fixtures/error_corpus/c_errors.txt | 6 +- .../error_corpus/javascript_errors.txt | 8 +- test/runtime/parser_test.cc | 21 +- 9 files changed, 153 insertions(+), 164 deletions(-) delete mode 100644 src/runtime/error_costs.c diff --git a/project.gyp b/project.gyp index 56f742b3..7ae20e54 100644 --- a/project.gyp +++ b/project.gyp @@ -89,7 +89,6 @@ ], 'sources': [ 'src/runtime/document.c', - 'src/runtime/error_costs.c', 'src/runtime/get_changed_ranges.c', 'src/runtime/language.c', 'src/runtime/lexer.c', diff --git a/src/runtime/error_costs.c b/src/runtime/error_costs.c deleted file mode 100644 index d165572e..00000000 --- a/src/runtime/error_costs.c +++ /dev/null @@ -1,42 +0,0 @@ -#include "runtime/error_costs.h" - -static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; -static const unsigned MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE = 24; - -ErrorComparison error_status_compare(ErrorStatus a, ErrorStatus b, bool are_mergeable) { - ErrorComparison result = ErrorComparisonNone; - - if (!a.recovering && b.recovering) { - if (a.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { - return ErrorComparisonTakeLeft; - } else { - result = ErrorComparisonPreferLeft; - } - } - - if (!b.recovering && a.recovering) { - if (b.push_count > MAX_PUSH_COUNT_WITH_COUNT_DIFFERENCE) { - return ErrorComparisonTakeRight; - } else { - result = ErrorComparisonPreferRight; - } - } - - if (a.cost < b.cost) { - if (are_mergeable || (b.cost - a.cost) * (1 + a.push_count) > MAX_COST_DIFFERENCE) { - return ErrorComparisonTakeLeft; - } else { - return ErrorComparisonPreferLeft; - } - } - - if (b.cost < a.cost) { - if (are_mergeable || (a.cost - b.cost) * (1 + b.push_count) > MAX_COST_DIFFERENCE) { - return ErrorComparisonTakeRight; - } else { - return ErrorComparisonPreferRight; - } - } - - return result; -} diff --git a/src/runtime/error_costs.h b/src/runtime/error_costs.h index 380b96bd..60119aa1 100644 --- a/src/runtime/error_costs.h +++ b/src/runtime/error_costs.h @@ -1,36 +1,9 @@ #ifndef RUNTIME_ERROR_COSTS_H_ #define RUNTIME_ERROR_COSTS_H_ -#include - -#ifdef __cplusplus -extern "C" { -#endif - #define ERROR_STATE 0 #define ERROR_COST_PER_SKIPPED_TREE 100 #define ERROR_COST_PER_SKIPPED_LINE 30 #define ERROR_COST_PER_SKIPPED_CHAR 1 -typedef struct { - unsigned cost; - unsigned push_count; - unsigned depth; - bool recovering; -} ErrorStatus; - -typedef enum { - ErrorComparisonTakeLeft, - ErrorComparisonPreferLeft, - ErrorComparisonNone, - ErrorComparisonPreferRight, - ErrorComparisonTakeRight, -} ErrorComparison; - -ErrorComparison error_status_compare(ErrorStatus a, ErrorStatus b, bool can_merge); - -#ifdef __cplusplus -} -#endif - #endif diff --git a/src/runtime/parser.c b/src/runtime/parser.c index a36cfdee..eea89b5e 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -36,6 +36,21 @@ static const unsigned MAX_VERSION_COUNT = 6; static const unsigned MAX_SUMMARY_DEPTH = 16; +static const int MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; + +typedef struct { + unsigned cost; + unsigned push_count; + bool is_in_error; +} ErrorStatus; + +typedef enum { + ErrorComparisonTakeLeft, + ErrorComparisonPreferLeft, + ErrorComparisonNone, + ErrorComparisonPreferRight, + ErrorComparisonTakeRight, +} ErrorComparison; static void parser__log(Parser *self) { if (self->lexer.logger.log) { @@ -120,10 +135,72 @@ static void parser__breakdown_lookahead(Parser *self, Tree **lookahead, } } -static bool parser__condense_stack(Parser *self) { - bool all_versions_have_error = true; - unsigned old_version_count = ts_stack_version_count(self->stack); +static ErrorComparison parser__compare_versions(Parser *self, ErrorStatus a, ErrorStatus b) { + if (!a.is_in_error && b.is_in_error) { + if (a.cost < b.cost) { + return ErrorComparisonTakeLeft; + } else { + return ErrorComparisonPreferLeft; + } + } + if (a.is_in_error && !b.is_in_error) { + if (b.cost < a.cost) { + return ErrorComparisonTakeRight; + } else { + return ErrorComparisonPreferRight; + } + } + + if (a.cost < b.cost) { + if ((b.cost - a.cost) * (1 + a.push_count) > MAX_COST_DIFFERENCE) { + return ErrorComparisonTakeLeft; + } else { + return ErrorComparisonPreferLeft; + } + } + + if (b.cost < a.cost) { + if ((a.cost - b.cost) * (1 + b.push_count) > MAX_COST_DIFFERENCE) { + return ErrorComparisonTakeRight; + } else { + return ErrorComparisonPreferRight; + } + } + + return ErrorComparisonNone; +} + +static bool parser__better_version_exists(Parser *self, StackVersion version, + bool is_in_error, unsigned cost) { + if (self->finished_tree && self->finished_tree->error_cost <= cost) return true; + + ErrorStatus status = {.cost = cost, .is_in_error = is_in_error, .push_count = 0}; + + for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { + if (i == version || ts_stack_is_halted(self->stack, i)) continue; + ErrorStatus status_i = { + .cost = ts_stack_error_cost(self->stack, i), + .is_in_error = ts_stack_top_state(self->stack, i) == ERROR_STATE, + .push_count = ts_stack_push_count(self->stack, i) + }; + switch (parser__compare_versions(self, status, status_i)) { + case ErrorComparisonTakeRight: + return true; + case ErrorComparisonPreferRight: + if (ts_stack_can_merge(self->stack, i, version)) return true; + default: + break; + } + } + + return false; +} + +static bool parser__condense_stack(Parser *self) { + bool made_changes = false; + unsigned min_error_cost = UINT_MAX; + bool all_versions_have_error = true; for (StackVersion i = 0; i < ts_stack_version_count(self->stack); i++) { if (ts_stack_is_halted(self->stack, i)) { ts_stack_remove_version(self->stack, i); @@ -131,35 +208,47 @@ static bool parser__condense_stack(Parser *self) { continue; } - ErrorStatus right_error_status = ts_stack_error_status(self->stack, i); - if (!right_error_status.recovering) all_versions_have_error = false; + ErrorStatus status_i = { + .cost = ts_stack_error_cost(self->stack, i), + .push_count = ts_stack_push_count(self->stack, i), + .is_in_error = ts_stack_top_state(self->stack, i) == ERROR_STATE, + }; + if (!status_i.is_in_error) all_versions_have_error = false; + if (status_i.cost < min_error_cost) min_error_cost = status_i.cost; for (StackVersion j = 0; j < i; j++) { - bool can_merge = ts_stack_can_merge(self->stack, i, j); - ErrorStatus left_error_status = ts_stack_error_status(self->stack, j); + ErrorStatus status_j = { + .cost = ts_stack_error_cost(self->stack, j), + .push_count = ts_stack_push_count(self->stack, j), + .is_in_error = ts_stack_top_state(self->stack, j) == ERROR_STATE, + }; - switch (error_status_compare(left_error_status, right_error_status, can_merge)) { + bool can_merge = ts_stack_can_merge(self->stack, j, i); + switch (parser__compare_versions(self, status_j, status_i)) { case ErrorComparisonTakeLeft: + made_changes = true; ts_stack_remove_version(self->stack, i); i--; j = i; break; - - case ErrorComparisonTakeRight: - ts_stack_remove_version(self->stack, j); - i--; - j--; - break; - case ErrorComparisonPreferLeft: if (can_merge) { + made_changes = true; + ts_stack_remove_version(self->stack, i); + i--; + j = i; + } + break; + case ErrorComparisonNone: + if (can_merge) { + made_changes = true; ts_stack_force_merge(self->stack, j, i); i--; j = i; } break; - case ErrorComparisonPreferRight: + made_changes = true; if (can_merge) { ts_stack_remove_version(self->stack, j); i--; @@ -169,12 +258,11 @@ static bool parser__condense_stack(Parser *self) { j = i; } break; - - case ErrorComparisonNone: - if (can_merge) { - ts_stack_force_merge(self->stack, j, i); - i--; - } + case ErrorComparisonTakeRight: + made_changes = true; + ts_stack_remove_version(self->stack, j); + i--; + j--; break; } } @@ -182,15 +270,17 @@ static bool parser__condense_stack(Parser *self) { while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { ts_stack_remove_version(self->stack, MAX_VERSION_COUNT); + made_changes = true; } - unsigned new_version_count = ts_stack_version_count(self->stack); - if (new_version_count != old_version_count) { + if (made_changes) { LOG("condense"); LOG_STACK(); } - return all_versions_have_error && new_version_count > 0; + return + (all_versions_have_error && ts_stack_version_count(self->stack) > 0) || + (self->finished_tree && self->finished_tree->error_cost < min_error_cost); } static void parser__restore_external_scanner(Parser *self, Tree *external_token) { @@ -501,30 +591,6 @@ static bool parser__select_tree(Parser *self, Tree *left, Tree *right) { } } -static bool parser__better_version_exists(Parser *self, StackVersion version, - ErrorStatus my_error_status) { - if (self->finished_tree && self->finished_tree->error_cost <= my_error_status.cost) return true; - - for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { - if (i == version || ts_stack_is_halted(self->stack, i)) continue; - - switch (error_status_compare(my_error_status, - ts_stack_error_status(self->stack, i), - ts_stack_can_merge(self->stack, i, version))) { - case ErrorComparisonTakeLeft: - LOG("halt_other version:%u", i); - ts_stack_halt(self->stack, i); - break; - case ErrorComparisonTakeRight: - if (i < version) return true; - default: - break; - } - } - - return false; -} - static void parser__shift(Parser *self, StackVersion version, TSStateId state, Tree *lookahead, bool extra) { if (extra != lookahead->extra) { @@ -766,10 +832,8 @@ static bool parser__do_potential_reductions(Parser *self, StackVersion version) static void parser__handle_error(Parser *self, StackVersion version, TSSymbol lookahead_symbol) { // If there are other stack versions that are clearly better than this one, // just halt this version. - ErrorStatus error_status = ts_stack_error_status(self->stack, version); - error_status.recovering = true; - error_status.cost += ERROR_COST_PER_SKIPPED_TREE; - if (parser__better_version_exists(self, version, error_status)) { + unsigned new_cost = ts_stack_error_cost(self->stack, version) + ERROR_COST_PER_SKIPPED_TREE; + if (parser__better_version_exists(self, version, true, new_cost)) { ts_stack_halt(self->stack, version); LOG("bail_on_error"); return; @@ -837,15 +901,11 @@ static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) if (entry.state == ERROR_STATE) continue; unsigned depth = entry.depth + ts_stack_depth_since_error(self->stack, version); - ErrorStatus status = { - .recovering = false, - .push_count = 0, - .cost = - depth * ERROR_COST_PER_SKIPPED_TREE + - (position.chars - entry.position.chars) * ERROR_COST_PER_SKIPPED_CHAR + - (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE - }; - if (parser__better_version_exists(self, version, status)) break; + unsigned new_cost = + depth * ERROR_COST_PER_SKIPPED_TREE + + (position.chars - entry.position.chars) * ERROR_COST_PER_SKIPPED_CHAR + + (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE; + if (parser__better_version_exists(self, version, false, new_cost)) break; unsigned count = 0; if (ts_language_actions(self->language, entry.state, lookahead->symbol, &count) && count > 0) { @@ -932,8 +992,7 @@ static void parser__recover(Parser *self, StackVersion version, Tree *lookahead) bool can_be_extra = ts_language_symbol_metadata(self->language, lookahead->symbol).extra; parser__shift(self, version, ERROR_STATE, lookahead, can_be_extra); - ErrorStatus error_status = ts_stack_error_status(self->stack, version); - if (parser__better_version_exists(self, version, error_status)) { + if (parser__better_version_exists(self, version, true, ts_stack_error_cost(self->stack, version))) { ts_stack_halt(self->stack, version); } } @@ -1095,10 +1154,14 @@ Tree *parser_parse(Parser *self, TSInput input, Tree *old_tree, bool halt_on_err self->reusable_node = reusable_node; - bool all_versions_have_error = parser__condense_stack(self); - if (halt_on_error && all_versions_have_error) { - parser__halt_parse(self); - break; + bool should_halt = parser__condense_stack(self); + if (should_halt) { + if (self->finished_tree) { + break; + } else if (halt_on_error) { + parser__halt_parse(self); + break; + } } self->is_split = (version > 1); diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 48922257..986d8cb5 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -167,7 +167,8 @@ static void stack_node_add_link(StackNode *self, StackLink link) { StackLink existing_link = self->links[i]; if (stack__tree_is_equivalent(existing_link.tree, link.tree)) { if (existing_link.node == link.node) return; - if (existing_link.node->state == link.node->state) { + if (existing_link.node->state == link.node->state && + existing_link.node->position.bytes == link.node->position.bytes) { for (int j = 0; j < link.node->link_count; j++) { stack_node_add_link(existing_link.node, link.node->links[j]); } @@ -380,13 +381,9 @@ void ts_stack_set_last_external_token(Stack *self, StackVersion version, Tree *t head->last_external_token = token; } -ErrorStatus ts_stack_error_status(const Stack *self, StackVersion version) { +unsigned ts_stack_error_cost(const Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); - return (ErrorStatus){ - .cost = head->node->error_cost, - .recovering = head->node->state == ERROR_STATE, - .push_count = head->push_count, - }; + return head->node->error_cost; } void ts_stack_push(Stack *self, StackVersion version, Tree *tree, bool pending, TSStateId state) { diff --git a/src/runtime/stack.h b/src/runtime/stack.h index 9fd925e5..6bb0b40b 100644 --- a/src/runtime/stack.h +++ b/src/runtime/stack.h @@ -108,7 +108,7 @@ void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); StackSummary *ts_stack_get_summary(Stack *, StackVersion); -ErrorStatus ts_stack_error_status(const Stack *, StackVersion); +unsigned ts_stack_error_cost(const Stack *, StackVersion version); bool ts_stack_merge(Stack *, StackVersion, StackVersion); diff --git a/test/fixtures/error_corpus/c_errors.txt b/test/fixtures/error_corpus/c_errors.txt index 194fa795..7839337e 100644 --- a/test/fixtures/error_corpus/c_errors.txt +++ b/test/fixtures/error_corpus/c_errors.txt @@ -129,15 +129,13 @@ int b() { (compound_statement (declaration (type_identifier) - (ERROR (identifier)) + (ERROR (identifier) (identifier)) (init_declarator (identifier) - (ERROR (identifier)) (number_literal))) (declaration (type_identifier) - (ERROR (identifier)) + (ERROR (identifier) (identifier)) (init_declarator (identifier) - (ERROR (identifier)) (number_literal)))))) diff --git a/test/fixtures/error_corpus/javascript_errors.txt b/test/fixtures/error_corpus/javascript_errors.txt index ef67f85c..7f756028 100644 --- a/test/fixtures/error_corpus/javascript_errors.txt +++ b/test/fixtures/error_corpus/javascript_errors.txt @@ -37,10 +37,10 @@ h i j k; (identifier) (ERROR (identifier) (identifier))) (statement_block - (ERROR (identifier) (identifier) (identifier)) - (expression_statement (identifier)))) - (ERROR (identifier) (identifier) (identifier)) - (expression_statement (identifier))) + (ERROR (identifier)) + (expression_statement (identifier) (ERROR (identifier) (identifier))))) + (ERROR (identifier)) + (expression_statement (identifier) (ERROR (identifier) (identifier)))) =================================================== one invalid subtree right after the viable prefix diff --git a/test/runtime/parser_test.cc b/test/runtime/parser_test.cc index c5ddce41..420cf092 100644 --- a/test/runtime/parser_test.cc +++ b/test/runtime/parser_test.cc @@ -91,15 +91,15 @@ describe("Parser", [&]() { TSNode error = ts_node_named_child(ts_node_child(root, 0), 1); AssertThat(ts_node_type(error, document), Equals("ERROR")); - AssertThat(get_node_text(error), Equals(", @@@@@")); + AssertThat(get_node_text(error), Equals("@@@@@,")); AssertThat(ts_node_child_count(error), Equals(2)); - TSNode comma = ts_node_child(error, 0); - AssertThat(get_node_text(comma), Equals(",")); - - TSNode garbage = ts_node_child(error, 1); + TSNode garbage = ts_node_child(error, 0); AssertThat(get_node_text(garbage), Equals("@@@@@")); + TSNode comma = ts_node_child(error, 1); + AssertThat(get_node_text(comma), Equals(",")); + TSNode node_after_error = ts_node_next_named_sibling(error); AssertThat(ts_node_type(node_after_error, document), Equals("true")); AssertThat(get_node_text(node_after_error), Equals("true")); @@ -116,16 +116,17 @@ describe("Parser", [&]() { TSNode error = ts_node_named_child(ts_node_child(root, 0), 1); AssertThat(ts_node_type(error, document), Equals("ERROR")); + AssertThat(get_node_text(error), Equals("faaaaalse,")); AssertThat(ts_node_child_count(error), Equals(2)); - TSNode comma = ts_node_child(error, 0); - AssertThat(ts_node_type(comma, document), Equals(",")); - AssertThat(get_node_text(comma), Equals(",")); - - TSNode garbage = ts_node_child(error, 1); + TSNode garbage = ts_node_child(error, 0); AssertThat(ts_node_type(garbage, document), Equals("ERROR")); AssertThat(get_node_text(garbage), Equals("faaaaalse")); + TSNode comma = ts_node_child(error, 1); + AssertThat(ts_node_type(comma, document), Equals(",")); + AssertThat(get_node_text(comma), Equals(",")); + TSNode last = ts_node_next_named_sibling(error); AssertThat(ts_node_type(last, document), Equals("true")); AssertThat(ts_node_start_byte(last), Equals(strlen(" [123, faaaaalse, ")));