From e17cd42e47bc3a1543dd2eef5692361534ec29f2 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld
Date: Wed, 13 Jun 2018 16:54:11 -0700
Subject: [PATCH] Perform keyword optimization using explicitly selected word token rather than trying to infer the word token automatically.

Co-Authored-By: Ashi Krishnan
---
 docs/section-3-creating-parsers.md           |  24 +++
 include/tree_sitter/compiler.h               |   1 +
 .../build_tables/lex_table_builder.cc        | 180 ++++++++----------
 src/compiler/grammar.h                       |   1 +
 src/compiler/parse_grammar.cc                |  14 +-
 .../prepare_grammar/expand_repeats.cc        |   1 +
 .../prepare_grammar/extract_tokens.cc        |  12 ++
 .../prepare_grammar/flatten_grammar.cc       |   2 +
 .../prepare_grammar/initial_syntax_grammar.h |   1 +
 .../prepare_grammar/intern_symbols.cc        |   2 +
 .../prepare_grammar/interned_grammar.h       |   2 +-
 src/compiler/syntax_grammar.h                |   1 +
 12 files changed, 142 insertions(+), 99 deletions(-)

diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md
index 268b034a..8be78926 100644
--- a/docs/section-3-creating-parsers.md
+++ b/docs/section-3-creating-parsers.md
@@ -217,6 +217,7 @@ In addition to the `name` and `rules` fields, grammars have a few other public f
 * `inline` - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime.
 * `conflicts` - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree rule with the highest *dynamic precedence*.
 * `externals` - an array of token names which can be returned by an *external scanner*. External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions.
+* `word` - the name of a token that will match keywords, for the purpose of [keyword optimization](#keyword-optimization).
 
 ## Adjusting existing grammars
 
@@ -359,6 +360,29 @@ You may have noticed in the above examples that some of the grammar rule name li
 
 TODO
 
+## Keyword Optimization
+
+Many languages have a set of keywords: tokens that look like identifiers but
+have special meaning. For example, in Algol-like languages `if` is a keyword.
+It could be a variable name, and in some contexts (e.g. JavaScript object
+literals like `{if: 'something'}`) it is interpreted as one, but in most
+contexts it has special meaning.
+
+You'll know whether your grammar has keywords, because they end up in the
+grammar as string literals or as regexes that match a small, finite set of
+strings.
+
+The parser generated naïvely from such a grammar can be huge and very slow to
+compile. Keyword optimization is the fix. Instead of building a lexer that
+looks for `choice('break', 'continue', 'async', ...)` wherever those keywords
+might occur, declaring `word: $ => $.identifier` instructs Tree-sitter to lex
+an `identifier` wherever it would otherwise have lexed one of those keywords,
+and then to check whether the parsed `identifier` actually matches a keyword.
+
+You don't have to specify which words are keywords. Tree-sitter identifies
+them automatically, as the set of terminals that the word token could match.
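+
+As a concrete illustration, here is a minimal sketch of a `grammar.js` that
+uses the `word` field (the language and rule names below are hypothetical,
+chosen only to show where `word` fits):
+
+```js
+module.exports = grammar({
+  name: 'example_language',
+
+  // Tell Tree-sitter which token to use for keyword optimization.
+  word: $ => $.identifier,
+
+  rules: {
+    source_file: $ => repeat($._statement),
+
+    _statement: $ => choice(
+      $.break_statement,
+      $.expression_statement
+    ),
+
+    // `'break'` is a keyword: a string that `identifier` could also match.
+    break_statement: $ => seq('break', ';'),
+
+    expression_statement: $ => seq($.identifier, ';'),
+
+    identifier: $ => /[a-z_]+/
+  }
+});
+```
+
+With the `word` field set, the generated lexer no longer needs to match
+`'break'` separately in every context where it can appear; it lexes an
+`identifier` there and then checks whether the text is actually the keyword.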
+
+
 [cst]: https://en.wikipedia.org/wiki/Parse_tree
 [non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols
 [language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification
diff --git a/include/tree_sitter/compiler.h b/include/tree_sitter/compiler.h
index ca2a28f7..3db2f7ca 100644
--- a/include/tree_sitter/compiler.h
+++ b/include/tree_sitter/compiler.h
@@ -19,6 +19,7 @@ typedef enum {
   TSCompileErrorTypeEpsilonRule,
   TSCompileErrorTypeInvalidTokenContents,
   TSCompileErrorTypeInvalidRuleName,
+  TSCompileErrorTypeInvalidWordRule,
 } TSCompileErrorType;
 
 typedef struct {
diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc
index 178cfb75..5e4bfa85 100644
--- a/src/compiler/build_tables/lex_table_builder.cc
+++ b/src/compiler/build_tables/lex_table_builder.cc
@@ -109,7 +109,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
   vector conflict_matrix;
   bool conflict_detection_mode;
   LookaheadSet keyword_symbols;
-  Symbol keyword_capture_token;
+  Symbol word_rule;
   char encoding_buffer[8];
 
 public:
@@ -125,7 +125,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
       parse_table(parse_table),
      conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch),
       conflict_detection_mode(false),
-      keyword_capture_token(rules::NONE()) {
+      word_rule(syntax_grammar.word_rule) {
 
     // Compute the possible separator rules and the set of separator characters that can occur
     // immediately after any token.
@@ -182,7 +182,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
           potential_keyword_symbols.insert(token);
         }
       }
-    }
 
     LOG_END();
@@ -205,100 +204,87 @@ class LexTableBuilderImpl : public LexTableBuilder {
     }
     LOG_END();
 
-    LOG_START("finding keyword capture token");
-    for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) {
-      Symbol candidate = Symbol::terminal(i);
-
-      LookaheadSet homonyms;
-      potential_keyword_symbols.for_each([&](Symbol other_token) {
-        if (get_conflict_status(other_token, candidate) & MatchesShorterStringWithinSeparators) {
-          homonyms.clear();
-          return false;
-        }
-        if (get_conflict_status(candidate, other_token) == MatchesSameString) {
-          homonyms.insert(other_token);
-        }
-        return true;
-      });
-      if (homonyms.empty()) continue;
-
-      LOG_START(
-        "keyword capture token candidate: %s, homonym count: %lu",
-        token_name(candidate).c_str(),
-        homonyms.size()
-      );
-
-      homonyms.for_each([&](Symbol homonym1) {
-        homonyms.for_each([&](Symbol homonym2) {
-          if (get_conflict_status(homonym1, homonym2) & MatchesSameString) {
-            LOG(
-              "conflict between homonyms %s %s",
-              token_name(homonym1).c_str(),
-              token_name(homonym2).c_str()
-            );
-            homonyms.remove(homonym1);
-          }
-          return false;
-        });
-        return true;
-      });
-
-      for (Symbol::Index j = 0; j < n; j++) {
-        Symbol other_token = Symbol::terminal(j);
-        if (other_token == candidate || homonyms.contains(other_token)) continue;
-        bool candidate_shadows_other = get_conflict_status(other_token, candidate);
-        bool other_shadows_candidate = get_conflict_status(candidate, other_token);
-
-        if (candidate_shadows_other || other_shadows_candidate) {
-          homonyms.for_each([&](Symbol homonym) {
-            bool other_shadows_homonym = get_conflict_status(homonym, other_token);
-
-            bool candidate_was_already_present = true;
-            for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
-              if (!parse_table->states[state_id].has_terminal_entry(candidate)) {
-                candidate_was_already_present = false;
-                break;
-              }
-            }
-            if (candidate_was_already_present) return true;
-
-            if (candidate_shadows_other) {
-              homonyms.remove(homonym);
-              LOG(
-                "remove %s because candidate would shadow %s",
-                token_name(homonym).c_str(),
-                token_name(other_token).c_str()
-              );
-            } else if (other_shadows_candidate && !other_shadows_homonym) {
-              homonyms.remove(homonym);
-              LOG(
-                "remove %s because %s would shadow candidate",
-                token_name(homonym).c_str(),
-                token_name(other_token).c_str()
-              );
-            }
-            return true;
-          });
-        }
-      }
-
-      if (homonyms.size() > keyword_symbols.size()) {
-        LOG_START("found capture token. homonyms:");
-        homonyms.for_each([&](Symbol homonym) {
-          LOG("%s", token_name(homonym).c_str());
-          return true;
-        });
-        LOG_END();
-        keyword_symbols = homonyms;
-        keyword_capture_token = candidate;
-      }
-
-      LOG_END();
+    if (word_rule != rules::NONE()) {
+      identify_keywords(potential_keyword_symbols);
     }
 
     LOG_END();
   }
 
+  void identify_keywords(const LookaheadSet &potential_keyword_symbols) {
+    LookaheadSet homonyms;
+    potential_keyword_symbols.for_each([&](Symbol other_token) {
+      if (get_conflict_status(word_rule, other_token) == MatchesSameString) {
+        homonyms.insert(other_token);
+      }
+      return true;
+    });
+
+    homonyms.for_each([&](Symbol homonym1) {
+      homonyms.for_each([&](Symbol homonym2) {
+        if (get_conflict_status(homonym1, homonym2) & MatchesSameString) {
+          LOG(
+            "conflict between homonyms %s %s",
+            token_name(homonym1).c_str(),
+            token_name(homonym2).c_str()
+          );
+          homonyms.remove(homonym1);
+        }
+        return false;
+      });
+      return true;
+    });
+
+    for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
+      Symbol other_token = Symbol::terminal(j);
+      if (other_token == word_rule || homonyms.contains(other_token)) continue;
+      bool word_rule_shadows_other = get_conflict_status(other_token, word_rule);
+      bool other_shadows_word_rule = get_conflict_status(word_rule, other_token);
+
+      if (word_rule_shadows_other || other_shadows_word_rule) {
+        homonyms.for_each([&](Symbol homonym) {
+          bool other_shadows_homonym = get_conflict_status(homonym, other_token);
+
+          bool word_rule_was_already_present = true;
+          for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
+            if (!parse_table->states[state_id].has_terminal_entry(word_rule)) {
+              word_rule_was_already_present = false;
+              break;
+            }
+          }
+          if (word_rule_was_already_present) return true;
+
+          if (word_rule_shadows_other) {
+            homonyms.remove(homonym);
+            LOG(
+              "remove %s because word_rule would shadow %s",
+              token_name(homonym).c_str(),
+              token_name(other_token).c_str()
+            );
+          } else if (other_shadows_word_rule && !other_shadows_homonym) {
+            homonyms.remove(homonym);
+            LOG(
+              "remove %s because %s would shadow word_rule",
+              token_name(homonym).c_str(),
+              token_name(other_token).c_str()
+            );
+          }
+          return true;
+        });
+      }
+    }
+
+    if (!homonyms.empty()) {
+      LOG_START("found keywords:");
+      homonyms.for_each([&](Symbol homonym) {
+        LOG("%s", token_name(homonym).c_str());
+        return true;
+      });
+      LOG_END();
+      keyword_symbols = homonyms;
+    }
+  }
+
   BuildResult build() {
     clear();
     conflict_detection_mode = false;
@@ -307,8 +293,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
     for (ParseState &parse_state : parse_table->states) {
       LookaheadSet token_set;
       for (auto &entry : parse_state.terminal_entries) {
-        if (keyword_capture_token.is_terminal() && keyword_symbols.contains(entry.first)) {
-          token_set.insert(keyword_capture_token);
+        if (word_rule.is_terminal() && keyword_symbols.contains(entry.first)) {
+          token_set.insert(word_rule);
         } else {
           token_set.insert(entry.first);
         }
@@ -337,7 +323,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
     mark_fragile_tokens();
     remove_duplicate_lex_states(main_lex_table);
 
-    return {main_lex_table, keyword_lex_table, keyword_capture_token};
+    return {main_lex_table, keyword_lex_table, word_rule};
   }
 
   ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const {
@@ -411,10 +397,10 @@ class LexTableBuilderImpl : public LexTableBuilder {
         MatchesLongerStringWithValidNextChar
       )) {
         LOG(
-          "%s shadows %s followed by '%s'",
+          "%s shadows %s followed by %d",
          token_name(advance_symbol).c_str(),
          token_name(accept_action.symbol).c_str(),
-          log_char(*conflicting_following_chars.included_chars.begin())
+          *conflicting_following_chars.included_chars.begin()
         );
       }
     }
diff --git a/src/compiler/grammar.h b/src/compiler/grammar.h
index 6c63340c..5e2212fb 100644
--- a/src/compiler/grammar.h
+++ b/src/compiler/grammar.h
@@ -32,6 +32,7 @@ struct InputGrammar {
   std::vector> expected_conflicts;
   std::vector external_tokens;
   std::unordered_set variables_to_inline;
+  rules::NamedSymbol word_rule;
 };
 
 } // namespace tree_sitter
diff --git a/src/compiler/parse_grammar.cc b/src/compiler/parse_grammar.cc
index e233cfe0..f589d15a 100644
--- a/src/compiler/parse_grammar.cc
+++ b/src/compiler/parse_grammar.cc
@@ -229,7 +229,9 @@ ParseGrammarResult parse_grammar(const string &input) {
   string error_message;
   string name;
   InputGrammar grammar;
-  json_value name_json, rules_json, extras_json, conflicts_json, external_tokens_json, inline_rules_json;
+  json_value
+    name_json, rules_json, extras_json, conflicts_json, external_tokens_json,
+    inline_rules_json, word_rule_json;
   json_settings settings = { 0, json_enable_comments, 0, 0, 0, 0 };
   char parse_error[json_error_max];
 
@@ -359,6 +361,16 @@ ParseGrammarResult parse_grammar(const string &input) {
     }
   }
 
+  word_rule_json = grammar_json->operator[]("word");
+  if (word_rule_json.type != json_none) {
+    if (word_rule_json.type != json_string) {
+      error_message = "Invalid word property";
+      goto error;
+    }
+
+    grammar.word_rule = NamedSymbol { word_rule_json.u.string.ptr };
+  }
+
   json_value_free(grammar_json);
   return { name, grammar, "" };
 
diff --git a/src/compiler/prepare_grammar/expand_repeats.cc b/src/compiler/prepare_grammar/expand_repeats.cc
index 46230867..42878376 100644
--- a/src/compiler/prepare_grammar/expand_repeats.cc
+++ b/src/compiler/prepare_grammar/expand_repeats.cc
@@ -106,6 +106,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
     expander.aux_rules.end()
   );
 
+  result.word_rule = grammar.word_rule;
   return result;
 }
 
diff --git a/src/compiler/prepare_grammar/extract_tokens.cc b/src/compiler/prepare_grammar/extract_tokens.cc
index 93b06be2..c82b3505 100644
--- a/src/compiler/prepare_grammar/extract_tokens.cc
+++ b/src/compiler/prepare_grammar/extract_tokens.cc
@@ -329,6 +329,18 @@ tuple extract_tokens(
     }
   }
 
+  syntax_grammar.word_rule = symbol_replacer.replace_symbol(grammar.word_rule);
+  if (syntax_grammar.word_rule.is_non_terminal()) {
+    return make_tuple(
+      syntax_grammar,
+      lexical_grammar,
+      CompileError(
+        TSCompileErrorTypeInvalidWordRule,
+        "Word rules must be tokens"
+      )
+    );
+  }
+
   return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
 }
 
diff --git a/src/compiler/prepare_grammar/flatten_grammar.cc b/src/compiler/prepare_grammar/flatten_grammar.cc
index e135ee67..ebfc3ae4 100644
--- a/src/compiler/prepare_grammar/flatten_grammar.cc
+++ b/src/compiler/prepare_grammar/flatten_grammar.cc
@@ -161,6 +161,8 @@ pair flatten_grammar(const InitialSyntaxGrammar &gr
     i++;
   }
 
+  result.word_rule = grammar.word_rule;
+
   return {result, CompileError::none()};
 }
 
diff --git a/src/compiler/prepare_grammar/initial_syntax_grammar.h b/src/compiler/prepare_grammar/initial_syntax_grammar.h
index 881c6396..79cc951e 100644
--- a/src/compiler/prepare_grammar/initial_syntax_grammar.h
+++ b/src/compiler/prepare_grammar/initial_syntax_grammar.h
@@ -17,6 +17,7 @@ struct InitialSyntaxGrammar {
   std::set> expected_conflicts;
   std::vector external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule = rules::NONE();
 };
 
 } // namespace prepare_grammar
diff --git a/src/compiler/prepare_grammar/intern_symbols.cc b/src/compiler/prepare_grammar/intern_symbols.cc
index 4e610960..dc128779 100644
--- a/src/compiler/prepare_grammar/intern_symbols.cc
+++ b/src/compiler/prepare_grammar/intern_symbols.cc
@@ -166,6 +166,8 @@ pair intern_symbols(const InputGrammar &grammar)
     }
   }
 
+  result.word_rule = interner.intern_symbol(grammar.word_rule);
+
   return {result, CompileError::none()};
 }
 
diff --git a/src/compiler/prepare_grammar/interned_grammar.h b/src/compiler/prepare_grammar/interned_grammar.h
index 83117ced..fc322522 100644
--- a/src/compiler/prepare_grammar/interned_grammar.h
+++ b/src/compiler/prepare_grammar/interned_grammar.h
@@ -15,8 +15,8 @@ struct InternedGrammar {
   std::vector extra_tokens;
   std::set> expected_conflicts;
   std::vector external_tokens;
-  std::set blank_external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule;
 };
 
 } // namespace prepare_grammar
diff --git a/src/compiler/syntax_grammar.h b/src/compiler/syntax_grammar.h
index 2d55686b..ff056e3f 100644
--- a/src/compiler/syntax_grammar.h
+++ b/src/compiler/syntax_grammar.h
@@ -60,6 +60,7 @@ struct SyntaxGrammar {
   std::set> expected_conflicts;
   std::vector external_tokens;
   std::set variables_to_inline;
+  rules::Symbol word_rule = rules::NONE();
 };
 
 } // namespace tree_sitter