From 356d5e02218db3b3951f6f979edf47c263c0911b Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 May 2018 15:29:15 -0700 Subject: [PATCH 1/3] Generalize logic for finding a keyword capture token --- .../build_tables/lex_table_builder.cc | 268 +++++++++++------- src/compiler/build_tables/lex_table_builder.h | 30 +- src/compiler/build_tables/lookahead_set.cc | 26 ++ src/compiler/build_tables/lookahead_set.h | 2 + .../build_tables/parse_table_builder.cc | 34 +-- src/compiler/parse_table.cc | 4 + src/compiler/parse_table.h | 1 + 7 files changed, 244 insertions(+), 121 deletions(-) diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 001dbc81..4a507b81 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -15,6 +15,19 @@ #include "compiler/rule.h" #include "utf8proc.h" +namespace std { + +using tree_sitter::rules::Symbol; + +size_t hash>::operator()( + const pair &p +) const { + hash hasher; + return hasher(p.first) ^ hasher(p.second); +} + +} // namespace std + namespace tree_sitter { namespace build_tables { @@ -36,8 +49,24 @@ using rules::Symbol; using rules::Metadata; using rules::Seq; +static const std::unordered_set EMPTY; + +bool CoincidentTokenIndex::contains(Symbol a, Symbol b) const { + return a == b || !states_with(a, b).empty(); +} + +const std::unordered_set &CoincidentTokenIndex::states_with(Symbol a, Symbol b) const { + if (a.index > b.index) std::swap(a, b); + auto iter = entries.find({a.index, b.index}); + if (iter == entries.end()) { + return EMPTY; + } else { + return iter->second; + } +} + template -class StartOrEndCharacterAggregator { +class CharacterAggregator { public: void apply(const Rule &rule) { rule.match( @@ -62,8 +91,8 @@ class StartOrEndCharacterAggregator { CharacterSet result; }; -using StartingCharacterAggregator = StartOrEndCharacterAggregator; -using AllCharacterAggregator = StartOrEndCharacterAggregator; +using StartingCharacterAggregator = CharacterAggregator; +using AllCharacterAggregator = CharacterAggregator; class LexTableBuilderImpl : public LexTableBuilder { LexTable main_lex_table; @@ -75,7 +104,8 @@ class LexTableBuilderImpl : public LexTableBuilder { CharacterSet separator_start_characters; vector starting_characters_by_token; vector following_characters_by_token; - const vector &coincident_tokens_by_token; + const CoincidentTokenIndex &coincident_token_index; + ParseTable *parse_table; vector conflict_matrix; bool conflict_detection_mode; LookaheadSet keyword_symbols; @@ -86,11 +116,13 @@ class LexTableBuilderImpl : public LexTableBuilder { LexTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar, const unordered_map &following_tokens_by_token, - const vector &coincident_tokens) + const CoincidentTokenIndex &coincident_token_index, + ParseTable *parse_table) : grammar(lexical_grammar), starting_characters_by_token(lexical_grammar.variables.size()), following_characters_by_token(lexical_grammar.variables.size()), - coincident_tokens_by_token(coincident_tokens), + coincident_token_index(coincident_token_index), + parse_table(parse_table), conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch), conflict_detection_mode(false), keyword_capture_token(rules::NONE()) { @@ -106,51 +138,41 @@ class LexTableBuilderImpl : public LexTableBuilder { separator_start_characters = separator_character_aggregator.result; // Compute the set of characters that each token can start with and the set of non-separator - // characters that can follow each token. + // characters that can follow each token. Also identify all of the tokens that consist + // entirely of letters, and can be considered 'keywords'. + LOG_START("characterizing tokens"); + LookaheadSet potential_keyword_symbols; for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) { + Symbol token = Symbol::terminal(i); + StartingCharacterAggregator starting_character_aggregator; starting_character_aggregator.apply(grammar.variables[i].rule); starting_characters_by_token[i] = starting_character_aggregator.result; StartingCharacterAggregator following_character_aggregator; - const auto &following_tokens = following_tokens_by_token.find(Symbol::terminal(i)); + const auto &following_tokens = following_tokens_by_token.find(token); if (following_tokens != following_tokens_by_token.end()) { following_tokens->second.for_each([&](Symbol following_token) { following_character_aggregator.apply(grammar.variables[following_token.index].rule); return true; }); } + following_characters_by_token[i] = following_character_aggregator.result; - if (grammar.variables[i].is_string) { - AllCharacterAggregator aggregator; - aggregator.apply(grammar.variables[i].rule); - bool all_alpha = true, all_lower = true; - for (auto character : aggregator.result.included_chars) { - if (!iswalpha(character) && character != '_') all_alpha = false; - if (!iswlower(character)) all_lower = false; - } - - if (all_lower) { - keyword_symbols.insert(Symbol::terminal(i)); - } - - // TODO - Refactor this. In general, a keyword token cannot be followed immediately - // by another alphanumeric character. But this requirement is currently not expressed - // anywhere in the grammar. So without this hack, we would be overly conservative about - // merging parse states because we would often consider `identifier` tokens to *conflict* - // with keyword tokens. - if (all_alpha) { - following_character_aggregator.result - .exclude('a', 'z') - .exclude('A', 'Z') - .exclude('0', '9') - .exclude('_') - .exclude('$'); + AllCharacterAggregator aggregator; + aggregator.apply(grammar.variables[i].rule); + bool all_alpha = true; + for (auto character : aggregator.result.included_chars) { + if (!iswalpha(character) && character != '_') { + all_alpha = false; } } - - following_characters_by_token[i] = following_character_aggregator.result; + if (all_alpha) { + LOG("potential keyword: %s", token_name(token).c_str()); + potential_keyword_symbols.insert(token); + } } + LOG_END(); // For each pair of tokens, generate a lex table for just those two tokens and record what // conflicts arise. @@ -171,50 +193,102 @@ class LexTableBuilderImpl : public LexTableBuilder { } LOG_END(); - // Find a 'keyword capture token' that matches all of the indentified keywords. + LOG_START("finding keyword capture token"); for (Symbol::Index i = 0, n = grammar.variables.size(); i < n; i++) { - Symbol symbol = Symbol::terminal(i); - bool matches_all_keywords = true; - keyword_symbols.for_each([&](Symbol keyword_symbol) { - if (!(get_conflict_status(symbol, keyword_symbol) & MatchesSameString)) { - matches_all_keywords = false; + Symbol candidate = Symbol::terminal(i); + + LookaheadSet homonyms; + potential_keyword_symbols.for_each([&](Symbol other_token) { + if (get_conflict_status(other_token, candidate) & MatchesShorterStringWithinSeparators) { + homonyms.clear(); return false; } + if (get_conflict_status(candidate, other_token) == MatchesSameString) { + homonyms.insert(other_token); + } return true; }); - if (!matches_all_keywords) continue; + if (homonyms.empty()) continue; - // Don't use a token to capture keywords if it overlaps with separator characters. - AllCharacterAggregator capture_aggregator; - capture_aggregator.apply(grammar.variables[i].rule); - if (capture_aggregator.result.intersects(separator_start_characters)) continue; + LOG_START( + "keyword capture token candidate: %s, homonym count: %lu", + token_name(candidate).c_str(), + homonyms.size() + ); + + homonyms.for_each([&](Symbol homonym1) { + homonyms.for_each([&](Symbol homonym2) { + if (get_conflict_status(homonym1, homonym2) & MatchesSameString) { + LOG( + "conflict between homonyms %s %s", + token_name(homonym1).c_str(), + token_name(homonym2).c_str() + ); + homonyms.remove(homonym1); + } + return false; + }); + return true; + }); - // Don't use a token to capture keywords if it conflicts with other tokens - // that occur in the same state as a keyword. - bool shadows_other_tokens = false; for (Symbol::Index j = 0; j < n; j++) { - Symbol other_symbol = Symbol::terminal(j); - if ((get_conflict_status(other_symbol, symbol) & (MatchesShorterStringWithinSeparators|MatchesLongerStringWithValidNextChar)) && - !keyword_symbols.contains(other_symbol) && - keyword_symbols.intersects(coincident_tokens_by_token[j])) { - shadows_other_tokens = true; - break; + Symbol other_token = Symbol::terminal(j); + if (other_token == candidate || homonyms.contains(other_token)) continue; + bool candidate_shadows_other = get_conflict_status(other_token, candidate); + bool other_shadows_candidate = get_conflict_status(candidate, other_token); + + if (candidate_shadows_other || other_shadows_candidate) { + homonyms.for_each([&](Symbol homonym) { + bool other_shadows_homonym = get_conflict_status(homonym, other_token); + + bool candidate_was_already_present = true; + for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) { + if (!parse_table->states[state_id].has_terminal_entry(candidate)) { + candidate_was_already_present = false; + break; + } + } + + if (!candidate_was_already_present) { + if (candidate_shadows_other) { + homonyms.remove(homonym); + LOG( + "remove %s because candidate would shadow %s", + token_name(homonym).c_str(), + token_name(other_token).c_str() + ); + } else if (other_shadows_candidate != other_shadows_homonym) { + homonyms.remove(homonym); + LOG( + "remove %s because %s would shadow candidate", + token_name(homonym).c_str(), + token_name(other_token).c_str() + ); + } + } + return true; + }); } } - if (shadows_other_tokens) continue; - // If multiple keyword capture tokens are found, don't bother extracting - // the keywords into their own function. - if (keyword_capture_token == rules::NONE()) { - keyword_capture_token = symbol; - } else { - keyword_capture_token = rules::NONE(); - break; + if (homonyms.size() > keyword_symbols.size()) { + LOG_START("found capture token. homonyms:"); + homonyms.for_each([&](Symbol homonym) { + LOG("%s", token_name(homonym).c_str()); + return true; + }); + LOG_END(); + keyword_symbols = homonyms; + keyword_capture_token = candidate; } + + LOG_END(); } + + LOG_END(); } - BuildResult build(ParseTable *parse_table) { + BuildResult build() { clear(); conflict_detection_mode = false; vector>> starting_token_sets; @@ -250,8 +324,8 @@ class LexTableBuilderImpl : public LexTableBuilder { add_lex_state(keyword_lex_table, item_set_for_terminals(keyword_symbols, false)); - mark_fragile_tokens(parse_table); - remove_duplicate_lex_states(main_lex_table, parse_table); + mark_fragile_tokens(); + remove_duplicate_lex_states(main_lex_table); return {main_lex_table, keyword_lex_table, keyword_capture_token}; } @@ -266,10 +340,11 @@ class LexTableBuilderImpl : public LexTableBuilder { private: bool record_conflict(Symbol shadowed_token, Symbol other_token, ConflictStatus status) { + if (!conflict_detection_mode) return false; unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index; - bool old_value = conflict_matrix[index] & status; + bool was_set = conflict_matrix[index] & status; conflict_matrix[index] = static_cast(conflict_matrix[index] | status); - return old_value; + return !was_set; } LexStateId add_lex_state(LexTable &lex_table, const LexItemSet &item_set) { @@ -313,8 +388,12 @@ class LexTableBuilderImpl : public LexTableBuilder { auto advance_symbol = transition.destination.entries.begin()->lhs; auto &following_chars = following_characters_by_token[accept_action.symbol.index]; CharacterSet conflicting_following_chars = characters.intersection(following_chars); - CharacterSet conflicting_sep_chars = characters.intersection(separator_start_characters); - if (!conflicting_following_chars.is_empty()) { + if (conflicting_following_chars.is_empty()) { + conflicting_following_chars = characters.intersection(separator_start_characters); + } + if (conflicting_following_chars.is_empty()) { + record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString); + } else { if (record_conflict( accept_action.symbol, advance_symbol, @@ -327,21 +406,6 @@ class LexTableBuilderImpl : public LexTableBuilder { log_char(*conflicting_following_chars.included_chars.begin()) ); } - } else if (!conflicting_sep_chars.is_empty()) { - if (record_conflict( - accept_action.symbol, - advance_symbol, - MatchesLongerStringWithValidNextChar - )) { - LOG( - "%s shadows %s followed by '%s'", - token_name(advance_symbol).c_str(), - token_name(accept_action.symbol).c_str(), - log_char(*conflicting_sep_chars.included_chars.begin()) - ); - } - } else { - record_conflict(accept_action.symbol, advance_symbol, MatchesLongerString); } } } @@ -364,9 +428,21 @@ class LexTableBuilderImpl : public LexTableBuilder { AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action; if (existing_action.is_present()) { if (should_replace_accept_action(existing_action, action)) { - record_conflict(existing_action.symbol, action.symbol, MatchesSameString); + if (record_conflict(existing_action.symbol, action.symbol, MatchesSameString)) { + LOG( + "%s shadows %s - same length", + token_name(action.symbol).c_str(), + token_name(existing_action.symbol).c_str() + ); + } } else { - record_conflict(action.symbol, existing_action.symbol, MatchesSameString); + if (record_conflict(action.symbol, existing_action.symbol, MatchesSameString)) { + LOG( + "%s shadows %s - same length", + token_name(existing_action.symbol).c_str(), + token_name(action.symbol).c_str() + ); + } continue; } } @@ -375,7 +451,7 @@ class LexTableBuilderImpl : public LexTableBuilder { } } - void mark_fragile_tokens(ParseTable *parse_table) { + void mark_fragile_tokens() { for (ParseState &state : parse_table->states) { for (auto &entry : state.terminal_entries) { Symbol token = entry.first; @@ -401,7 +477,7 @@ class LexTableBuilderImpl : public LexTableBuilder { const LookaheadSet &existing_set = in_left ? right : *left; existing_set.for_each([&](Symbol existing_symbol) { if ((get_conflict_status(existing_symbol, different_symbol) & CannotDistinguish) || - !coincident_tokens_by_token[different_symbol.index].contains(existing_symbol)) { + !coincident_token_index.contains(different_symbol, existing_symbol)) { is_compatible = false; return false; } @@ -417,7 +493,7 @@ class LexTableBuilderImpl : public LexTableBuilder { return is_compatible; } - void remove_duplicate_lex_states(LexTable &lex_table, ParseTable *parse_table) { + void remove_duplicate_lex_states(LexTable &lex_table) { for (LexState &state : lex_table.states) { state.accept_action.is_string = false; state.accept_action.precedence = 0; @@ -541,7 +617,7 @@ class LexTableBuilderImpl : public LexTableBuilder { main_lex_state_ids.clear(); } - string token_name(rules::Symbol &symbol) { + string token_name(const rules::Symbol &symbol) { const LexicalVariable &variable = grammar.variables[symbol.index]; if (variable.type == VariableTypeNamed) { return variable.name; @@ -563,17 +639,19 @@ class LexTableBuilderImpl : public LexTableBuilder { unique_ptr LexTableBuilder::create(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar, const unordered_map &following_tokens, - const vector &coincident_tokens) { + const CoincidentTokenIndex &coincident_tokens, + ParseTable *parse_table) { return unique_ptr(new LexTableBuilderImpl( syntax_grammar, lexical_grammar, following_tokens, - coincident_tokens + coincident_tokens, + parse_table )); } -LexTableBuilder::BuildResult LexTableBuilder::build(ParseTable *parse_table) { - return static_cast(this)->build(parse_table); +LexTableBuilder::BuildResult LexTableBuilder::build() { + return static_cast(this)->build(); } ConflictStatus LexTableBuilder::get_conflict_status(Symbol a, Symbol b) const { diff --git a/src/compiler/build_tables/lex_table_builder.h b/src/compiler/build_tables/lex_table_builder.h index 2a1051aa..4ec4f22b 100644 --- a/src/compiler/build_tables/lex_table_builder.h +++ b/src/compiler/build_tables/lex_table_builder.h @@ -4,9 +4,22 @@ #include #include #include -#include +#include +#include +#include "compiler/parse_table.h" #include "compiler/lex_table.h" +namespace std { + +using tree_sitter::rules::Symbol; + +template <> +struct hash> { + size_t operator()(const pair &) const; +}; + +} // namespace std + namespace tree_sitter { struct ParseTable; @@ -30,12 +43,23 @@ enum ConflictStatus { ), }; +struct CoincidentTokenIndex { + std::unordered_map< + std::pair, + std::unordered_set + > entries; + + bool contains(rules::Symbol, rules::Symbol) const; + const std::unordered_set &states_with(rules::Symbol, rules::Symbol) const; +}; + class LexTableBuilder { public: static std::unique_ptr create(const SyntaxGrammar &, const LexicalGrammar &, const std::unordered_map &, - const std::vector &); + const CoincidentTokenIndex &, + ParseTable *); struct BuildResult { LexTable main_table; @@ -43,7 +67,7 @@ class LexTableBuilder { rules::Symbol keyword_capture_token; }; - BuildResult build(ParseTable *); + BuildResult build(); ConflictStatus get_conflict_status(rules::Symbol, rules::Symbol) const; diff --git a/src/compiler/build_tables/lookahead_set.cc b/src/compiler/build_tables/lookahead_set.cc index 80ec58e1..6e5f73b5 100644 --- a/src/compiler/build_tables/lookahead_set.cc +++ b/src/compiler/build_tables/lookahead_set.cc @@ -117,5 +117,31 @@ bool LookaheadSet::insert(const Symbol &symbol) { return false; } +bool LookaheadSet::remove(const Symbol &symbol) { + if (symbol == rules::END_OF_INPUT()) { + if (eof) { + eof = false; + return true; + } + return false; + } + + auto &bits = symbol.is_external() ? external_bits : terminal_bits; + if (bits.size() > static_cast(symbol.index)) { + if (bits[symbol.index]) { + bits[symbol.index] = false; + return true; + } + } + + return false; +} + +void LookaheadSet::clear() { + eof = false; + terminal_bits.clear(); + external_bits.clear(); +} + } // namespace build_tables } // namespace tree_sitter diff --git a/src/compiler/build_tables/lookahead_set.h b/src/compiler/build_tables/lookahead_set.h index bb9eeff9..6445969d 100644 --- a/src/compiler/build_tables/lookahead_set.h +++ b/src/compiler/build_tables/lookahead_set.h @@ -22,6 +22,8 @@ class LookaheadSet { bool contains(const rules::Symbol &) const; bool insert_all(const LookaheadSet &); bool insert(const rules::Symbol &); + bool remove(const rules::Symbol &); + void clear(); bool intersects(const LookaheadSet &) const; template diff --git a/src/compiler/build_tables/parse_table_builder.cc b/src/compiler/build_tables/parse_table_builder.cc index 127b866c..0e6b4247 100644 --- a/src/compiler/build_tables/parse_table_builder.cc +++ b/src/compiler/build_tables/parse_table_builder.cc @@ -52,28 +52,14 @@ class ParseTableBuilderImpl : public ParseTableBuilder { ParseItemSetBuilder item_set_builder; unique_ptr lex_table_builder; unordered_map following_tokens_by_token; - vector coincident_tokens_by_token; + CoincidentTokenIndex coincident_token_index; set> logged_conflict_tokens; public: ParseTableBuilderImpl(const SyntaxGrammar &syntax_grammar, const LexicalGrammar &lexical_grammar) : grammar(syntax_grammar), lexical_grammar(lexical_grammar), - item_set_builder(syntax_grammar, lexical_grammar), - coincident_tokens_by_token(lexical_grammar.variables.size()) { - - for (unsigned i = 0, n = lexical_grammar.variables.size(); i < n; i++) { - coincident_tokens_by_token[i].insert(rules::END_OF_INPUT()); - if (lexical_grammar.variables[i].is_string) { - for (unsigned j = 0; j < i; j++) { - if (lexical_grammar.variables[j].is_string) { - coincident_tokens_by_token[i].insert(Symbol::terminal(j)); - coincident_tokens_by_token[j].insert(Symbol::terminal(i)); - } - } - } - } - } + item_set_builder(syntax_grammar, lexical_grammar) {} BuildResult build() { // Ensure that the empty rename sequence has index 0. @@ -106,7 +92,8 @@ class ParseTableBuilderImpl : public ParseTableBuilder { grammar, lexical_grammar, following_tokens_by_token, - coincident_tokens_by_token + coincident_token_index, + &parse_table ); build_error_parse_state(error_state_id); @@ -115,7 +102,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { eliminate_unit_reductions(); populate_used_terminals(); - auto lex_table_result = lex_table_builder->build(&parse_table); + auto lex_table_result = lex_table_builder->build(); return { parse_table, lex_table_result.main_table, @@ -161,8 +148,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { bool conflicts_with_other_tokens = false; for (unsigned j = 0; j < lexical_grammar.variables.size(); j++) { Symbol other_token = Symbol::terminal(j); - if (j != i && - !coincident_tokens_by_token[token.index].contains(other_token) && + if (!coincident_token_index.contains(token, other_token) && (lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) { conflicts_with_other_tokens = true; break; @@ -184,7 +170,7 @@ class ParseTableBuilderImpl : public ParseTableBuilder { } else { bool conflicts_with_other_tokens = false; conflict_free_tokens.for_each([&](Symbol other_token) { - if (!coincident_tokens_by_token[token.index].contains(other_token) && + if (!coincident_token_index.contains(token, other_token) && (lex_table_builder->get_conflict_status(other_token, token) & CannotMerge)) { LOG( "exclude %s: conflicts with %s", @@ -332,8 +318,10 @@ class ParseTableBuilderImpl : public ParseTableBuilder { if (iter->first.is_built_in() || iter->first.is_external()) continue; for (auto other_iter = terminals.begin(); other_iter != iter; ++other_iter) { if (other_iter->first.is_built_in() || other_iter->first.is_external()) continue; - coincident_tokens_by_token[iter->first.index].insert(other_iter->first); - coincident_tokens_by_token[other_iter->first.index].insert(iter->first); + coincident_token_index.entries[{ + other_iter->first.index, + iter->first.index + }].insert(state_id); } } diff --git a/src/compiler/parse_table.cc b/src/compiler/parse_table.cc index ed41d473..252185f4 100644 --- a/src/compiler/parse_table.cc +++ b/src/compiler/parse_table.cc @@ -123,6 +123,10 @@ bool ParseState::has_shift_action() const { return (!nonterminal_entries.empty()); } +bool ParseState::has_terminal_entry(rules::Symbol symbol) const { + return terminal_entries.find(symbol) != terminal_entries.end(); +} + void ParseState::each_referenced_state(function fn) { for (auto &entry : terminal_entries) for (ParseAction &action : entry.second.actions) diff --git a/src/compiler/parse_table.h b/src/compiler/parse_table.h index 770deafb..bf85c4b7 100644 --- a/src/compiler/parse_table.h +++ b/src/compiler/parse_table.h @@ -65,6 +65,7 @@ struct ParseState { bool merge(const ParseState &); void each_referenced_state(std::function); bool has_shift_action() const; + bool has_terminal_entry(rules::Symbol) const; std::map terminal_entries; std::map nonterminal_entries; From 45c52f94596ac5b590c3299c86e26378edb3157f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 May 2018 21:24:53 -0700 Subject: [PATCH 2/3] Allow keywords to contain numbers, as long as they start w/ a letter --- .../build_tables/lex_table_builder.cc | 67 +++++++++++-------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/src/compiler/build_tables/lex_table_builder.cc b/src/compiler/build_tables/lex_table_builder.cc index 4a507b81..178cfb75 100644 --- a/src/compiler/build_tables/lex_table_builder.cc +++ b/src/compiler/build_tables/lex_table_builder.cc @@ -138,8 +138,8 @@ class LexTableBuilderImpl : public LexTableBuilder { separator_start_characters = separator_character_aggregator.result; // Compute the set of characters that each token can start with and the set of non-separator - // characters that can follow each token. Also identify all of the tokens that consist - // entirely of letters, and can be considered 'keywords'. + // characters that can follow each token. Also identify all of the tokens that can be + // considered 'keywords'. LOG_START("characterizing tokens"); LookaheadSet potential_keyword_symbols; for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) { @@ -159,18 +159,30 @@ class LexTableBuilderImpl : public LexTableBuilder { } following_characters_by_token[i] = following_character_aggregator.result; - AllCharacterAggregator aggregator; - aggregator.apply(grammar.variables[i].rule); - bool all_alpha = true; - for (auto character : aggregator.result.included_chars) { - if (!iswalpha(character) && character != '_') { - all_alpha = false; + AllCharacterAggregator all_character_aggregator; + all_character_aggregator.apply(grammar.variables[i].rule); + + if ( + !starting_character_aggregator.result.includes_all && + !all_character_aggregator.result.includes_all + ) { + bool starts_alpha = true, all_alnum = true; + for (auto character : starting_character_aggregator.result.included_chars) { + if (!iswalpha(character) && character != '_') { + starts_alpha = false; + } + } + for (auto character : all_character_aggregator.result.included_chars) { + if (!iswalnum(character) && character != '_') { + all_alnum = false; + } + } + if (starts_alpha && all_alnum) { + LOG("potential keyword: %s", token_name(token).c_str()); + potential_keyword_symbols.insert(token); } } - if (all_alpha) { - LOG("potential keyword: %s", token_name(token).c_str()); - potential_keyword_symbols.insert(token); - } + } LOG_END(); @@ -248,23 +260,22 @@ class LexTableBuilderImpl : public LexTableBuilder { break; } } + if (candidate_was_already_present) return true; - if (!candidate_was_already_present) { - if (candidate_shadows_other) { - homonyms.remove(homonym); - LOG( - "remove %s because candidate would shadow %s", - token_name(homonym).c_str(), - token_name(other_token).c_str() - ); - } else if (other_shadows_candidate != other_shadows_homonym) { - homonyms.remove(homonym); - LOG( - "remove %s because %s would shadow candidate", - token_name(homonym).c_str(), - token_name(other_token).c_str() - ); - } + if (candidate_shadows_other) { + homonyms.remove(homonym); + LOG( + "remove %s because candidate would shadow %s", + token_name(homonym).c_str(), + token_name(other_token).c_str() + ); + } else if (other_shadows_candidate && !other_shadows_homonym) { + homonyms.remove(homonym); + LOG( + "remove %s because %s would shadow candidate", + token_name(homonym).c_str(), + token_name(other_token).c_str() + ); } return true; }); From 8120e61d8d9035cf4d9c0a53c8127f50bf82c085 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 25 May 2018 21:37:25 -0700 Subject: [PATCH 3/3] Remove blank lines from log messages --- src/compiler/log.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/compiler/log.h b/src/compiler/log.h index d781cb41..2f7ad3e2 100644 --- a/src/compiler/log.h +++ b/src/compiler/log.h @@ -20,7 +20,6 @@ void _print_indent(); #define LOG_END(...) \ do { \ _outdent_logs(); \ - LOG(""); \ } while (0) #define LOG(...) \