Rename word_rule -> word_token

parent 2a2e5032d4
commit c39f0e9ef9

11 changed files with 99 additions and 96 deletions
@@ -5,7 +5,6 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
-#include <cwctype>
 #include <vector>
 #include "compiler/build_tables/lex_item.h"
 #include "compiler/build_tables/lookahead_set.h"
@@ -15,23 +14,9 @@
 #include "compiler/rule.h"
 #include "utf8proc.h"
 
-namespace std {
-
-using tree_sitter::rules::Symbol;
-
-size_t hash<pair<Symbol::Index, Symbol::Index>>::operator()(
-  const pair<Symbol::Index, Symbol::Index> &p
-) const {
-  hash<Symbol::Index> hasher;
-  return hasher(p.first) ^ hasher(p.second);
-}
-
-} // namespace std
-
 namespace tree_sitter {
 namespace build_tables {
 
-using std::iswalpha;
 using std::map;
 using std::move;
 using std::pair;
@@ -39,6 +24,7 @@ using std::set;
 using std::string;
 using std::vector;
 using std::unordered_map;
+using std::unordered_set;
 using std::unique_ptr;
 using rules::Rule;
 using rules::Blank;
@@ -49,60 +35,30 @@ using rules::Symbol;
 using rules::Metadata;
 using rules::Seq;
 
-enum ConflictStatus {
-  DoesNotMatch = 0,
-  MatchesShorterStringWithinSeparators = 1 << 0,
-  MatchesSameString = 1 << 1,
-  MatchesLongerString = 1 << 2,
-  MatchesLongerStringWithValidNextChar = 1 << 3,
-  CannotDistinguish = (
-    MatchesShorterStringWithinSeparators |
-    MatchesSameString |
-    MatchesLongerStringWithValidNextChar
-  ),
-};
-
-static const std::unordered_set<ParseStateId> EMPTY;
-
 bool CoincidentTokenIndex::contains(Symbol a, Symbol b) const {
   return a == b || !states_with(a, b).empty();
 }
 
-const std::unordered_set<ParseStateId> &CoincidentTokenIndex::states_with(Symbol a, Symbol b) const {
+const unordered_set<ParseStateId> &CoincidentTokenIndex::states_with(Symbol a, Symbol b) const {
+  static const unordered_set<ParseStateId> NO_STATES;
   if (a.index > b.index) std::swap(a, b);
   auto iter = entries.find({a.index, b.index});
   if (iter == entries.end()) {
-    return EMPTY;
+    return NO_STATES;
   } else {
     return iter->second;
   }
 }
 
-class StartingCharacterAggregator {
- public:
-  void apply(const Rule &rule) {
-    rule.match(
-      [this](const Seq &sequence) {
-        apply(*sequence.left);
-      },
-
-      [this](const rules::Choice &rule) {
-        for (const auto &element : rule.elements) {
-          apply(element);
-        }
-      },
-
-      [this](const rules::Repeat &rule) { apply(*rule.rule); },
-      [this](const rules::Metadata &rule) { apply(*rule.rule); },
-      [this](const rules::CharacterSet &rule) { result.add_set(rule); },
-      [](auto) {}
-    );
-  }
-
-  CharacterSet result;
-};
-
 class LexTableBuilderImpl : public LexTableBuilder {
+  enum ConflictStatus {
+    DoesNotMatch = 0,
+    MatchesShorterStringWithinSeparators = 1 << 0,
+    MatchesSameString = 1 << 1,
+    MatchesLongerString = 1 << 2,
+    MatchesLongerStringWithValidNextChar = 1 << 3,
+  };
+
   LexTable main_lex_table;
   LexTable keyword_lex_table;
   const LexicalGrammar grammar;
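The ConflictStatus values above are bit flags: each status is a distinct power of two, so a single integer can record several verdicts about a pair of tokens at once, and composite masks can test for any of them. A minimal standalone sketch of the pattern (not part of this commit; the main driver and printed message are illustrative only):

// Sketch: how bit-flag statuses combine with '|' and are queried with '&'.
#include <cstdio>

enum ConflictStatus {
  DoesNotMatch = 0,
  MatchesShorterStringWithinSeparators = 1 << 0,
  MatchesSameString = 1 << 1,
  MatchesLongerString = 1 << 2,
  MatchesLongerStringWithValidNextChar = 1 << 3,
};

int main() {
  // One token can conflict with another in several ways at once.
  int status = MatchesSameString | MatchesLongerString;

  // CannotDistinguish (which this commit moves into merge_token_set) is a
  // composite mask; '&' asks whether any of its bits are set in 'status'.
  int cannot_distinguish = MatchesShorterStringWithinSeparators |
                           MatchesSameString |
                           MatchesLongerStringWithValidNextChar;

  if (status & cannot_distinguish)
    std::printf("lexer cannot reliably distinguish these tokens\n");
}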
@@ -117,7 +73,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
   vector<ConflictStatus> conflict_matrix;
   bool conflict_detection_mode;
   LookaheadSet keyword_symbols;
-  Symbol word_rule;
+  Symbol word_token;
   char encoding_buffer[8];
 
  public:
@@ -133,17 +89,15 @@ class LexTableBuilderImpl : public LexTableBuilder {
       parse_table(parse_table),
       conflict_matrix(lexical_grammar.variables.size() * lexical_grammar.variables.size(), DoesNotMatch),
       conflict_detection_mode(false),
-      word_rule(syntax_grammar.word_rule) {
+      word_token(syntax_grammar.word_token) {
 
     // Compute the possible separator rules and the set of separator characters that can occur
     // immediately after any token.
-    StartingCharacterAggregator separator_character_aggregator;
     for (const auto &rule : grammar.separators) {
       separator_rules.push_back(Repeat{rule});
-      separator_character_aggregator.apply(rule);
+      add_starting_characters(&separator_start_characters, rule);
     }
     separator_rules.push_back(Blank{});
-    separator_start_characters = separator_character_aggregator.result;
 
     // Compute the set of characters that each token can start with and the set of non-separator
     // characters that can follow each token. Also identify all of the tokens that can be
@@ -152,19 +106,18 @@ class LexTableBuilderImpl : public LexTableBuilder {
     for (unsigned i = 0, n = grammar.variables.size(); i < n; i++) {
       Symbol token = Symbol::terminal(i);
 
-      StartingCharacterAggregator starting_character_aggregator;
-      starting_character_aggregator.apply(grammar.variables[i].rule);
-      starting_characters_by_token[i] = starting_character_aggregator.result;
+      add_starting_characters(&starting_characters_by_token[i], grammar.variables[i].rule);
 
-      StartingCharacterAggregator following_character_aggregator;
       const auto &following_tokens = following_tokens_by_token.find(token);
       if (following_tokens != following_tokens_by_token.end()) {
         following_tokens->second.for_each([&](Symbol following_token) {
-          following_character_aggregator.apply(grammar.variables[following_token.index].rule);
+          add_starting_characters(
+            &following_characters_by_token[i],
+            grammar.variables[following_token.index].rule
+          );
           return true;
         });
       }
-      following_characters_by_token[i] = following_character_aggregator.result;
     }
     LOG_END();
 
@@ -187,7 +140,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
     }
     LOG_END();
 
-    if (word_rule != rules::NONE()) {
+    if (word_token != rules::NONE()) {
       identify_keywords();
     }
   }
@@ -196,7 +149,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
     LookaheadSet homonyms;
     for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
       Symbol other_token = Symbol::terminal(j);
-      if (get_conflict_status(word_rule, other_token) == MatchesSameString) {
+      if (get_conflict_status(word_token, other_token) == MatchesSameString) {
         homonyms.insert(other_token);
       }
     }
@@ -218,9 +171,9 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
     for (Symbol::Index j = 0, n = grammar.variables.size(); j < n; j++) {
       Symbol other_token = Symbol::terminal(j);
-      if (other_token == word_rule || homonyms.contains(other_token)) continue;
-      bool word_rule_shadows_other = get_conflict_status(other_token, word_rule);
-      bool other_shadows_word_rule = get_conflict_status(word_rule, other_token);
+      if (other_token == word_token || homonyms.contains(other_token)) continue;
+      bool word_rule_shadows_other = get_conflict_status(other_token, word_token);
+      bool other_shadows_word_rule = get_conflict_status(word_token, other_token);
 
       if (word_rule_shadows_other || other_shadows_word_rule) {
         homonyms.for_each([&](Symbol homonym) {
@@ -228,7 +181,7 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
           bool word_rule_was_already_present = true;
           for (ParseStateId state_id : coincident_token_index.states_with(homonym, other_token)) {
-            if (!parse_table->states[state_id].has_terminal_entry(word_rule)) {
+            if (!parse_table->states[state_id].has_terminal_entry(word_token)) {
               word_rule_was_already_present = false;
               break;
             }
@@ -238,14 +191,14 @@ class LexTableBuilderImpl : public LexTableBuilder {
           if (word_rule_shadows_other) {
             homonyms.remove(homonym);
             LOG(
-              "remove %s because word_rule would shadow %s",
+              "remove %s because word_token would shadow %s",
               token_name(homonym).c_str(),
               token_name(other_token).c_str()
             );
           } else if (other_shadows_word_rule && !other_shadows_homonym) {
             homonyms.remove(homonym);
             LOG(
-              "remove %s because %s would shadow word_rule",
+              "remove %s because %s would shadow word_token",
               token_name(homonym).c_str(),
               token_name(other_token).c_str()
             );
@@ -274,8 +227,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
     for (ParseState &parse_state : parse_table->states) {
       LookaheadSet token_set;
       for (auto &entry : parse_state.terminal_entries) {
-        if (word_rule.is_terminal() && keyword_symbols.contains(entry.first)) {
-          token_set.insert(word_rule);
+        if (word_token.is_terminal() && keyword_symbols.contains(entry.first)) {
+          token_set.insert(word_token);
         } else {
           token_set.insert(entry.first);
         }
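The hunk above is the keyword-collapsing step: when collecting the tokens a parse state needs the lexer to recognize, every keyword terminal is replaced by the single word_token, so states that differ only in which keywords they expect end up with identical token sets (the keywords themselves are presumably re-verified against keyword_lex_table). A sketch of the substitution, using std::set<int> as a hypothetical stand-in for LookaheadSet and int for Symbol:

#include <set>

// Collapse all keyword symbols in a state's entries into one word token.
std::set<int> token_set_for_state(const std::set<int> &entries,
                                  const std::set<int> &keyword_symbols,
                                  int word_token) {
  std::set<int> token_set;
  for (int symbol : entries) {
    if (keyword_symbols.count(symbol))
      token_set.insert(word_token);  // any keyword -> the word token
    else
      token_set.insert(symbol);
  }
  return token_set;
}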
@@ -304,11 +257,12 @@ class LexTableBuilderImpl : public LexTableBuilder {
 
     mark_fragile_tokens();
     remove_duplicate_lex_states(main_lex_table);
-    return {main_lex_table, keyword_lex_table, word_rule};
+    return {main_lex_table, keyword_lex_table, word_token};
   }
 
   bool does_token_shadow_other(Symbol token, Symbol shadowed_token) const {
-    if (token == word_rule && keyword_symbols.contains(shadowed_token)) return false;
+    if (keyword_symbols.contains(shadowed_token) &&
+        (keyword_symbols.contains(token) || token == word_token)) return false;
     return get_conflict_status(shadowed_token, token) & (
       MatchesShorterStringWithinSeparators |
       MatchesLongerStringWithValidNextChar
@@ -316,9 +270,11 @@ class LexTableBuilderImpl : public LexTableBuilder {
   }
 
   bool does_token_match_same_string_as_other(Symbol token, Symbol shadowed_token) const {
+    if (shadowed_token == word_token && keyword_symbols.contains(token)) return false;
     return get_conflict_status(shadowed_token, token) & MatchesSameString;
   }
 
+ private:
   ConflictStatus get_conflict_status(Symbol shadowed_token, Symbol other_token) const {
     if (shadowed_token.is_built_in() ||
         other_token.is_built_in() ||
@@ -328,7 +284,6 @@ class LexTableBuilderImpl : public LexTableBuilder {
     return conflict_matrix[index];
   }
 
- private:
   bool record_conflict(Symbol shadowed_token, Symbol other_token, ConflictStatus status) {
     if (!conflict_detection_mode) return false;
     unsigned index = shadowed_token.index * grammar.variables.size() + other_token.index;
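record_conflict and get_conflict_status address conflict_matrix as a dense n-by-n matrix flattened into a single vector, with n = grammar.variables.size(). A standalone sketch of the indexing scheme (names here are hypothetical):

#include <vector>

// Entry (shadowed, other) of an n-by-n row-major matrix lives at
// shadowed * n + other, exactly as conflict_matrix is indexed above.
int status_at(const std::vector<int> &matrix, unsigned n,
              unsigned shadowed_index, unsigned other_index) {
  return matrix[shadowed_index * n + other_index];
}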
@@ -462,6 +417,12 @@ class LexTableBuilderImpl : public LexTableBuilder {
   }
 
   bool merge_token_set(LookaheadSet *left, const LookaheadSet &right) const {
+    auto CannotDistinguish = (
+      MatchesShorterStringWithinSeparators |
+      MatchesSameString |
+      MatchesLongerStringWithValidNextChar
+    );
+
     bool is_compatible = true;
 
     left->for_each_difference(right, [&](bool in_left, Symbol different_symbol) {
@@ -579,6 +540,34 @@ class LexTableBuilderImpl : public LexTableBuilder {
     return result;
   }
 
+  static void add_starting_characters(CharacterSet *characters, const Rule &rule) {
+    rule.match(
+      [characters](const Seq &sequence) {
+        add_starting_characters(characters, *sequence.left);
+      },
+
+      [characters](const rules::Choice &rule) {
+        for (const auto &element : rule.elements) {
+          add_starting_characters(characters, element);
+        }
+      },
+
+      [characters](const rules::Repeat &rule) {
+        add_starting_characters(characters, *rule.rule);
+      },
+
+      [characters](const rules::Metadata &rule) {
+        add_starting_characters(characters, *rule.rule);
+      },
+
+      [characters](const rules::CharacterSet &rule) {
+        characters->add_set(rule);
+      },
+
+      [](auto) {}
+    );
+  }
+
   vector<Rule> rules_for_symbol(const rules::Symbol &symbol) {
     if (symbol == rules::END_OF_INPUT()) {
       return { CharacterSet().include(0) };
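The new add_starting_characters helper (which replaces the StartingCharacterAggregator class) walks a rule tree and accumulates every character the rule could begin with, much like a FIRST-set computation: a sequence contributes only its left side, while choices and repeats recurse into all their parts. A simplified sketch over a hypothetical stand-in rule type (the real code dispatches on tree-sitter's rules::Rule variant via Rule::match, and also descends through Metadata wrappers):

#include <set>
#include <vector>

struct MiniRule {
  enum Kind { Seq, Choice, Repeat, Chars, Blank } kind = Blank;
  std::vector<MiniRule> children;  // Seq/Repeat: children[0] is the sub-rule
  std::set<char> chars;            // payload for Chars
};

// Accumulate every character 'rule' can start with into *out.
void add_starting_characters(std::set<char> *out, const MiniRule &rule) {
  switch (rule.kind) {
    case MiniRule::Seq:     // only the leftmost element can start a match
    case MiniRule::Repeat:
      add_starting_characters(out, rule.children[0]);
      break;
    case MiniRule::Choice:  // any alternative can start a match
      for (const MiniRule &child : rule.children)
        add_starting_characters(out, child);
      break;
    case MiniRule::Chars:
      out->insert(rule.chars.begin(), rule.chars.end());
      break;
    case MiniRule::Blank:   // matches the empty string; contributes nothing
      break;
  }
}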
@@ -656,3 +645,16 @@ bool LexTableBuilder::does_token_match_same_string_as_other(Symbol a, Symbol b)
 
 } // namespace build_tables
 } // namespace tree_sitter
+
+namespace std {
+
+using tree_sitter::rules::Symbol;
+
+size_t hash<pair<Symbol::Index, Symbol::Index>>::operator()(
+  const pair<Symbol::Index, Symbol::Index> &p
+) const {
+  hash<Symbol::Index> hasher;
+  return hasher(p.first) ^ hasher(p.second);
+}
+
+} // namespace std
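The std::hash specialization (moved here from the top of the file) pairs with the index swap in states_with: XOR is commutative, so the hash of an index pair is order-insensitive, and the swap normalizes which orientation of the pair is actually stored in entries. A standalone sketch of the combined idea:

#include <cstddef>
#include <functional>
#include <utility>

// Hash an unordered pair of ints: normalize the order first, then combine
// the element hashes with XOR (itself symmetric in its operands), so
// hash_unordered_pair({a, b}) == hash_unordered_pair({b, a}).
std::size_t hash_unordered_pair(std::pair<int, int> p) {
  if (p.first > p.second) std::swap(p.first, p.second);
  std::hash<int> hasher;
  return hasher(p.first) ^ hasher(p.second);
}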
@@ -42,11 +42,13 @@ struct CoincidentTokenIndex {
 
 class LexTableBuilder {
  public:
-  static std::unique_ptr<LexTableBuilder> create(const SyntaxGrammar &,
-                                                 const LexicalGrammar &,
-                                                 const std::unordered_map<rules::Symbol, LookaheadSet> &,
-                                                 const CoincidentTokenIndex &,
-                                                 ParseTable *);
+  static std::unique_ptr<LexTableBuilder> create(
+    const SyntaxGrammar &,
+    const LexicalGrammar &,
+    const std::unordered_map<rules::Symbol, LookaheadSet> &,
+    const CoincidentTokenIndex &,
+    ParseTable *
+  );
 
   struct BuildResult {
     LexTable main_table;
@@ -55,7 +57,6 @@ class LexTableBuilder {
   };
 
   BuildResult build();
 
   bool does_token_shadow_other(rules::Symbol, rules::Symbol) const;
   bool does_token_match_same_string_as_other(rules::Symbol, rules::Symbol) const;
 
@@ -32,7 +32,7 @@ struct InputGrammar {
   std::vector<std::unordered_set<rules::NamedSymbol>> expected_conflicts;
   std::vector<rules::Rule> external_tokens;
   std::unordered_set<rules::NamedSymbol> variables_to_inline;
-  rules::NamedSymbol word_rule;
+  rules::NamedSymbol word_token;
 };
 
 } // namespace tree_sitter
@@ -368,7 +368,7 @@ ParseGrammarResult parse_grammar(const string &input) {
       goto error;
     }
 
-    grammar.word_rule = NamedSymbol { word_rule_json.u.string.ptr };
+    grammar.word_token = NamedSymbol { word_rule_json.u.string.ptr };
   }
 
   json_value_free(grammar_json);
@@ -106,7 +106,7 @@ InitialSyntaxGrammar expand_repeats(const InitialSyntaxGrammar &grammar) {
     expander.aux_rules.end()
   );
 
-  result.word_rule = grammar.word_rule;
+  result.word_token = grammar.word_token;
   return result;
 }
@@ -329,8 +329,8 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
     }
   }
 
-  syntax_grammar.word_rule = symbol_replacer.replace_symbol(grammar.word_rule);
-  if (syntax_grammar.word_rule.is_non_terminal()) {
+  syntax_grammar.word_token = symbol_replacer.replace_symbol(grammar.word_token);
+  if (syntax_grammar.word_token.is_non_terminal()) {
     return make_tuple(
       syntax_grammar,
       lexical_grammar,
@@ -161,7 +161,7 @@ pair<SyntaxGrammar, CompileError> flatten_grammar(const InitialSyntaxGrammar &gr
     i++;
   }
 
-  result.word_rule = grammar.word_rule;
+  result.word_token = grammar.word_token;
 
   return {result, CompileError::none()};
 }
@@ -17,7 +17,7 @@ struct InitialSyntaxGrammar {
   std::set<std::set<rules::Symbol>> expected_conflicts;
   std::vector<ExternalToken> external_tokens;
   std::set<rules::Symbol> variables_to_inline;
-  rules::Symbol word_rule;
+  rules::Symbol word_token;
 };
 
 } // namespace prepare_grammar
@@ -166,7 +166,7 @@ pair<InternedGrammar, CompileError> intern_symbols(const InputGrammar &grammar)
     }
   }
 
-  result.word_rule = interner.intern_symbol(grammar.word_rule);
+  result.word_token = interner.intern_symbol(grammar.word_token);
 
   return {result, CompileError::none()};
 }
@@ -16,7 +16,7 @@ struct InternedGrammar {
   std::set<std::set<rules::Symbol>> expected_conflicts;
   std::vector<Variable> external_tokens;
   std::set<rules::Symbol> variables_to_inline;
-  rules::Symbol word_rule;
+  rules::Symbol word_token;
 };
 
 } // namespace prepare_grammar
@@ -60,7 +60,7 @@ struct SyntaxGrammar {
   std::set<std::set<rules::Symbol>> expected_conflicts;
   std::vector<ExternalToken> external_tokens;
   std::set<rules::Symbol> variables_to_inline;
-  rules::Symbol word_rule;
+  rules::Symbol word_token;
 };
 
 } // namespace tree_sitter