In lexer, prefer tokens to skipped separator characters
This was causing newlines in Go and JavaScript to be parsed as meaningless separator characters instead of statement terminators.
This commit is contained in:
parent
220e081c49
commit
e93e254518
26 changed files with 5559 additions and 6650 deletions
|
|
@ -70,7 +70,9 @@ namespace tree_sitter {
|
|||
CharacterSet rule = transition.first;
|
||||
LexItemSet new_item_set = transition.second;
|
||||
LexStateId new_state_id = add_lex_state(new_item_set);
|
||||
lex_table.state(state_id).actions[rule] = LexAction::Advance(new_state_id);
|
||||
auto action = LexAction::Advance(new_state_id, precedence_values_for_item_set(new_item_set));
|
||||
if (conflict_manager.resolve_lex_action(lex_table.state(state_id).default_action, action))
|
||||
lex_table.state(state_id).actions[rule] = action;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -93,14 +95,24 @@ namespace tree_sitter {
|
|||
|
||||
rules::rule_ptr after_separators(rules::rule_ptr rule) {
|
||||
return rules::Seq::Build({
|
||||
make_shared<rules::Repeat>(CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
|
||||
make_shared<rules::Metadata>(make_shared<rules::Blank>(), map<rules::MetadataKey, int>({
|
||||
make_shared<rules::Metadata>(rules::Seq::Build({
|
||||
make_shared<rules::Repeat>(CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
|
||||
make_shared<rules::Blank>(),
|
||||
}), map<rules::MetadataKey, int>({
|
||||
{rules::START_TOKEN, 1},
|
||||
{rules::PRECEDENCE, -1},
|
||||
})),
|
||||
rule
|
||||
rule,
|
||||
});
|
||||
}
|
||||
|
||||
set<int> precedence_values_for_item_set(const LexItemSet &item_set) const {
|
||||
set<int> result;
|
||||
for (const auto &item : item_set)
|
||||
result.insert(item.precedence());
|
||||
return result;
|
||||
}
|
||||
|
||||
public:
|
||||
LexTableBuilder(ParseTable *parse_table, const PreparedGrammar &lex_grammar) :
|
||||
lex_grammar(lex_grammar),
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ namespace tree_sitter {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
|
||||
for (const Symbol &symbol : grammar.options.ubiquitous_tokens) {
|
||||
auto &actions = parse_table.states[state_id].actions;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "compiler/build_tables/get_metadata.h"
|
||||
#include "compiler/rules/visitor.h"
|
||||
#include "compiler/rules/seq.h"
|
||||
|
||||
namespace tree_sitter {
|
||||
namespace build_tables {
|
||||
|
|
@ -11,6 +12,13 @@ namespace tree_sitter {
|
|||
return rule->value_for(metadata_key);
|
||||
}
|
||||
|
||||
// TODO -
|
||||
// Remove this. It is currently needed to make the rule generated
|
||||
// by `LexTableBuilder::after_separators` have the right precedence.
|
||||
int apply_to(const rules::Seq *rule) {
|
||||
return apply(rule->left);
|
||||
}
|
||||
|
||||
public:
|
||||
explicit GetMetadata(rules::MetadataKey key) : metadata_key(key) {}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -18,17 +18,45 @@ namespace tree_sitter {
|
|||
|
||||
bool LexConflictManager::resolve_lex_action(const LexAction &old_action,
|
||||
const LexAction &new_action) {
|
||||
if (new_action.type < old_action.type)
|
||||
return !resolve_lex_action(new_action, old_action);
|
||||
|
||||
switch (old_action.type) {
|
||||
case LexActionTypeError:
|
||||
return true;
|
||||
case LexActionTypeAccept:
|
||||
if (new_action.precedence > old_action.precedence) {
|
||||
return true;
|
||||
} else if (new_action.precedence < old_action.precedence) {
|
||||
return false;
|
||||
} else {
|
||||
return new_action.symbol.index < old_action.symbol.index;
|
||||
case LexActionTypeAccept: {
|
||||
int old_precedence = *old_action.precedence_values.begin();
|
||||
switch (new_action.type) {
|
||||
case LexActionTypeAccept: {
|
||||
int new_precedence = *new_action.precedence_values.begin();
|
||||
if (new_precedence > old_precedence) {
|
||||
return true;
|
||||
} else if (new_precedence < old_precedence) {
|
||||
return false;
|
||||
} else {
|
||||
return new_action.symbol.index < old_action.symbol.index;
|
||||
}
|
||||
}
|
||||
case LexActionTypeAdvance: {
|
||||
// int min_precedence = *new_action.precedence_values.begin();
|
||||
int max_precedence = *new_action.precedence_values.rbegin();
|
||||
if (max_precedence > old_precedence) {
|
||||
// if (min_precedence < old_precedence)
|
||||
return true;
|
||||
} else if (max_precedence < old_precedence) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ namespace tree_sitter {
|
|||
const PreparedGrammar & grammar_for_symbol(const rules::Symbol &symbol) {
|
||||
return symbol.is_token() ? lexical_grammar : syntax_grammar;
|
||||
}
|
||||
|
||||
|
||||
string sanitize_name(string name) {
|
||||
auto existing = sanitized_names.find(name);
|
||||
if (existing != sanitized_names.end())
|
||||
|
|
@ -115,7 +115,7 @@ namespace tree_sitter {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool has_sanitized_name(string name) {
|
||||
for (auto &pair : sanitized_names)
|
||||
if (pair.second == name)
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ namespace tree_sitter {
|
|||
GrammarError::GrammarError(GrammarErrorType type, std::string message) :
|
||||
type(type),
|
||||
message(message) {}
|
||||
|
||||
|
||||
bool GrammarError::operator==(const GrammarError &other) const {
|
||||
return type == other.type && message == other.message;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,24 +12,25 @@ namespace tree_sitter {
|
|||
LexAction::LexAction() :
|
||||
type(LexActionTypeError),
|
||||
symbol(Symbol(-1)),
|
||||
state_index(-1) {}
|
||||
state_index(-1),
|
||||
precedence_values({0}) {}
|
||||
|
||||
LexAction::LexAction(LexActionType type, size_t state_index, Symbol symbol, int precedence) :
|
||||
LexAction::LexAction(LexActionType type, size_t state_index, Symbol symbol, set<int> precedence_values) :
|
||||
type(type),
|
||||
symbol(symbol),
|
||||
state_index(state_index),
|
||||
precedence(precedence) {}
|
||||
precedence_values(precedence_values) {}
|
||||
|
||||
LexAction LexAction::Error() {
|
||||
return LexAction(LexActionTypeError, -1, Symbol(-1), 0);
|
||||
return LexAction(LexActionTypeError, -1, Symbol(-1), {0});
|
||||
}
|
||||
|
||||
LexAction LexAction::Advance(size_t state_index) {
|
||||
return LexAction(LexActionTypeAdvance, state_index, Symbol(-1), 0);
|
||||
LexAction LexAction::Advance(size_t state_index, set<int> precedence_values) {
|
||||
return LexAction(LexActionTypeAdvance, state_index, Symbol(-1), precedence_values);
|
||||
}
|
||||
|
||||
LexAction LexAction::Accept(Symbol symbol, int precedence) {
|
||||
return LexAction(LexActionTypeAccept, -1, symbol, precedence);
|
||||
return LexAction(LexActionTypeAccept, -1, symbol, { precedence });
|
||||
}
|
||||
|
||||
bool LexAction::operator==(const LexAction &other) const {
|
||||
|
|
|
|||
|
|
@ -16,18 +16,18 @@ namespace tree_sitter {
|
|||
} LexActionType;
|
||||
|
||||
class LexAction {
|
||||
LexAction(LexActionType type, size_t state_index, rules::Symbol symbol, int precedence);
|
||||
LexAction(LexActionType type, size_t state_index, rules::Symbol symbol, std::set<int> precedence_values);
|
||||
public:
|
||||
LexAction();
|
||||
static LexAction Accept(rules::Symbol symbol, int precedence);
|
||||
static LexAction Error();
|
||||
static LexAction Advance(size_t state_index);
|
||||
static LexAction Advance(size_t state_index, std::set<int> precedence_values);
|
||||
bool operator==(const LexAction &action) const;
|
||||
|
||||
LexActionType type;
|
||||
rules::Symbol symbol;
|
||||
size_t state_index;
|
||||
int precedence;
|
||||
std::set<int> precedence_values;
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream &stream, const LexAction &item);
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ namespace tree_sitter {
|
|||
using rules::rule_ptr;
|
||||
using rules::String;
|
||||
using rules::Pattern;
|
||||
|
||||
|
||||
namespace prepare_grammar {
|
||||
class ExpandTokens : public rules::IdentityRuleFn {
|
||||
using rules::IdentityRuleFn::apply_to;
|
||||
|
|
@ -30,38 +30,38 @@ namespace tree_sitter {
|
|||
elements.push_back(rules::CharacterSet({ val }).copy());
|
||||
return rules::Seq::Build(elements);
|
||||
}
|
||||
|
||||
|
||||
rule_ptr apply_to(const Pattern *rule) {
|
||||
auto pair = parse_regex(rule->value);
|
||||
if (!error)
|
||||
error = pair.second;
|
||||
return pair.first;
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
const GrammarError *error;
|
||||
ExpandTokens() : error(nullptr) {}
|
||||
};
|
||||
|
||||
|
||||
pair<PreparedGrammar, const GrammarError *>
|
||||
expand_tokens(const PreparedGrammar &grammar) {
|
||||
vector<pair<string, rule_ptr>> rules, aux_rules;
|
||||
ExpandTokens expander;
|
||||
|
||||
|
||||
for (auto &pair : grammar.rules) {
|
||||
auto rule = expander.apply(pair.second);
|
||||
if (expander.error)
|
||||
return { PreparedGrammar(), expander.error };
|
||||
rules.push_back({ pair.first, rule });
|
||||
}
|
||||
|
||||
|
||||
for (auto &pair : grammar.aux_rules) {
|
||||
auto rule = expander.apply(pair.second);
|
||||
if (expander.error)
|
||||
return { PreparedGrammar(), expander.error };
|
||||
aux_rules.push_back({ pair.first, rule });
|
||||
}
|
||||
|
||||
|
||||
return { PreparedGrammar(rules, aux_rules, grammar.options), nullptr };
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
namespace tree_sitter {
|
||||
class PreparedGrammar;
|
||||
|
||||
|
||||
namespace prepare_grammar {
|
||||
std::pair<PreparedGrammar, const GrammarError *>
|
||||
expand_tokens(const PreparedGrammar &);
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ namespace tree_sitter {
|
|||
|
||||
SymbolInliner(const map<Symbol, Symbol> &replacements) : replacements(replacements) {}
|
||||
};
|
||||
|
||||
|
||||
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary);
|
||||
|
||||
class TokenExtractor : public rules::IdentityRuleFn {
|
||||
|
|
@ -71,7 +71,7 @@ namespace tree_sitter {
|
|||
return make_shared<Symbol>(index, SymbolOptionAuxToken);
|
||||
|
||||
}
|
||||
|
||||
|
||||
rule_ptr default_apply(const rules::Rule *rule) {
|
||||
auto result = rule->copy();
|
||||
if (IsToken().apply(rule->copy())) {
|
||||
|
|
@ -80,7 +80,7 @@ namespace tree_sitter {
|
|||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
rule_ptr apply_to(const rules::Metadata *rule) {
|
||||
auto result = rule->copy();
|
||||
if (IsToken().apply(rule->copy())) {
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ namespace tree_sitter {
|
|||
using rules::Repeat;
|
||||
using rules::CharacterRange;
|
||||
using rules::blank;
|
||||
|
||||
|
||||
namespace prepare_grammar {
|
||||
class PatternParser {
|
||||
public:
|
||||
|
|
@ -29,7 +29,7 @@ namespace tree_sitter {
|
|||
input(input),
|
||||
length(input.length()),
|
||||
position(0) {}
|
||||
|
||||
|
||||
pair<rule_ptr, const GrammarError *> rule(bool nested) {
|
||||
vector<rule_ptr> choices = {};
|
||||
do {
|
||||
|
|
@ -47,7 +47,7 @@ namespace tree_sitter {
|
|||
auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
|
||||
return { rule, nullptr };
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
pair<rule_ptr, const GrammarError *> term(bool nested) {
|
||||
rule_ptr result = blank();
|
||||
|
|
@ -63,7 +63,7 @@ namespace tree_sitter {
|
|||
} while (has_more_input());
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
|
||||
pair<rule_ptr, const GrammarError *> factor() {
|
||||
auto pair = atom();
|
||||
if (pair.second)
|
||||
|
|
@ -87,7 +87,7 @@ namespace tree_sitter {
|
|||
}
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
|
||||
pair<rule_ptr, const GrammarError *> atom() {
|
||||
switch (peek()) {
|
||||
case '(': {
|
||||
|
|
@ -103,7 +103,7 @@ namespace tree_sitter {
|
|||
case '[': {
|
||||
next();
|
||||
auto pair = char_set();
|
||||
if (pair.second)
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
if (peek() != ']')
|
||||
return error("unmatched open square bracket");
|
||||
|
|
@ -122,13 +122,13 @@ namespace tree_sitter {
|
|||
}
|
||||
default: {
|
||||
auto pair = single_char();
|
||||
if (pair.second)
|
||||
if (pair.second)
|
||||
return { blank(), pair.second };
|
||||
return { pair.first.copy(), nullptr };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pair<CharacterSet, const GrammarError *> char_set() {
|
||||
bool is_affirmative = true;
|
||||
if (peek() == '^') {
|
||||
|
|
@ -146,7 +146,7 @@ namespace tree_sitter {
|
|||
result = result.complement();
|
||||
return { result, nullptr };
|
||||
}
|
||||
|
||||
|
||||
pair<CharacterSet, const GrammarError *> single_char() {
|
||||
CharacterSet value;
|
||||
switch (peek()) {
|
||||
|
|
@ -168,7 +168,7 @@ namespace tree_sitter {
|
|||
}
|
||||
return { value, nullptr };
|
||||
}
|
||||
|
||||
|
||||
CharacterSet escaped_char(char value) {
|
||||
switch (value) {
|
||||
case 'a':
|
||||
|
|
@ -181,23 +181,23 @@ namespace tree_sitter {
|
|||
return CharacterSet({ value });
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void next() {
|
||||
position++;
|
||||
}
|
||||
|
||||
|
||||
char peek() {
|
||||
return input[position];
|
||||
}
|
||||
|
||||
|
||||
bool has_more_input() {
|
||||
return position < length;
|
||||
}
|
||||
|
||||
|
||||
pair<rule_ptr, const GrammarError *> error(string msg) {
|
||||
return { blank(), new GrammarError(GrammarErrorTypeRegex, msg) };
|
||||
}
|
||||
|
||||
|
||||
const string input;
|
||||
const size_t length;
|
||||
size_t position;
|
||||
|
|
|
|||
|
|
@ -24,10 +24,10 @@ namespace tree_sitter {
|
|||
auto expand_tokens_result = expand_tokens(grammars.second);
|
||||
const PreparedGrammar &lex_grammar = expand_tokens_result.first;
|
||||
error = expand_tokens_result.second;
|
||||
|
||||
|
||||
if (error)
|
||||
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
|
||||
|
||||
|
||||
return make_tuple(rule_grammar, lex_grammar, nullptr);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,11 +25,11 @@ namespace tree_sitter {
|
|||
string apply_to(const rules::Metadata *rule) {
|
||||
return apply(rule->rule);
|
||||
}
|
||||
|
||||
|
||||
string apply_to(const rules::Seq *rule) {
|
||||
return "(seq " + apply(rule->left) + " " + apply(rule->right) + ")";
|
||||
}
|
||||
|
||||
|
||||
string apply_to(const rules::Choice *rule) {
|
||||
string result = "(choice";
|
||||
for (auto &element : rule->elements)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ namespace tree_sitter {
|
|||
using std::ostream;
|
||||
using rules::rule_ptr;
|
||||
using rules::Symbol;
|
||||
|
||||
|
||||
PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {}
|
||||
|
||||
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue