In lexer, prefer tokens to skipped separator characters

This was causing newlines in Go and JavaScript to be parsed as
meaningless separator characters instead of statement terminators.
This commit is contained in:
Max Brunsfeld 2014-05-30 13:29:54 -07:00
parent 220e081c49
commit e93e254518
26 changed files with 5559 additions and 6650 deletions

View file

@ -70,7 +70,9 @@ namespace tree_sitter {
CharacterSet rule = transition.first;
LexItemSet new_item_set = transition.second;
LexStateId new_state_id = add_lex_state(new_item_set);
lex_table.state(state_id).actions[rule] = LexAction::Advance(new_state_id);
auto action = LexAction::Advance(new_state_id, precedence_values_for_item_set(new_item_set));
if (conflict_manager.resolve_lex_action(lex_table.state(state_id).default_action, action))
lex_table.state(state_id).actions[rule] = action;
}
}
@ -93,14 +95,24 @@ namespace tree_sitter {
rules::rule_ptr after_separators(rules::rule_ptr rule) {
return rules::Seq::Build({
make_shared<rules::Repeat>(CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
make_shared<rules::Metadata>(make_shared<rules::Blank>(), map<rules::MetadataKey, int>({
make_shared<rules::Metadata>(rules::Seq::Build({
make_shared<rules::Repeat>(CharacterSet({ ' ', '\t', '\n', '\r' }).copy()),
make_shared<rules::Blank>(),
}), map<rules::MetadataKey, int>({
{rules::START_TOKEN, 1},
{rules::PRECEDENCE, -1},
})),
rule
rule,
});
}
// Gathers the precedence of every item in `item_set` into a set, so
// items that share a precedence contribute a single entry.
set<int> precedence_values_for_item_set(const LexItemSet &item_set) const {
    set<int> precedences;
    for (const auto &lex_item : item_set) {
        precedences.insert(lex_item.precedence());
    }
    return precedences;
}
public:
LexTableBuilder(ParseTable *parse_table, const PreparedGrammar &lex_grammar) :
lex_grammar(lex_grammar),

View file

@ -58,7 +58,7 @@ namespace tree_sitter {
}
}
}
void add_ubiquitous_token_actions(const ParseItemSet &item_set, ParseStateId state_id) {
for (const Symbol &symbol : grammar.options.ubiquitous_tokens) {
auto &actions = parse_table.states[state_id].actions;

View file

@ -1,5 +1,6 @@
#include "compiler/build_tables/get_metadata.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
namespace tree_sitter {
namespace build_tables {
@ -11,6 +12,13 @@ namespace tree_sitter {
return rule->value_for(metadata_key);
}
// TODO -
// Remove this overload. It exists only so that the rule generated by
// `LexTableBuilder::after_separators` ends up with the right precedence:
// for a Seq, the metadata value is taken from its left-hand member alone.
int apply_to(const rules::Seq *rule) {
    const auto &leading = rule->left;
    return apply(leading);
}
public:
// Stores the metadata key whose value this object reads from rules
// (via `value_for(metadata_key)`).
explicit GetMetadata(rules::MetadataKey key) : metadata_key(key) {}
};

View file

@ -18,17 +18,45 @@ namespace tree_sitter {
bool LexConflictManager::resolve_lex_action(const LexAction &old_action,
const LexAction &new_action) {
if (new_action.type < old_action.type)
return !resolve_lex_action(new_action, old_action);
switch (old_action.type) {
case LexActionTypeError:
return true;
case LexActionTypeAccept:
if (new_action.precedence > old_action.precedence) {
return true;
} else if (new_action.precedence < old_action.precedence) {
return false;
} else {
return new_action.symbol.index < old_action.symbol.index;
case LexActionTypeAccept: {
int old_precedence = *old_action.precedence_values.begin();
switch (new_action.type) {
case LexActionTypeAccept: {
int new_precedence = *new_action.precedence_values.begin();
if (new_precedence > old_precedence) {
return true;
} else if (new_precedence < old_precedence) {
return false;
} else {
return new_action.symbol.index < old_action.symbol.index;
}
}
case LexActionTypeAdvance: {
// int min_precedence = *new_action.precedence_values.begin();
int max_precedence = *new_action.precedence_values.rbegin();
if (max_precedence > old_precedence) {
// if (min_precedence < old_precedence)
return true;
} else if (max_precedence < old_precedence) {
return false;
} else {
return true;
}
return false;
}
default:
return false;
}
return true;
}
default:
return false;
}

View file

@ -88,7 +88,7 @@ namespace tree_sitter {
// Picks the grammar a symbol belongs to: the lexical grammar for token
// symbols, the syntax grammar for everything else.
const PreparedGrammar & grammar_for_symbol(const rules::Symbol &symbol) {
    if (symbol.is_token())
        return lexical_grammar;
    return syntax_grammar;
}
string sanitize_name(string name) {
auto existing = sanitized_names.find(name);
if (existing != sanitized_names.end())
@ -115,7 +115,7 @@ namespace tree_sitter {
}
}
}
bool has_sanitized_name(string name) {
for (auto &pair : sanitized_names)
if (pair.second == name)

View file

@ -48,7 +48,7 @@ namespace tree_sitter {
// Constructs a GrammarError, storing the given error type and message.
GrammarError::GrammarError(GrammarErrorType type, std::string message) :
type(type),
message(message) {}
// Two errors are equal when both their type and their message match.
bool GrammarError::operator==(const GrammarError &other) const {
    if (type != other.type)
        return false;
    return message == other.message;
}

View file

@ -12,24 +12,25 @@ namespace tree_sitter {
LexAction::LexAction() :
type(LexActionTypeError),
symbol(Symbol(-1)),
state_index(-1) {}
state_index(-1),
precedence_values({0}) {}
LexAction::LexAction(LexActionType type, size_t state_index, Symbol symbol, int precedence) :
LexAction::LexAction(LexActionType type, size_t state_index, Symbol symbol, set<int> precedence_values) :
type(type),
symbol(symbol),
state_index(state_index),
precedence(precedence) {}
precedence_values(precedence_values) {}
LexAction LexAction::Error() {
return LexAction(LexActionTypeError, -1, Symbol(-1), 0);
return LexAction(LexActionTypeError, -1, Symbol(-1), {0});
}
LexAction LexAction::Advance(size_t state_index) {
return LexAction(LexActionTypeAdvance, state_index, Symbol(-1), 0);
LexAction LexAction::Advance(size_t state_index, set<int> precedence_values) {
return LexAction(LexActionTypeAdvance, state_index, Symbol(-1), precedence_values);
}
LexAction LexAction::Accept(Symbol symbol, int precedence) {
return LexAction(LexActionTypeAccept, -1, symbol, precedence);
return LexAction(LexActionTypeAccept, -1, symbol, { precedence });
}
bool LexAction::operator==(const LexAction &other) const {

View file

@ -16,18 +16,18 @@ namespace tree_sitter {
} LexActionType;
class LexAction {
LexAction(LexActionType type, size_t state_index, rules::Symbol symbol, int precedence);
LexAction(LexActionType type, size_t state_index, rules::Symbol symbol, std::set<int> precedence_values);
public:
LexAction();
static LexAction Accept(rules::Symbol symbol, int precedence);
static LexAction Error();
static LexAction Advance(size_t state_index);
static LexAction Advance(size_t state_index, std::set<int> precedence_values);
bool operator==(const LexAction &action) const;
LexActionType type;
rules::Symbol symbol;
size_t state_index;
int precedence;
std::set<int> precedence_values;
};
std::ostream& operator<<(std::ostream &stream, const LexAction &item);

View file

@ -19,7 +19,7 @@ namespace tree_sitter {
using rules::rule_ptr;
using rules::String;
using rules::Pattern;
namespace prepare_grammar {
class ExpandTokens : public rules::IdentityRuleFn {
using rules::IdentityRuleFn::apply_to;
@ -30,38 +30,38 @@ namespace tree_sitter {
elements.push_back(rules::CharacterSet({ val }).copy());
return rules::Seq::Build(elements);
}
// Replaces a Pattern rule with the rule produced by parsing its regex.
// Only the first parse error seen is kept in `error`; later ones do not
// overwrite it. The parsed rule is returned either way.
rule_ptr apply_to(const Pattern *rule) {
    auto parsed = parse_regex(rule->value);
    if (error == nullptr)
        error = parsed.second;
    return parsed.first;
}
public:
// First error returned by parse_regex while expanding Pattern rules, or
// nullptr if none has been recorded.
const GrammarError *error;
// Starts with no recorded error.
ExpandTokens() : error(nullptr) {}
};
// Runs an ExpandTokens pass over every rule and auxiliary rule of `grammar`.
//
// On success, returns a PreparedGrammar built from the expanded rules, the
// expanded auxiliary rules and the original options, paired with nullptr.
// As soon as the expander records an error, returns a default-constructed
// PreparedGrammar paired with that error.
pair<PreparedGrammar, const GrammarError *>
expand_tokens(const PreparedGrammar &grammar) {
    ExpandTokens expander;
    vector<pair<string, rule_ptr>> rules, aux_rules;

    for (auto &entry : grammar.rules) {
        auto expanded = expander.apply(entry.second);
        if (expander.error) {
            return { PreparedGrammar(), expander.error };
        }
        rules.push_back({ entry.first, expanded });
    }

    for (auto &aux_entry : grammar.aux_rules) {
        auto expanded = expander.apply(aux_entry.second);
        if (expander.error) {
            return { PreparedGrammar(), expander.error };
        }
        aux_rules.push_back({ aux_entry.first, expanded });
    }

    return { PreparedGrammar(rules, aux_rules, grammar.options), nullptr };
}
}

View file

@ -5,7 +5,7 @@
namespace tree_sitter {
class PreparedGrammar;
namespace prepare_grammar {
std::pair<PreparedGrammar, const GrammarError *>
expand_tokens(const PreparedGrammar &);

View file

@ -57,7 +57,7 @@ namespace tree_sitter {
// Initializes the inliner with the map of symbol replacements.
SymbolInliner(const map<Symbol, Symbol> &replacements) : replacements(replacements) {}
};
// Symbol option formed by OR-ing the Token and Auxiliary options together.
const rules::SymbolOption SymbolOptionAuxToken = rules::SymbolOption(rules::SymbolOptionToken|rules::SymbolOptionAuxiliary);
class TokenExtractor : public rules::IdentityRuleFn {
@ -71,7 +71,7 @@ namespace tree_sitter {
return make_shared<Symbol>(index, SymbolOptionAuxToken);
}
rule_ptr default_apply(const rules::Rule *rule) {
auto result = rule->copy();
if (IsToken().apply(rule->copy())) {
@ -80,7 +80,7 @@ namespace tree_sitter {
return result;
}
}
rule_ptr apply_to(const rules::Metadata *rule) {
auto result = rule->copy();
if (IsToken().apply(rule->copy())) {

View file

@ -21,7 +21,7 @@ namespace tree_sitter {
using rules::Repeat;
using rules::CharacterRange;
using rules::blank;
namespace prepare_grammar {
class PatternParser {
public:
@ -29,7 +29,7 @@ namespace tree_sitter {
input(input),
length(input.length()),
position(0) {}
pair<rule_ptr, const GrammarError *> rule(bool nested) {
vector<rule_ptr> choices = {};
do {
@ -47,7 +47,7 @@ namespace tree_sitter {
auto rule = (choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
return { rule, nullptr };
}
private:
pair<rule_ptr, const GrammarError *> term(bool nested) {
rule_ptr result = blank();
@ -63,7 +63,7 @@ namespace tree_sitter {
} while (has_more_input());
return { result, nullptr };
}
pair<rule_ptr, const GrammarError *> factor() {
auto pair = atom();
if (pair.second)
@ -87,7 +87,7 @@ namespace tree_sitter {
}
return { result, nullptr };
}
pair<rule_ptr, const GrammarError *> atom() {
switch (peek()) {
case '(': {
@ -103,7 +103,7 @@ namespace tree_sitter {
case '[': {
next();
auto pair = char_set();
if (pair.second)
if (pair.second)
return { blank(), pair.second };
if (peek() != ']')
return error("unmatched open square bracket");
@ -122,13 +122,13 @@ namespace tree_sitter {
}
default: {
auto pair = single_char();
if (pair.second)
if (pair.second)
return { blank(), pair.second };
return { pair.first.copy(), nullptr };
}
}
}
pair<CharacterSet, const GrammarError *> char_set() {
bool is_affirmative = true;
if (peek() == '^') {
@ -146,7 +146,7 @@ namespace tree_sitter {
result = result.complement();
return { result, nullptr };
}
pair<CharacterSet, const GrammarError *> single_char() {
CharacterSet value;
switch (peek()) {
@ -168,7 +168,7 @@ namespace tree_sitter {
}
return { value, nullptr };
}
CharacterSet escaped_char(char value) {
switch (value) {
case 'a':
@ -181,23 +181,23 @@ namespace tree_sitter {
return CharacterSet({ value });
}
}
// Moves the read position forward by one character.
void next() {
    position += 1;
}
// Returns the character at the current read position without advancing.
char peek() {
    const char current = input[position];
    return current;
}
// True while the read position is still before the end of the input.
bool has_more_input() {
    return length > position;
}
// Builds a failure result: a blank rule paired with a newly allocated
// regex-type GrammarError carrying `msg`.
pair<rule_ptr, const GrammarError *> error(string msg) {
    // Keep the original evaluation order: blank rule first, then the error.
    rule_ptr empty_rule = blank();
    const GrammarError *failure = new GrammarError(GrammarErrorTypeRegex, msg);
    return { empty_rule, failure };
}
const string input;
const size_t length;
size_t position;

View file

@ -24,10 +24,10 @@ namespace tree_sitter {
auto expand_tokens_result = expand_tokens(grammars.second);
const PreparedGrammar &lex_grammar = expand_tokens_result.first;
error = expand_tokens_result.second;
if (error)
return make_tuple(PreparedGrammar(), PreparedGrammar(), error);
return make_tuple(rule_grammar, lex_grammar, nullptr);
}
}

View file

@ -25,11 +25,11 @@ namespace tree_sitter {
// A Metadata wrapper is rendered as the string of the rule it wraps.
string apply_to(const rules::Metadata *rule) {
    const auto &wrapped = rule->rule;
    return apply(wrapped);
}
// Renders a Seq as "(seq <left> <right>)".
string apply_to(const rules::Seq *rule) {
    string rendered = "(seq ";
    rendered += apply(rule->left);
    rendered += " ";
    rendered += apply(rule->right);
    rendered += ")";
    return rendered;
}
string apply_to(const rules::Choice *rule) {
string result = "(choice";
for (auto &element : rule->elements)

View file

@ -10,7 +10,7 @@ namespace tree_sitter {
using std::ostream;
using rules::rule_ptr;
using rules::Symbol;
// Default constructor: initializes Grammar, aux_rules and options each
// from an empty braced list.
PreparedGrammar::PreparedGrammar() : Grammar({}), aux_rules({}), options({}) {}
PreparedGrammar::PreparedGrammar(const std::vector<std::pair<std::string, rules::rule_ptr>> &rules,