Merge pull request #190 from tree-sitter/immediate-tokens

Add immediate token rule for enforcing no preceding extras
Max Brunsfeld 2018-08-01 15:21:42 -07:00 committed by GitHub
commit 1dcbd21bbe
16 changed files with 220 additions and 67 deletions

@@ -21,9 +21,9 @@ fetch_grammar() {
)
}
fetch_grammar javascript master
fetch_grammar javascript immediate-tokens
fetch_grammar json master
fetch_grammar c master
fetch_grammar c immediate-tokens
fetch_grammar cpp master
fetch_grammar python master
fetch_grammar go master

@@ -1,8 +1,8 @@
@echo off
call:fetch_grammar javascript master
call:fetch_grammar javascript immediate-tokens
call:fetch_grammar json master
call:fetch_grammar c master
call:fetch_grammar c immediate-tokens
call:fetch_grammar cpp master
call:fetch_grammar python master
call:fetch_grammar go master

@@ -379,9 +379,14 @@ class LexTableBuilderImpl : public LexTableBuilder {
for (const LexItem &item : item_set.entries) {
LexItem::CompletionStatus completion_status = item.completion_status();
if (completion_status.is_done) {
AcceptTokenAction action(item.lhs, completion_status.precedence.max,
item.lhs.is_built_in() ||
grammar.variables[item.lhs.index].is_string);
AcceptTokenAction action(item.lhs, completion_status.precedence.max);
if (!item.lhs.is_built_in()) {
const LexicalVariable &variable = grammar.variables[item.lhs.index];
if (variable.is_string) action.implicit_precedence += 2;
if (is_immediate_token(variable.rule)) action.implicit_precedence += 1;
}
AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
if (existing_action.is_present()) {
if (should_replace_accept_action(existing_action, action)) {
@@ -458,8 +463,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
void remove_duplicate_lex_states(LexTable &lex_table) {
for (LexState &state : lex_table.states) {
state.accept_action.is_string = false;
state.accept_action.precedence = 0;
state.accept_action.implicit_precedence = 0;
}
map<LexStateId, LexStateId> replacements;
@@ -523,12 +528,24 @@ class LexTableBuilderImpl : public LexTableBuilder {
}
}
bool is_immediate_token(const Rule &rule) const {
return rule.match(
[](const Metadata &metadata) {
return metadata.params.is_main_token;
},
[](auto rule) {
return false;
}
);
}
LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) {
LexItemSet result;
terminals.for_each([&](Symbol symbol) {
if (symbol.is_terminal()) {
for (auto &&rule : rules_for_symbol(symbol)) {
if (with_separators) {
if (with_separators && !is_immediate_token(rule)) {
for (const auto &separator_rule : separator_rules) {
result.entries.insert(LexItem(
symbol,
@@ -598,8 +615,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
const AcceptTokenAction &new_action) {
if (new_action.precedence > old_action.precedence) return true;
if (new_action.precedence < old_action.precedence) return false;
if (new_action.is_string && !old_action.is_string) return true;
if (old_action.is_string && !new_action.is_string) return false;
if (new_action.implicit_precedence > old_action.implicit_precedence) return true;
if (new_action.implicit_precedence < old_action.implicit_precedence) return false;
return new_action.symbol.index < old_action.symbol.index;
}
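To summarize the new tie-breaking in isolation: explicit precedence is compared first, then the new implicit precedence (2 for string tokens, plus 1 for immediate tokens), and finally the lower symbol index wins. The following standalone sketch (simplified placeholder types, not the real tree-sitter classes, and not part of this diff) restates that order:

#include <cassert>

// Simplified stand-in for AcceptTokenAction, used only for this sketch.
struct Action {
  int symbol_index;
  int precedence;           // explicit precedence from the grammar
  int implicit_precedence;  // +2 if the token is a string, +1 if it is an immediate token
};

bool should_replace(const Action &old_action, const Action &new_action) {
  if (new_action.precedence > old_action.precedence) return true;
  if (new_action.precedence < old_action.precedence) return false;
  if (new_action.implicit_precedence > old_action.implicit_precedence) return true;
  if (new_action.implicit_precedence < old_action.implicit_precedence) return false;
  return new_action.symbol_index < old_action.symbol_index;
}

int main() {
  Action pattern_token{0, 0, 0};           // e.g. an identifier pattern
  Action string_token{1, 0, 2};            // e.g. a keyword written as a string
  Action immediate_string_token{2, 0, 3};  // a string wrapped in IMMEDIATE_TOKEN

  assert(should_replace(pattern_token, string_token));           // strings still beat patterns
  assert(should_replace(string_token, immediate_string_token));  // immediate strings beat plain strings
  assert(!should_replace(immediate_string_token, string_token));
  return 0;
}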

@@ -23,6 +23,7 @@ using std::pair;
using std::set;
using std::string;
using std::to_string;
using std::unordered_set;
using std::vector;
using util::escape_char;
using rules::Symbol;
@@ -76,7 +77,7 @@ class CCodeGenerator {
Symbol keyword_capture_token;
const SyntaxGrammar syntax_grammar;
const LexicalGrammar lexical_grammar;
map<string, string> sanitized_names;
map<Symbol, string> symbol_ids;
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
vector<set<Symbol::Index>> external_scanner_states;
size_t next_parse_action_list_index;
@@ -165,6 +166,24 @@ class CCodeGenerator {
}
}
unordered_set<string> symbol_id_values;
symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end";
for (const Symbol &symbol : parse_table.symbols) {
if (!symbol.is_built_in()) {
assign_symbol_id(symbol, &symbol_id_values);
}
}
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
const ExternalToken &external_token = syntax_grammar.external_tokens[i];
if (external_token.corresponding_internal_token == rules::NONE()) {
assign_symbol_id(Symbol::external(i), &symbol_id_values);
} else {
symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token];
}
}
line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
line("#define STATE_COUNT " + to_string(parse_table.states.size()));
line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
@@ -175,6 +194,33 @@ class CCodeGenerator {
line();
}
void assign_symbol_id(const Symbol &symbol, unordered_set<string> *symbol_id_values) {
auto entry = entry_for_symbol(symbol);
string symbol_id;
switch (entry.second) {
case VariableTypeAuxiliary:
symbol_id = "aux_sym_" + sanitize_name(entry.first);
break;
case VariableTypeAnonymous:
symbol_id = "anon_sym_" + sanitize_name(entry.first);
break;
default:
symbol_id = "sym_" + sanitize_name(entry.first);
break;
}
unsigned suffix_number = 1;
string unique_symbol_id = symbol_id;
while (symbol_id_values->count(unique_symbol_id)) {
suffix_number++;
unique_symbol_id = symbol_id + to_string(suffix_number);
}
symbol_id_values->insert(unique_symbol_id);
symbol_ids[symbol] = unique_symbol_id;
}
void add_symbol_enum() {
line("enum {");
indent([&]() {
@@ -696,20 +742,7 @@ class CCodeGenerator {
}
string symbol_id(const Symbol &symbol) {
if (symbol == rules::END_OF_INPUT())
return "ts_builtin_sym_end";
auto entry = entry_for_symbol(symbol);
string name = sanitize_name(entry.first);
switch (entry.second) {
case VariableTypeAuxiliary:
return "aux_sym_" + name;
case VariableTypeAnonymous:
return "anon_sym_" + name;
default:
return "sym_" + name;
}
return symbol_ids[symbol];
}
string alias_id(const Alias &alias) {
@@ -776,47 +809,35 @@ class CCodeGenerator {
return name;
}
string sanitize_name(string name) {
auto existing = sanitized_names.find(name);
if (existing != sanitized_names.end())
return existing->second;
string stripped_name;
string sanitize_name(const string &name) {
string result;
for (char c : name) {
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9') || (c == '_')) {
stripped_name += c;
result += c;
} else {
auto replacement = REPLACEMENTS.find(c);
size_t i = stripped_name.size();
size_t i = result.size();
if (replacement != REPLACEMENTS.end()) {
if (i > 0 && stripped_name[i - 1] != '_')
stripped_name += "_";
stripped_name += replacement->second;
if (i > 0 && result[i - 1] != '_')
result += "_";
result += replacement->second;
}
}
}
for (size_t extra_number = 0;; extra_number++) {
string suffix = extra_number ? to_string(extra_number) : "";
string unique_name = stripped_name + suffix;
if (unique_name == "")
continue;
if (!has_sanitized_name(unique_name)) {
sanitized_names.insert({ name, unique_name });
return unique_name;
}
}
return result;
}
string _boolean(bool value) {
return value ? "true" : "false";
}
bool has_sanitized_name(string name) {
for (const auto &pair : sanitized_names)
if (pair.second == name)
bool has_sanitized_name(const Symbol &symbol, string name) {
for (const auto &pair : symbol_ids) {
if (pair.second == name) {
return true;
}
}
return false;
}
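As an aside on the symbol-id changes above, the sketch below shows the effect of the suffixing loop in assign_symbol_id detached from the generator: when two symbols sanitize to the same identifier, the later one receives a numeric suffix starting at 2. The identifier names here are hypothetical examples, not taken from a real generated parser.

#include <iostream>
#include <string>
#include <unordered_set>

// Mirrors the uniquing loop in assign_symbol_id.
std::string assign_unique_id(const std::string &symbol_id,
                             std::unordered_set<std::string> *used_ids) {
  unsigned suffix_number = 1;
  std::string unique_id = symbol_id;
  while (used_ids->count(unique_id)) {
    suffix_number++;
    unique_id = symbol_id + std::to_string(suffix_number);
  }
  used_ids->insert(unique_id);
  return unique_id;
}

int main() {
  std::unordered_set<std::string> used_ids;
  // Hypothetical sanitized names; the colliding one gets the "2" suffix.
  std::cout << assign_unique_id("anon_sym_COLON_COLON", &used_ids) << "\n";  // anon_sym_COLON_COLON
  std::cout << assign_unique_id("anon_sym_COLON_COLON", &used_ids) << "\n";  // anon_sym_COLON_COLON2
  std::cout << assign_unique_id("sym_identifier", &used_ids) << "\n";        // sym_identifier
  return 0;
}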

@@ -201,7 +201,7 @@
"properties": {
"type": {
"type": "string",
"pattern": "^TOKEN$"
"pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
},
"content": {
"$ref": "#/definitions/rule"

@@ -16,9 +16,9 @@ AdvanceAction::AdvanceAction() : state_index(-1) {}
AdvanceAction::AdvanceAction(size_t state_index,
PrecedenceRange precedence_range,
bool in_main_token)
: state_index(state_index),
precedence_range(precedence_range),
in_main_token(in_main_token) {}
: state_index(state_index),
precedence_range(precedence_range),
in_main_token(in_main_token) {}
bool AdvanceAction::operator==(const AdvanceAction &other) const {
return (state_index == other.state_index) &&
@@ -26,19 +26,21 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const {
}
AcceptTokenAction::AcceptTokenAction()
: symbol(rules::NONE()), precedence(0), is_string(false) {}
: symbol(rules::NONE()), precedence(0), implicit_precedence(0) {}
AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence,
bool is_string)
: symbol(symbol), precedence(precedence), is_string(is_string) {}
AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence)
: symbol(symbol), precedence(precedence), implicit_precedence(0) {}
bool AcceptTokenAction::is_present() const {
return symbol != rules::NONE();
}
bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
return (symbol == other.symbol) && (precedence == other.precedence) &&
(is_string == other.is_string);
return (
symbol == other.symbol &&
precedence == other.precedence &&
implicit_precedence == other.implicit_precedence
);
}
bool LexState::operator==(const LexState &other) const {

@@ -25,14 +25,14 @@ struct AdvanceAction {
struct AcceptTokenAction {
AcceptTokenAction();
AcceptTokenAction(rules::Symbol, int, bool);
AcceptTokenAction(rules::Symbol, int);
bool is_present() const;
bool operator==(const AcceptTokenAction &other) const;
inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); }
rules::Symbol symbol;
int precedence;
bool is_string;
int implicit_precedence;
};
struct LexState {

@@ -116,6 +116,15 @@ ParseRuleResult parse_rule(json_value *rule_json) {
return Rule(Metadata::token(move(result.rule)));
}
if (type == "IMMEDIATE_TOKEN") {
json_value content_json = rule_json->operator[]("content");
auto result = parse_rule(&content_json);
if (!result.error_message.empty()) {
return "Invalid token content: " + result.error_message;
}
return Rule(Metadata::immediate_token(move(result.rule)));
}
if (type == "PATTERN") {
json_value value_json = rule_json->operator[]("value");
if (value_json.type == json_string) {

@@ -118,6 +118,8 @@ class TokenExtractor {
metadata.params.is_token = false;
if (metadata.params == rules::MetadataParams{}) {
return extract_token(*metadata.rule, VariableTypeAuxiliary);
} else if (metadata.rule->is<rules::String>()) {
return extract_token(metadata, VariableTypeAnonymous);
} else {
return extract_token(metadata, VariableTypeAuxiliary);
}

@@ -135,6 +135,9 @@ bool Rule::is<Blank>() const { return type == BlankType; }
template <>
bool Rule::is<Symbol>() const { return type == SymbolType; }
template <>
bool Rule::is<String>() const { return type == StringType; }
template <>
bool Rule::is<Repeat>() const { return type == RepeatType; }

@@ -75,6 +75,13 @@ Metadata Metadata::token(Rule &&rule) {
});
}
Metadata Metadata::immediate_token(Rule &&rule) {
return add_metadata(move(rule), [](MetadataParams &params) {
params.is_token = true;
params.is_main_token = true;
});
}
Metadata Metadata::active_prec(int precedence, Rule &&rule) {
return add_metadata(move(rule), [&](MetadataParams &params) {
params.has_precedence = true;

@@ -64,6 +64,7 @@ struct Metadata {
static Metadata merge(Rule &&rule, MetadataParams params);
static Metadata token(Rule &&rule);
static Metadata immediate_token(Rule &&rule);
static Metadata active_prec(int precedence, Rule &&rule);
static Metadata prec(int precedence, Rule &&rule);
static Metadata prec_left(int precedence, Rule &&rule);

@@ -69,7 +69,7 @@ int main() {
b();
c();
if () d();
if (*) d();
}
}
@@ -81,14 +81,14 @@ int main() {
(function_declarator (identifier) (parameter_list))
(compound_statement
(if_statement
(field_expression
(parenthesized_expression (field_expression
(identifier)
(MISSING))
(MISSING)))
(compound_statement
(expression_statement (call_expression (identifier) (argument_list)))
(expression_statement (call_expression (identifier) (argument_list)))
(if_statement
(MISSING)
(parenthesized_expression (pointer_expression (MISSING)))
(expression_statement (call_expression (identifier) (argument_list)))))))))
====================================

@@ -0,0 +1,29 @@
===============================
prefix expressions as arguments
===============================
a ::b ::c
---
(program
(call
(call
(identifier)
(prefix (identifier)))
(prefix (identifier))))
===============================
infix expressions
===============================
a::b::c
---
(program
(infix
(infix
(identifier)
(identifier))
(identifier)))

@@ -0,0 +1,61 @@
{
"name": "immediate_tokens",
"extras": [
{
"type": "PATTERN",
"value": "\\s"
}
],
"rules": {
"program": {"type": "SYMBOL", "name": "_expression"},
"_expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "call"},
{"type": "SYMBOL", "name": "infix"},
{"type": "SYMBOL", "name": "prefix"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"call": {
"type": "PREC_LEFT",
"value": -1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{"type": "SYMBOL", "name": "_expression"}
]
}
},
"prefix": {
"type": "SEQ",
"members": [
{"type": "STRING", "value": "::"},
{"type": "SYMBOL", "name": "identifier"}
]
},
"infix": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "_expression"},
{
"type": "IMMEDIATE_TOKEN",
"content": {"type": "STRING", "value": "::"}
},
{"type": "SYMBOL", "name": "identifier"}
]
},
"identifier": {
"type": "PATTERN",
"value": "[a-z]+"
}
}
}

@@ -0,0 +1 @@
This grammar demonstrates the usage of the IMMEDIATE_TOKEN rule. It allows the parser to produce a different token based on whether or not there are `extras` preceding the token's main content. When there are *no* leading `extras`, an immediate token is preferred over a normal token which would otherwise match.
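As a standalone illustration of that preference (a toy model, not tree-sitter source), the sketch below uses the fixture grammar's two `::` tokens: with no extras in between, the IMMEDIATE_TOKEN from `infix` wins, so `a::b` parses as an infix expression, while `a ::b` falls back to the ordinary `::` string in `prefix`, matching the corpus test above.

#include <iostream>
#include <string>

struct Candidate {
  std::string description;
  bool is_immediate;  // true if the token was written with IMMEDIATE_TOKEN
};

// Decide which of two candidates matching the same text is produced.
// An immediate token is only reachable when no extras (here: whitespace)
// separate it from the previous token; when both apply, it is preferred.
std::string winning_token(bool extras_precede,
                          const Candidate &immediate,
                          const Candidate &normal) {
  return extras_precede ? normal.description : immediate.description;
}

int main() {
  Candidate infix_sep{"'::' from the infix rule (IMMEDIATE_TOKEN)", true};
  Candidate prefix_sep{"'::' from the prefix rule (plain string)", false};

  std::cout << "a::b  -> " << winning_token(false, infix_sep, prefix_sep) << "\n";
  std::cout << "a ::b -> " << winning_token(true, infix_sep, prefix_sep) << "\n";
  return 0;
}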