Merge pull request #190 from tree-sitter/immediate-tokens
Add immediate token rule for enforcing no preceding extras
This commit is contained in:
commit
1dcbd21bbe
16 changed files with 220 additions and 67 deletions
|
|
@ -21,9 +21,9 @@ fetch_grammar() {
|
|||
)
|
||||
}
|
||||
|
||||
fetch_grammar javascript master
|
||||
fetch_grammar javascript immediate-tokens
|
||||
fetch_grammar json master
|
||||
fetch_grammar c master
|
||||
fetch_grammar c immediate-tokens
|
||||
fetch_grammar cpp master
|
||||
fetch_grammar python master
|
||||
fetch_grammar go master
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
@echo off
|
||||
|
||||
call:fetch_grammar javascript master
|
||||
call:fetch_grammar javascript immediate-tokens
|
||||
call:fetch_grammar json master
|
||||
call:fetch_grammar c master
|
||||
call:fetch_grammar c immediate-tokens
|
||||
call:fetch_grammar cpp master
|
||||
call:fetch_grammar python master
|
||||
call:fetch_grammar go master
|
||||
|
|
|
|||
|
|
@ -379,9 +379,14 @@ class LexTableBuilderImpl : public LexTableBuilder {
|
|||
for (const LexItem &item : item_set.entries) {
|
||||
LexItem::CompletionStatus completion_status = item.completion_status();
|
||||
if (completion_status.is_done) {
|
||||
AcceptTokenAction action(item.lhs, completion_status.precedence.max,
|
||||
item.lhs.is_built_in() ||
|
||||
grammar.variables[item.lhs.index].is_string);
|
||||
AcceptTokenAction action(item.lhs, completion_status.precedence.max);
|
||||
|
||||
if (!item.lhs.is_built_in()) {
|
||||
const LexicalVariable &variable = grammar.variables[item.lhs.index];
|
||||
if (variable.is_string) action.implicit_precedence += 2;
|
||||
if (is_immediate_token(variable.rule)) action.implicit_precedence += 1;
|
||||
}
|
||||
|
||||
AcceptTokenAction &existing_action = lex_table.states[state_id].accept_action;
|
||||
if (existing_action.is_present()) {
|
||||
if (should_replace_accept_action(existing_action, action)) {
|
||||
|
|
@ -458,8 +463,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
|
|||
|
||||
void remove_duplicate_lex_states(LexTable &lex_table) {
|
||||
for (LexState &state : lex_table.states) {
|
||||
state.accept_action.is_string = false;
|
||||
state.accept_action.precedence = 0;
|
||||
state.accept_action.implicit_precedence = 0;
|
||||
}
|
||||
|
||||
map<LexStateId, LexStateId> replacements;
|
||||
|
|
@ -523,12 +528,24 @@ class LexTableBuilderImpl : public LexTableBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
bool is_immediate_token(const Rule &rule) const {
|
||||
return rule.match(
|
||||
[](const Metadata &metadata) {
|
||||
return metadata.params.is_main_token;
|
||||
},
|
||||
|
||||
[](auto rule) {
|
||||
return false;
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
LexItemSet item_set_for_terminals(const LookaheadSet &terminals, bool with_separators) {
|
||||
LexItemSet result;
|
||||
terminals.for_each([&](Symbol symbol) {
|
||||
if (symbol.is_terminal()) {
|
||||
for (auto &&rule : rules_for_symbol(symbol)) {
|
||||
if (with_separators) {
|
||||
if (with_separators && !is_immediate_token(rule)) {
|
||||
for (const auto &separator_rule : separator_rules) {
|
||||
result.entries.insert(LexItem(
|
||||
symbol,
|
||||
|
|
@ -598,8 +615,8 @@ class LexTableBuilderImpl : public LexTableBuilder {
|
|||
const AcceptTokenAction &new_action) {
|
||||
if (new_action.precedence > old_action.precedence) return true;
|
||||
if (new_action.precedence < old_action.precedence) return false;
|
||||
if (new_action.is_string && !old_action.is_string) return true;
|
||||
if (old_action.is_string && !new_action.is_string) return false;
|
||||
if (new_action.implicit_precedence > old_action.implicit_precedence) return true;
|
||||
if (new_action.implicit_precedence < old_action.implicit_precedence) return false;
|
||||
return new_action.symbol.index < old_action.symbol.index;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ using std::pair;
|
|||
using std::set;
|
||||
using std::string;
|
||||
using std::to_string;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
using util::escape_char;
|
||||
using rules::Symbol;
|
||||
|
|
@ -76,7 +77,7 @@ class CCodeGenerator {
|
|||
Symbol keyword_capture_token;
|
||||
const SyntaxGrammar syntax_grammar;
|
||||
const LexicalGrammar lexical_grammar;
|
||||
map<string, string> sanitized_names;
|
||||
map<Symbol, string> symbol_ids;
|
||||
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
|
||||
vector<set<Symbol::Index>> external_scanner_states;
|
||||
size_t next_parse_action_list_index;
|
||||
|
|
@ -165,6 +166,24 @@ class CCodeGenerator {
|
|||
}
|
||||
}
|
||||
|
||||
unordered_set<string> symbol_id_values;
|
||||
symbol_ids[rules::END_OF_INPUT()] = "ts_builtin_sym_end";
|
||||
|
||||
for (const Symbol &symbol : parse_table.symbols) {
|
||||
if (!symbol.is_built_in()) {
|
||||
assign_symbol_id(symbol, &symbol_id_values);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
|
||||
const ExternalToken &external_token = syntax_grammar.external_tokens[i];
|
||||
if (external_token.corresponding_internal_token == rules::NONE()) {
|
||||
assign_symbol_id(Symbol::external(i), &symbol_id_values);
|
||||
} else {
|
||||
symbol_ids[Symbol::external(i)] = symbol_ids[external_token.corresponding_internal_token];
|
||||
}
|
||||
}
|
||||
|
||||
line("#define LANGUAGE_VERSION " + to_string(TREE_SITTER_LANGUAGE_VERSION));
|
||||
line("#define STATE_COUNT " + to_string(parse_table.states.size()));
|
||||
line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
|
||||
|
|
@ -175,6 +194,33 @@ class CCodeGenerator {
|
|||
line();
|
||||
}
|
||||
|
||||
void assign_symbol_id(const Symbol &symbol, unordered_set<string> *symbol_id_values) {
|
||||
auto entry = entry_for_symbol(symbol);
|
||||
|
||||
string symbol_id;
|
||||
switch (entry.second) {
|
||||
case VariableTypeAuxiliary:
|
||||
symbol_id = "aux_sym_" + sanitize_name(entry.first);
|
||||
break;
|
||||
case VariableTypeAnonymous:
|
||||
symbol_id = "anon_sym_" + sanitize_name(entry.first);
|
||||
break;
|
||||
default:
|
||||
symbol_id = "sym_" + sanitize_name(entry.first);
|
||||
break;
|
||||
}
|
||||
|
||||
unsigned suffix_number = 1;
|
||||
string unique_symbol_id = symbol_id;
|
||||
while (symbol_id_values->count(unique_symbol_id)) {
|
||||
suffix_number++;
|
||||
unique_symbol_id = symbol_id + to_string(suffix_number);
|
||||
}
|
||||
|
||||
symbol_id_values->insert(unique_symbol_id);
|
||||
symbol_ids[symbol] = unique_symbol_id;
|
||||
}
|
||||
|
||||
void add_symbol_enum() {
|
||||
line("enum {");
|
||||
indent([&]() {
|
||||
|
|
@ -696,20 +742,7 @@ class CCodeGenerator {
|
|||
}
|
||||
|
||||
string symbol_id(const Symbol &symbol) {
|
||||
if (symbol == rules::END_OF_INPUT())
|
||||
return "ts_builtin_sym_end";
|
||||
|
||||
auto entry = entry_for_symbol(symbol);
|
||||
string name = sanitize_name(entry.first);
|
||||
|
||||
switch (entry.second) {
|
||||
case VariableTypeAuxiliary:
|
||||
return "aux_sym_" + name;
|
||||
case VariableTypeAnonymous:
|
||||
return "anon_sym_" + name;
|
||||
default:
|
||||
return "sym_" + name;
|
||||
}
|
||||
return symbol_ids[symbol];
|
||||
}
|
||||
|
||||
string alias_id(const Alias &alias) {
|
||||
|
|
@ -776,47 +809,35 @@ class CCodeGenerator {
|
|||
return name;
|
||||
}
|
||||
|
||||
string sanitize_name(string name) {
|
||||
auto existing = sanitized_names.find(name);
|
||||
if (existing != sanitized_names.end())
|
||||
return existing->second;
|
||||
|
||||
string stripped_name;
|
||||
string sanitize_name(const string &name) {
|
||||
string result;
|
||||
for (char c : name) {
|
||||
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
|
||||
('0' <= c && c <= '9') || (c == '_')) {
|
||||
stripped_name += c;
|
||||
result += c;
|
||||
} else {
|
||||
auto replacement = REPLACEMENTS.find(c);
|
||||
size_t i = stripped_name.size();
|
||||
size_t i = result.size();
|
||||
if (replacement != REPLACEMENTS.end()) {
|
||||
if (i > 0 && stripped_name[i - 1] != '_')
|
||||
stripped_name += "_";
|
||||
stripped_name += replacement->second;
|
||||
if (i > 0 && result[i - 1] != '_')
|
||||
result += "_";
|
||||
result += replacement->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t extra_number = 0;; extra_number++) {
|
||||
string suffix = extra_number ? to_string(extra_number) : "";
|
||||
string unique_name = stripped_name + suffix;
|
||||
if (unique_name == "")
|
||||
continue;
|
||||
if (!has_sanitized_name(unique_name)) {
|
||||
sanitized_names.insert({ name, unique_name });
|
||||
return unique_name;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
string _boolean(bool value) {
|
||||
return value ? "true" : "false";
|
||||
}
|
||||
|
||||
bool has_sanitized_name(string name) {
|
||||
for (const auto &pair : sanitized_names)
|
||||
if (pair.second == name)
|
||||
bool has_sanitized_name(const Symbol &symbol, string name) {
|
||||
for (const auto &pair : symbol_ids) {
|
||||
if (pair.second == name) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -201,7 +201,7 @@
|
|||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"pattern": "^TOKEN$"
|
||||
"pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
|
||||
},
|
||||
"content": {
|
||||
"$ref": "#/definitions/rule"
|
||||
|
|
|
|||
|
|
@ -16,9 +16,9 @@ AdvanceAction::AdvanceAction() : state_index(-1) {}
|
|||
AdvanceAction::AdvanceAction(size_t state_index,
|
||||
PrecedenceRange precedence_range,
|
||||
bool in_main_token)
|
||||
: state_index(state_index),
|
||||
precedence_range(precedence_range),
|
||||
in_main_token(in_main_token) {}
|
||||
: state_index(state_index),
|
||||
precedence_range(precedence_range),
|
||||
in_main_token(in_main_token) {}
|
||||
|
||||
bool AdvanceAction::operator==(const AdvanceAction &other) const {
|
||||
return (state_index == other.state_index) &&
|
||||
|
|
@ -26,19 +26,21 @@ bool AdvanceAction::operator==(const AdvanceAction &other) const {
|
|||
}
|
||||
|
||||
AcceptTokenAction::AcceptTokenAction()
|
||||
: symbol(rules::NONE()), precedence(0), is_string(false) {}
|
||||
: symbol(rules::NONE()), precedence(0), implicit_precedence(0) {}
|
||||
|
||||
AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence,
|
||||
bool is_string)
|
||||
: symbol(symbol), precedence(precedence), is_string(is_string) {}
|
||||
AcceptTokenAction::AcceptTokenAction(Symbol symbol, int precedence)
|
||||
: symbol(symbol), precedence(precedence), implicit_precedence(0) {}
|
||||
|
||||
bool AcceptTokenAction::is_present() const {
|
||||
return symbol != rules::NONE();
|
||||
}
|
||||
|
||||
bool AcceptTokenAction::operator==(const AcceptTokenAction &other) const {
|
||||
return (symbol == other.symbol) && (precedence == other.precedence) &&
|
||||
(is_string == other.is_string);
|
||||
return (
|
||||
symbol == other.symbol &&
|
||||
precedence == other.precedence &&
|
||||
implicit_precedence == other.implicit_precedence
|
||||
);
|
||||
}
|
||||
|
||||
bool LexState::operator==(const LexState &other) const {
|
||||
|
|
|
|||
|
|
@ -25,14 +25,14 @@ struct AdvanceAction {
|
|||
|
||||
struct AcceptTokenAction {
|
||||
AcceptTokenAction();
|
||||
AcceptTokenAction(rules::Symbol, int, bool);
|
||||
AcceptTokenAction(rules::Symbol, int);
|
||||
bool is_present() const;
|
||||
bool operator==(const AcceptTokenAction &other) const;
|
||||
inline bool operator!=(const AcceptTokenAction &other) const { return !operator==(other); }
|
||||
|
||||
rules::Symbol symbol;
|
||||
int precedence;
|
||||
bool is_string;
|
||||
int implicit_precedence;
|
||||
};
|
||||
|
||||
struct LexState {
|
||||
|
|
|
|||
|
|
@ -116,6 +116,15 @@ ParseRuleResult parse_rule(json_value *rule_json) {
|
|||
return Rule(Metadata::token(move(result.rule)));
|
||||
}
|
||||
|
||||
if (type == "IMMEDIATE_TOKEN") {
|
||||
json_value content_json = rule_json->operator[]("content");
|
||||
auto result = parse_rule(&content_json);
|
||||
if (!result.error_message.empty()) {
|
||||
return "Invalid token content: " + result.error_message;
|
||||
}
|
||||
return Rule(Metadata::immediate_token(move(result.rule)));
|
||||
}
|
||||
|
||||
if (type == "PATTERN") {
|
||||
json_value value_json = rule_json->operator[]("value");
|
||||
if (value_json.type == json_string) {
|
||||
|
|
|
|||
|
|
@ -118,6 +118,8 @@ class TokenExtractor {
|
|||
metadata.params.is_token = false;
|
||||
if (metadata.params == rules::MetadataParams{}) {
|
||||
return extract_token(*metadata.rule, VariableTypeAuxiliary);
|
||||
} else if (metadata.rule->is<rules::String>()) {
|
||||
return extract_token(metadata, VariableTypeAnonymous);
|
||||
} else {
|
||||
return extract_token(metadata, VariableTypeAuxiliary);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -135,6 +135,9 @@ bool Rule::is<Blank>() const { return type == BlankType; }
|
|||
template <>
|
||||
bool Rule::is<Symbol>() const { return type == SymbolType; }
|
||||
|
||||
template <>
|
||||
bool Rule::is<String>() const { return type == StringType; }
|
||||
|
||||
template <>
|
||||
bool Rule::is<Repeat>() const { return type == RepeatType; }
|
||||
|
||||
|
|
|
|||
|
|
@ -75,6 +75,13 @@ Metadata Metadata::token(Rule &&rule) {
|
|||
});
|
||||
}
|
||||
|
||||
Metadata Metadata::immediate_token(Rule &&rule) {
|
||||
return add_metadata(move(rule), [](MetadataParams &params) {
|
||||
params.is_token = true;
|
||||
params.is_main_token = true;
|
||||
});
|
||||
}
|
||||
|
||||
Metadata Metadata::active_prec(int precedence, Rule &&rule) {
|
||||
return add_metadata(move(rule), [&](MetadataParams &params) {
|
||||
params.has_precedence = true;
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ struct Metadata {
|
|||
|
||||
static Metadata merge(Rule &&rule, MetadataParams params);
|
||||
static Metadata token(Rule &&rule);
|
||||
static Metadata immediate_token(Rule &&rule);
|
||||
static Metadata active_prec(int precedence, Rule &&rule);
|
||||
static Metadata prec(int precedence, Rule &&rule);
|
||||
static Metadata prec_left(int precedence, Rule &&rule);
|
||||
|
|
|
|||
8
test/fixtures/error_corpus/c_errors.txt
vendored
8
test/fixtures/error_corpus/c_errors.txt
vendored
|
|
@ -69,7 +69,7 @@ int main() {
|
|||
b();
|
||||
c();
|
||||
|
||||
if () d();
|
||||
if (*) d();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -81,14 +81,14 @@ int main() {
|
|||
(function_declarator (identifier) (parameter_list))
|
||||
(compound_statement
|
||||
(if_statement
|
||||
(field_expression
|
||||
(parenthesized_expression (field_expression
|
||||
(identifier)
|
||||
(MISSING))
|
||||
(MISSING)))
|
||||
(compound_statement
|
||||
(expression_statement (call_expression (identifier) (argument_list)))
|
||||
(expression_statement (call_expression (identifier) (argument_list)))
|
||||
(if_statement
|
||||
(MISSING)
|
||||
(parenthesized_expression (pointer_expression (MISSING)))
|
||||
(expression_statement (call_expression (identifier) (argument_list)))))))))
|
||||
|
||||
====================================
|
||||
|
|
|
|||
29
test/fixtures/test_grammars/immediate_tokens/corpus.txt
vendored
Normal file
29
test/fixtures/test_grammars/immediate_tokens/corpus.txt
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
===============================
|
||||
prefix expressions as arguments
|
||||
===============================
|
||||
|
||||
a ::b ::c
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(call
|
||||
(call
|
||||
(identifier)
|
||||
(prefix (identifier)))
|
||||
(prefix (identifier))))
|
||||
|
||||
===============================
|
||||
infix expressions
|
||||
===============================
|
||||
|
||||
a::b::c
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(infix
|
||||
(infix
|
||||
(identifier)
|
||||
(identifier))
|
||||
(identifier)))
|
||||
61
test/fixtures/test_grammars/immediate_tokens/grammar.json
vendored
Normal file
61
test/fixtures/test_grammars/immediate_tokens/grammar.json
vendored
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
{
|
||||
"name": "immediate_tokens",
|
||||
|
||||
"extras": [
|
||||
{
|
||||
"type": "PATTERN",
|
||||
"value": "\\s"
|
||||
}
|
||||
],
|
||||
|
||||
"rules": {
|
||||
"program": {"type": "SYMBOL", "name": "_expression"},
|
||||
|
||||
"_expression": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "call"},
|
||||
{"type": "SYMBOL", "name": "infix"},
|
||||
{"type": "SYMBOL", "name": "prefix"},
|
||||
{"type": "SYMBOL", "name": "identifier"}
|
||||
]
|
||||
},
|
||||
|
||||
"call": {
|
||||
"type": "PREC_LEFT",
|
||||
"value": -1,
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{"type": "SYMBOL", "name": "_expression"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
"prefix": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "::"},
|
||||
{"type": "SYMBOL", "name": "identifier"}
|
||||
]
|
||||
},
|
||||
|
||||
"infix": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "_expression"},
|
||||
{
|
||||
"type": "IMMEDIATE_TOKEN",
|
||||
"content": {"type": "STRING", "value": "::"}
|
||||
},
|
||||
{"type": "SYMBOL", "name": "identifier"}
|
||||
]
|
||||
},
|
||||
|
||||
"identifier": {
|
||||
"type": "PATTERN",
|
||||
"value": "[a-z]+"
|
||||
}
|
||||
}
|
||||
}
|
||||
1
test/fixtures/test_grammars/immediate_tokens/readme.md
vendored
Normal file
1
test/fixtures/test_grammars/immediate_tokens/readme.md
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
This grammar demonstrates the usage of the IMMEDIATE_TOKEN rule. It allows the parser to produce a different token based on whether or not there are `extras` preceding the token's main content. When there are *no* leading `extras`, an immediate token is preferred over a normal token which would otherwise match.
|
||||
Loading…
Add table
Add a link
Reference in a new issue