tree-sitter/src/compiler/generate_code/c_code.cc

688 lines
19 KiB
C++
Raw Normal View History

2014-10-12 12:27:19 -07:00
#include <functional>
#include <map>
#include <set>
2014-03-09 22:45:33 -07:00
#include <string>
#include <utility>
#include <vector>
2014-03-09 21:37:21 -07:00
#include "compiler/generate_code/c_code.h"
2014-10-12 12:27:19 -07:00
#include "compiler/lex_table.h"
#include "compiler/parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
2014-10-12 12:27:19 -07:00
#include "compiler/rules/built_in_symbols.h"
#include "compiler/util/string_helpers.h"
2013-12-15 19:33:34 -08:00
namespace tree_sitter {
namespace generate_code {
2016-11-30 09:34:47 -08:00
using std::function;
using std::map;
using std::pair;
using std::set;
2014-10-12 12:27:19 -07:00
using std::string;
using std::to_string;
using std::vector;
using util::escape_char;
2016-11-30 09:34:47 -08:00
using rules::Symbol;
// File-local variable named "end". It is not referenced anywhere else in the
// visible part of this file. NOTE(review): confirm whether it is still needed.
static Variable EOF_ENTRY("end", VariableTypeNamed, rule_ptr());
2015-07-27 18:29:48 -07:00
// Maps punctuation and whitespace characters to the upper-case words that
// sanitize_name() substitutes for them when turning a symbol name into a C
// identifier.
static const map<char, string> REPLACEMENTS = {
  // Punctuation
  { '~', "TILDE" },
  { '`', "BQUOTE" },
  { '!', "BANG" },
  { '@', "AT" },
  { '#', "POUND" },
  { '$', "DOLLAR" },
  { '%', "PERCENT" },
  { '^', "CARET" },
  { '&', "AMP" },
  { '*', "STAR" },
  { '(', "LPAREN" },
  { ')', "RPAREN" },
  { '-', "DASH" },
  { '+', "PLUS" },
  { '=', "EQ" },
  { '{', "LBRACE" },
  { '}', "RBRACE" },
  { '[', "LBRACK" },
  { ']', "RBRACK" },
  { '\\', "BSLASH" },
  { '|', "PIPE" },
  { ':', "COLON" },
  { ';', "SEMI" },
  { '"', "DQUOTE" },
  { '\'', "SQUOTE" },
  { '<', "LT" },
  { '>', "GT" },
  { ',', "COMMA" },
  { '.', "DOT" },
  { '?', "QMARK" },
  { '/', "SLASH" },

  // Whitespace
  { '\n', "LF" },
  { '\r', "CR" },
  { '\t', "TAB" },
};
// Builds the text of a generated C parser source file from a parse table, a
// lex table and the two grammars. code() produces and returns the full text.
class CCodeGenerator {
// Output text accumulated so far; every add()/line() call appends to it.
string buffer;
// Current nesting depth; add_padding() emits one padding unit per level.
size_t indent_level;
// Grammar name, used to build the exported `ts_language_<name>` identifiers.
const string name;
// Copies of the inputs the generator reads from.
const ParseTable parse_table;
const LexTable lex_table;
const SyntaxGrammar syntax_grammar;
const LexicalGrammar lexical_grammar;
// Cache from original symbol name to the C-identifier-safe name chosen for it.
map<string, string> sanitized_names;
// Distinct parse table entries paired with their starting index in the
// generated ts_parse_actions array.
vector<pair<size_t, ParseTableEntry>> parse_table_entries;
// Distinct sets of external token indices; a set's position in this vector is
// the value emitted for `.external_lex_state`.
vector<set<Symbol::Index>> external_scanner_states;
2015-12-29 11:20:52 -08:00
// Index that the next new entry will receive in ts_parse_actions.
size_t next_parse_action_list_index;
public:
// Creates a generator for the grammar called `name`. The tables and grammars
// are stored in the corresponding members; output starts at indent level zero
// and parse action list indices start at zero.
CCodeGenerator(string name, const ParseTable &parse_table,
               const LexTable &lex_table, const SyntaxGrammar &syntax_grammar,
               const LexicalGrammar &lexical_grammar)
    : indent_level(0),
      name(name),
      parse_table(parse_table),
      lex_table(lex_table),
      syntax_grammar(syntax_grammar),
      lexical_grammar(lexical_grammar),
      next_parse_action_list_index(0) {
}
// Generates the complete C source text and returns it.
//
// The sections are emitted in a fixed order. add_lex_modes_list() must run
// before add_external_scanner_states_list(), because the former records the
// external scanner states that the latter prints.
string code() {
  buffer.clear();

  add_includes();
  add_warning_pragma();
  add_stats();
  add_symbol_enum();
  add_symbol_names_list();
  add_symbol_metadata_list();
  add_lex_function();
  add_lex_modes_list();

  // The external scanner sections are only emitted for grammars that declare
  // external tokens.
  const bool has_external_tokens = !syntax_grammar.external_tokens.empty();
  if (has_external_tokens) {
    add_external_token_enum();
    add_external_scanner_symbol_map();
    add_external_scanner_states_list();
  }

  add_parse_table();
  add_parser_export();

  return buffer;
}
private:
2014-10-12 12:27:19 -07:00
// Emits the #include of the tree-sitter parser header followed by a newline.
void add_includes() {
  add("#include <tree_sitter/parser.h>\n");
}
2016-11-30 09:34:47 -08:00
void add_warning_pragma() {
line("#pragma GCC diagnostic push");
line("#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"");
line();
}
void add_stats() {
size_t token_count = 1 + lexical_grammar.variables.size();
for (const ExternalToken &external_token : syntax_grammar.external_tokens) {
if (external_token.corresponding_internal_token == rules::NONE()) {
token_count++;
}
}
line("#define STATE_COUNT " + to_string(parse_table.states.size()));
line("#define SYMBOL_COUNT " + to_string(parse_table.symbols.size()));
line("#define TOKEN_COUNT " + to_string(token_count));
line("#define EXTERNAL_TOKEN_COUNT " + to_string(syntax_grammar.external_tokens.size()));
line();
}
2014-10-12 12:27:19 -07:00
void add_symbol_enum() {
line("enum {");
indent([&]() {
size_t i = 1;
for (const auto &entry : parse_table.symbols) {
2016-11-30 09:34:47 -08:00
const Symbol &symbol = entry.first;
if (!symbol.is_built_in()) {
line(symbol_id(symbol) + " = " + to_string(i) + ",");
i++;
2013-12-15 19:33:34 -08:00
}
}
});
line("};");
line();
}
2014-10-12 12:27:19 -07:00
void add_symbol_names_list() {
line("static const char *ts_symbol_names[] = {");
indent([&]() {
for (const auto &entry : parse_table.symbols)
line("[" + symbol_id(entry.first) + "] = \"" +
sanitize_name_for_string(symbol_name(entry.first)) + "\",");
});
line("};");
line();
}
2016-11-30 09:34:47 -08:00
void add_symbol_metadata_list() {
line("static const TSSymbolMetadata ts_symbol_metadata[SYMBOL_COUNT] = {");
indent([&]() {
for (const auto &entry : parse_table.symbols) {
2016-11-30 09:34:47 -08:00
const Symbol &symbol = entry.first;
line("[" + symbol_id(symbol) + "] = {");
indent([&]() {
switch (symbol_type(symbol)) {
case VariableTypeNamed:
line(".visible = true,");
line(".named = true,");
break;
case VariableTypeAnonymous:
line(".visible = true,");
line(".named = false,");
break;
case VariableTypeHidden:
2016-07-17 08:16:03 -07:00
line(".visible = false,");
line(".named = true,");
break;
case VariableTypeAuxiliary:
line(".visible = false,");
line(".named = false,");
break;
}
line(".structural = " + _boolean(entry.second.structural) + ",");
line(".extra = " + _boolean(entry.second.extra) + ",");
});
line("},");
}
});
line("};");
line();
}
2014-10-12 12:27:19 -07:00
void add_lex_function() {
line("static bool ts_lex(TSLexer *lexer, TSStateId state) {");
indent([&]() {
line("START_LEXER();");
_switch("state", [&]() {
size_t i = 0;
for (const LexState &state : lex_table.states)
_case(to_string(i++), [&]() { add_lex_state(state); });
2014-10-12 12:27:19 -07:00
_default([&]() { line("LEX_ERROR();"); });
});
});
line("}");
line();
}
2016-11-30 09:34:47 -08:00
void add_lex_modes_list() {
add_external_scanner_state({});
2016-11-30 09:34:47 -08:00
map<Symbol::Index, Symbol::Index> external_tokens_by_corresponding_internal_token;
for (size_t i = 0, n = lexical_grammar.variables.size(); i < n; i++) {
for (size_t j = 0; j < syntax_grammar.external_tokens.size(); j++) {
const ExternalToken &external_token = syntax_grammar.external_tokens[j];
if (external_token.corresponding_internal_token.index == i) {
external_tokens_by_corresponding_internal_token.insert({i, j});
break;
}
}
}
2016-11-30 09:34:47 -08:00
line("static TSLexMode ts_lex_modes[STATE_COUNT] = {");
indent([&]() {
size_t state_id = 0;
2016-11-30 09:34:47 -08:00
for (const auto &state : parse_table.states) {
line("[" + to_string(state_id++) + "] = {.lex_state = ");
add(to_string(state.lex_state_id));
bool needs_external_scanner = false;
2016-11-30 09:34:47 -08:00
set<Symbol::Index> external_token_indices;
for (const auto &pair : state.terminal_entries) {
Symbol symbol = pair.first;
if (symbol.is_external()) {
needs_external_scanner = true;
2016-11-30 09:34:47 -08:00
external_token_indices.insert(symbol.index);
} else if (symbol.is_token()) {
auto corresponding_external_token =
external_tokens_by_corresponding_internal_token.find(symbol.index);
if (corresponding_external_token != external_tokens_by_corresponding_internal_token.end()) {
external_token_indices.insert(corresponding_external_token->second);
}
}
}
if (needs_external_scanner) {
add(", .external_lex_state = " + add_external_scanner_state(external_token_indices));
2016-11-30 09:34:47 -08:00
}
add("},");
}
});
line("};");
line();
}
// Returns, as a decimal string, the position of `external_token_ids` within
// external_scanner_states, appending the set first if it is not yet present.
string add_external_scanner_state(set<Symbol::Index> external_token_ids) {
  const size_t known_count = external_scanner_states.size();

  size_t position = 0;
  while (position < known_count &&
         !(external_scanner_states[position] == external_token_ids))
    position++;

  // Not found: the new set lands at the old end of the vector.
  if (position == known_count)
    external_scanner_states.push_back(external_token_ids);

  return to_string(position);
}
void add_external_token_enum() {
line("enum {");
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++)
line(external_token_id(i) + ",");
});
line("};");
line();
}
void add_external_scanner_symbol_map() {
line("TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = {");
2016-11-30 09:34:47 -08:00
indent([&]() {
for (size_t i = 0; i < syntax_grammar.external_tokens.size(); i++) {
line("[" + external_token_id(i) + "] = " + symbol_id(Symbol(i, Symbol::External)) + ",");
}
});
line("};");
line();
}
void add_external_scanner_states_list() {
line("static bool ts_external_scanner_states[");
add(to_string(external_scanner_states.size()));
2016-11-30 09:34:47 -08:00
add("][EXTERNAL_TOKEN_COUNT] = {");
indent([&]() {
size_t i = 0;
for (const auto &valid_external_lookaheads : external_scanner_states) {
if (!valid_external_lookaheads.empty()) {
2016-11-30 09:34:47 -08:00
line("[" + to_string(i) + "] = {");
indent([&]() {
for (Symbol::Index id : valid_external_lookaheads) {
2016-11-30 09:34:47 -08:00
line("[" + external_token_id(id) + "] = true,");
}
});
line("},");
}
i++;
}
});
line("};");
line();
}
2014-10-12 12:27:19 -07:00
void add_parse_table() {
2016-07-17 07:25:13 -07:00
add_parse_action_list_id(ParseTableEntry{ {}, false, false });
2015-12-29 11:20:52 -08:00
size_t state_id = 0;
2015-12-29 11:20:52 -08:00
line("static unsigned short ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = {");
indent([&]() {
for (const auto &state : parse_table.states) {
line("[" + to_string(state_id++) + "] = {");
indent([&]() {
for (const auto &entry : state.nonterminal_entries) {
2016-11-30 09:34:47 -08:00
line("[" + symbol_id(Symbol(entry.first, Symbol::NonTerminal)) + "] = STATE(");
add(to_string(entry.second));
add("),");
}
for (const auto &entry : state.terminal_entries) {
2016-11-30 09:34:47 -08:00
line("[" + symbol_id(entry.first) + "] = ACTIONS(");
add(to_string(add_parse_action_list_id(entry.second)));
add("),");
}
});
line("},");
}
});
line("};");
line();
2015-12-29 11:20:52 -08:00
add_parse_action_list();
line();
}
2014-10-12 12:27:19 -07:00
void add_parser_export() {
string external_scanner_name = "ts_language_" + name + "_external_scanner";
2016-11-30 09:34:47 -08:00
if (!syntax_grammar.external_tokens.empty()) {
2016-11-30 09:34:47 -08:00
line("void *" + external_scanner_name + "_create();");
line("void " + external_scanner_name + "_destroy();");
line("void " + external_scanner_name + "_reset(void *);");
line("bool " + external_scanner_name + "_scan(void *, TSLexer *, const bool *);");
line("bool " + external_scanner_name + "_serialize(void *, TSExternalTokenState);");
line("void " + external_scanner_name + "_deserialize(void *, const TSExternalTokenState);");
2016-11-30 09:34:47 -08:00
line();
}
line("const TSLanguage *ts_language_" + name + "() {");
indent([&]() {
line("GET_LANGUAGE(");
if (syntax_grammar.external_tokens.empty()) {
add(");");
} else {
indent([&]() {
line("(const bool *)ts_external_scanner_states,");
line("ts_external_scanner_symbol_map,");
line(external_scanner_name + "_create,");
line(external_scanner_name + "_destroy,");
line(external_scanner_name + "_reset,");
line(external_scanner_name + "_scan,");
line(external_scanner_name + "_serialize,");
line(external_scanner_name + "_deserialize,");
});
line(");");
}
});
line("}");
line();
}
2014-10-12 12:27:19 -07:00
void add_lex_state(const LexState &lex_state) {
if (lex_state.is_token_start)
line("START_TOKEN();");
for (const auto &pair : lex_state.advance_actions)
2014-10-12 12:27:19 -07:00
if (!pair.first.is_empty())
_if([&]() { add_character_set_condition(pair.first); },
[&]() { add_advance_action(pair.second); });
if (lex_state.accept_action.is_present())
add_accept_token_action(lex_state.accept_action);
else
line("LEX_ERROR();");
}
2014-10-12 12:27:19 -07:00
// Emits a C condition testing whether the lookahead belongs to `rule`. A set
// that includes all characters is written as the negation of its excluded
// ranges; any other set is written as its included ranges.
void add_character_set_condition(const rules::CharacterSet &rule) {
  if (!rule.includes_all) {
    add_character_range_conditions(rule.included_ranges());
    return;
  }

  add("!(");
  add_character_range_conditions(rule.excluded_ranges());
  add(")");
}
2014-10-12 12:27:19 -07:00
// Emits a C expression that is true when the lookahead falls in any of
// `ranges`.
//
// - No ranges: emits `false`. Previously nothing was emitted, so the caller
//   produced `!()` or an empty `if (` condition, which is not valid C.
// - One range: emits the bare condition.
// - Several ranges: emits each condition in parentheses, joined by ` ||`,
//   with every condition after the first on its own padded line.
void add_character_range_conditions(const vector<rules::CharacterRange> &ranges) {
  if (ranges.empty()) {
    add("false");
    return;
  }

  if (ranges.size() == 1) {
    add_character_range_condition(ranges.front());
    return;
  }

  bool first = true;
  for (const auto &range : ranges) {
    if (!first) {
      add(" ||");
      line();
      add_padding();
    }
    add("(");
    add_character_range_condition(range);
    add(")");
    first = false;
  }
}
2014-10-12 12:27:19 -07:00
// Emits the comparison for a single range: an equality test when the range
// holds one character, otherwise a two-sided bounds check on `lookahead`.
void add_character_range_condition(const rules::CharacterRange &range) {
  const string lookahead = "lookahead";
  const bool single_character = (range.min == range.max);

  if (single_character) {
    add(lookahead + " == " + escape_char(range.min));
    return;
  }

  add(escape_char(range.min) + string(" <= ") + lookahead + " && " +
      lookahead + " <= " + escape_char(range.max));
}
void add_advance_action(const AdvanceAction &action) {
if (action.in_main_token)
line("ADVANCE(" + to_string(action.state_index) + ");");
else
line("SKIP(" + to_string(action.state_index) + ");");
}
void add_accept_token_action(const AcceptTokenAction &action) {
line("ACCEPT_TOKEN(" + symbol_id(action.symbol) + ");");
}
2015-12-29 11:20:52 -08:00
void add_parse_action_list() {
line("static TSParseActionEntry ts_parse_actions[] = {");
indent([&]() {
for (const auto &pair : parse_table_entries) {
2015-12-29 11:20:52 -08:00
size_t index = pair.first;
line("[" + to_string(index) + "] = {.count = " +
to_string(pair.second.actions.size()) + ", .reusable = " +
_boolean(pair.second.reusable) + ", .depends_on_lookahead = " +
_boolean(pair.second.depends_on_lookahead) + "},");
2015-12-29 11:20:52 -08:00
for (const ParseAction &action : pair.second.actions) {
2015-12-29 11:20:52 -08:00
add(" ");
switch (action.type) {
case ParseActionTypeError:
break;
case ParseActionTypeAccept:
add("ACCEPT_INPUT()");
break;
case ParseActionTypeShift:
if (action.extra) {
add("SHIFT_EXTRA()");
} else {
add("SHIFT(" + to_string(action.state_index) + ")");
2015-12-29 11:20:52 -08:00
}
break;
case ParseActionTypeReduce:
if (action.fragile) {
add("REDUCE_FRAGILE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
2015-12-29 11:20:52 -08:00
} else {
add("REDUCE(" + symbol_id(action.symbol) + ", " +
to_string(action.consumed_symbol_count) + ")");
2015-12-29 11:20:52 -08:00
}
break;
case ParseActionTypeRecover:
add("RECOVER(" + to_string(action.state_index) + ")");
break;
2015-12-29 11:20:52 -08:00
default: {}
}
2015-12-29 11:20:52 -08:00
add(",");
}
}
});
line("};");
2015-12-29 11:20:52 -08:00
}
// Returns the index of `entry` in the generated parse action list.
//
// An entry equal to one already recorded reuses that entry's index. Otherwise
// the entry is recorded at the next free index, and that index advances by
// one slot for the header plus one slot per action.
size_t add_parse_action_list_id(const ParseTableEntry &entry) {
  for (const auto &recorded : parse_table_entries)
    if (recorded.second == entry)
      return recorded.first;

  const size_t assigned_index = next_parse_action_list_index;
  parse_table_entries.push_back({ assigned_index, entry });
  next_parse_action_list_index = assigned_index + 1 + entry.actions.size();
  return assigned_index;
}
2016-11-30 09:34:47 -08:00
// Helper functions
2016-11-30 09:34:47 -08:00
// Returns the enum identifier for the external token at `index`:
// "ts_external_token_" followed by the token's name.
string external_token_id(Symbol::Index index) {
  const ExternalToken &token = syntax_grammar.external_tokens[index];
  return "ts_external_token_" + token.name;
}
2016-11-30 09:34:47 -08:00
// Returns the C identifier used for `symbol` in the generated code.
//
// The end-of-input symbol maps to "ts_builtin_sym_end". Every other symbol
// gets its sanitized name with a prefix chosen by variable type: "aux_sym_"
// for auxiliary, "anon_sym_" for anonymous, "sym_" for anything else.
string symbol_id(const Symbol &symbol) {
  if (symbol == rules::END_OF_INPUT())
    return "ts_builtin_sym_end";

  const pair<string, VariableType> entry = entry_for_symbol(symbol);
  const string sanitized = sanitize_name(entry.first);

  const char *prefix = "sym_";
  if (entry.second == VariableTypeAuxiliary)
    prefix = "aux_sym_";
  else if (entry.second == VariableTypeAnonymous)
    prefix = "anon_sym_";

  return prefix + sanitized;
}
2016-11-30 09:34:47 -08:00
// Returns the display name of `symbol`: "END" for the end-of-input symbol,
// otherwise the name recorded in the grammar.
string symbol_name(const Symbol &symbol) {
  const bool is_end = (symbol == rules::END_OF_INPUT());
  return is_end ? string("END") : entry_for_symbol(symbol).first;
}
2016-11-30 09:34:47 -08:00
// Returns the variable type of `symbol`. The end-of-input symbol is reported
// as hidden; every other symbol uses the type recorded in the grammar.
VariableType symbol_type(const Symbol &symbol) {
  const bool is_end = (symbol == rules::END_OF_INPUT());
  return is_end ? VariableTypeHidden : entry_for_symbol(symbol).second;
}
2016-11-30 09:34:47 -08:00
pair<string, VariableType> entry_for_symbol(const Symbol &symbol) {
switch (symbol.type) {
case Symbol::NonTerminal: {
const SyntaxVariable &variable = syntax_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::Terminal: {
const Variable &variable = lexical_grammar.variables[symbol.index];
return { variable.name, variable.type };
}
case Symbol::External: {
const ExternalToken &token = syntax_grammar.external_tokens[symbol.index];
return { token.name, token.type };
2016-11-30 09:34:47 -08:00
}
2014-10-12 12:27:19 -07:00
}
}
2014-10-12 12:27:19 -07:00
// C-code generation functions
// Emits `switch (<condition>) {`, the indented output of `body`, and the
// closing brace.
void _switch(string condition, function<void()> body) {
  const string header = "switch (" + condition + ") {";
  line(header);
  indent(body);
  line("}");
}
// Emits a `case <value>:` label followed by the indented output of `body`.
void _case(string value, function<void()> body) {
  const string label = "case " + value + ":";
  line(label);
  indent(body);
}
// Emits a `default:` label followed by the indented output of `body`.
void _default(function<void()> body) {
  const string label = "default:";
  line(label);
  indent(body);
}
// Emits an `if` statement without braces: `if (`, the output of `condition`
// (run one level deeper so continuation lines are padded), `)`, and then the
// indented output of `body`.
void _if(function<void()> condition, function<void()> body) {
  const string opening = "if (";
  const string closing = ")";

  line(opening);
  indent(condition);
  add(closing);

  indent(body);
}
2015-10-26 13:36:13 -07:00
// Escapes `name` for use inside a C string literal. Backslashes are escaped
// first, so the backslashes introduced by the later replacements are not
// escaped a second time.
string sanitize_name_for_string(string name) {
  static const char *const ESCAPES[][2] = {
    { "\\", "\\\\" },
    { "\n", "\\n" },
    { "\r", "\\r" },
    { "\"", "\\\"" },
  };
  for (const auto &escape : ESCAPES)
    util::str_replace(&name, escape[0], escape[1]);
  return name;
}
2014-10-12 12:27:19 -07:00
// Returns a C-identifier-safe, unique version of `name`, caching the result
// in sanitized_names so repeated calls give the same answer.
//
// ASCII letters, digits and underscores are kept. A character listed in
// REPLACEMENTS becomes its word, preceded by an underscore unless the result
// is empty so far or already ends in one. All other characters are dropped.
// If the result is empty or already taken, the numeric suffixes 1, 2, ... are
// tried in turn.
string sanitize_name(string name) {
  const auto cached = sanitized_names.find(name);
  if (cached != sanitized_names.end())
    return cached->second;

  auto is_identifier_char = [](char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
           ('0' <= c && c <= '9') || (c == '_');
  };

  string stripped_name;
  for (char c : name) {
    if (is_identifier_char(c)) {
      stripped_name += c;
      continue;
    }
    const auto replacement = REPLACEMENTS.find(c);
    if (replacement == REPLACEMENTS.end())
      continue;
    if (!stripped_name.empty() && stripped_name[stripped_name.size() - 1] != '_')
      stripped_name += "_";
    stripped_name += replacement->second;
  }

  string unique_name = stripped_name;
  size_t extra_number = 0;
  while (unique_name.empty() || has_sanitized_name(unique_name)) {
    extra_number++;
    unique_name = stripped_name + to_string(extra_number);
  }

  sanitized_names.insert({ name, unique_name });
  return unique_name;
}
// Returns the C spelling of a boolean: "true" or "false".
string _boolean(bool value) {
  if (value)
    return "true";
  return "false";
}
2014-10-12 12:27:19 -07:00
// Reports whether `name` has already been handed out as a sanitized name,
// i.e. whether it appears as a value (not a key) in sanitized_names.
bool has_sanitized_name(string name) {
  auto entry = sanitized_names.begin();
  const auto end = sanitized_names.end();
  while (entry != end && !(entry->second == name))
    ++entry;
  return entry != end;
}
// General code generation functions
2015-07-31 16:32:24 -07:00
void line() {
line("");
}
// Starts a new output line and writes `input` on it after the current
// padding. An empty `input` emits only the newline, without padding.
void line(string input) {
  add("\n");
  if (input.empty())
    return;
  add_padding();
  add(input);
}
2014-10-12 12:27:19 -07:00
// Appends one padding unit per current indent level.
void add_padding() {
  size_t remaining = indent_level;
  while (remaining > 0) {
    add(" ");
    remaining--;
  }
}
2014-10-12 12:27:19 -07:00
// Runs `body` with the indent level raised by one, then restores it.
void indent(function<void()> body) {
  ++indent_level;
  body();
  --indent_level;
}
2014-10-12 12:27:19 -07:00
2015-07-31 16:32:24 -07:00
// Appends `input` verbatim to the output buffer.
void add(string input) {
  buffer.append(input);
}
};
// Generates the C source code of a parser for the grammar called `name` from
// the given parse table, lex table and grammars, and returns it as a string.
string c_code(string name, const ParseTable &parse_table,
              const LexTable &lex_table, const SyntaxGrammar &syntax_grammar,
              const LexicalGrammar &lexical_grammar) {
  CCodeGenerator generator(name, parse_table, lex_table, syntax_grammar,
                           lexical_grammar);
  return generator.code();
}
} // namespace generate_code
} // namespace tree_sitter