Merge pull request #22 from maxbrunsfeld/c-compiler-api

Simplify the compiler API
This commit is contained in:
Max Brunsfeld 2016-01-13 21:08:41 -08:00
commit 49f393b75e
67 changed files with 1083 additions and 375 deletions

3
.gitmodules vendored
View file

@ -7,3 +7,6 @@
[submodule "externals/utf8proc"]
path = externals/utf8proc
url = https://github.com/julialang/utf8proc
[submodule "externals/json-parser"]
path = externals/json-parser
url = https://github.com/udp/json-parser.git

211
README.md
View file

@ -2,9 +2,11 @@
[![Build Status](https://travis-ci.org/maxbrunsfeld/tree-sitter.png?branch=master)](https://travis-ci.org/maxbrunsfeld/tree-sitter)
Tree-sitter is an incremental parsing library in C and C++, intended to be used via [bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
languages. It allows documents to be efficiently re-parsed after localized
edits, making it suitable for use in performance-intensive text-editing programs.
Tree-sitter is a C library for incremental parsing, intended to be used via
[bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
languages. It can be used to build a concrete syntax tree for a program and
efficiently update the syntax tree as the program is edited. This makes it suitable
for use in text-editing programs.
Tree-sitter uses a sentential-form incremental [LR parsing](https://en.wikipedia.org/wiki/LR_parser)
algorithm, as described in the paper *[Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.ps.gz)*
@ -15,142 +17,176 @@ This allows it to generate a fast parser for any context-free grammar.
### Installation
```sh
script/configure.sh # Generate a Makefile using gyp
script/configure.sh # Generate a Makefile
make # Build static libraries for the compiler and runtime
```
### Overview
Tree-sitter consists of two libraries. The first library, `libcompiler`, can be
used to generate a parser for a language by supplying a [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) describing the
language. Once the parser has been generated, `libcompiler` is no longer needed.
The second library, `libruntime`, is used in combination with the parsers
generated by `libcompiler`, to generate syntax trees based on text documents, and keep the
syntax trees up-to-date as changes are made to the documents.
### Writing a grammar
Tree-sitter's interface for creating grammars is a C++ library, `libcompiler`.
This allows grammars and rules to be defined, manipulated and
extended as simple values in high-level languages like [javascript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler),
and then converted into tree-sitter's native representation and compiled to C
parsers. These parsers can then be used from any language that has a binding to
tree-sitter's runtime library, `libruntime`.
Tree-sitter's grammars are specified as JSON strings. This format allows them
to be easily created and manipulated in high-level languages like [JavaScript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler).
The structure of a grammar is formally specified by [this JSON schema](./doc/grammar-schema.json).
You can generate a parser for a grammar using the `ts_compile_grammar` function
provided by `libcompiler`.
Here's a simple example that uses `libcompiler` directly:
Here's a simple example of using `ts_compile_grammar` to create a parser for basic
arithmetic expressions. It uses C++11 raw string literals for readability.
```cpp
// arithmetic_grammar.cc
#include <assert.h>
#include <stdio.h>
#include "tree_sitter/compiler.h"
using namespace tree_sitter;
int main() {
auto arithmetic_grammar = Grammar({
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "arithmetic",
// The first rule listed in a grammar becomes the 'start rule'.
{ "expression", choice({
sym("sum"),
sym("product"),
sym("number"),
sym("variable"),
"extras": [
{"type": "PATTERN", "value": "\\s"}
],
// Error recovery is controlled by wrapping rule subtrees with `err`.
seq({
str("("),
err(sym("expression")),
str(")") }) }) },
"rules": {
"expression": {
"type": "CHOICE",
"members": [
{"type": "SYMBOL", "name": "sum"},
{"type": "SYMBOL", "name": "product"},
{"type": "SYMBOL", "name": "number"},
{"type": "SYMBOL", "name": "variable"},
{
"type": "SEQ",
"members": [
{"type": "STRING", "value": "("},
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": ")"}
]
}
]
},
// Tokens like '+' and '*' are described directly within the grammar's rules,
// as opposed to in a separate lexer description.
{ "sum", prec_left(1, seq({
sym("expression"),
str("+"),
sym("expression") })) },
"sum": {
"type": "PREC_LEFT",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "+"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Ambiguities can be resolved at compile time by assigning precedence
// values to rule subtrees.
{ "product", prec_left(2, seq({
sym("expression"),
str("*"),
sym("expression") })) },
"product": {
"type": "PREC_LEFT",
"value": 2,
"content": {
"type": "SEQ",
"members": [
{"type": "SYMBOL", "name": "expression"},
{"type": "STRING", "value": "*"},
{"type": "SYMBOL", "name": "expression"}
]
}
},
// Tokens can be specified using ECMAScript regexps.
{ "number", pattern("\\d+") },
{ "variable", pattern("[a-zA-Z]+\\w*") },
{ "comment", pattern("//.*") },
"number": {"type": "PATTERN", "value": "\\d+"},
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
}
}
)JSON");
}).extra_tokens({
if (result.error_type != TSCompileErrorTypeNone) {
fprintf(stderr, "Compilation failed: %s\n", result.error_message);
return 1;
}
// Things that can appear anywhere in the language are expressed as
// 'extra tokens'.
sym("comment"),
pattern("\\s+")
});
// Generate C code for parsing this language.
auto output = compile(arithmetic_grammar, "arithmetic");
std::string c_code = output.first;
const GrammarError *error = output.second;
assert(!error);
puts(c_code.c_str());
puts(result.code);
return 0;
}
```
To create a parser for this language, compile and run this grammar like this:
To create the parser, compile this file like this:
```sh
clang++ -stdlib=libc++ -std=c++11 \
-I tree-sitter/include -L tree-sitter/out/Debug -l compiler \
arithmetic_grammar.cc -o arithmetic_grammar
clang++ -std=c++11 \
-I tree-sitter/include \
-L tree-sitter/out/Release \
-l compiler \
arithmetic_grammar.cc \
-o arithmetic_grammar
```
Then run the executable to print out the C code for the parser:
```sh
./arithmetic_grammar > arithmetic_parser.c
```
### Using the parser
The `tree_sitter/runtime` C library exposes a DOM-style interface for inspecting
documents.
#### Providing the text to parse
Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
Text input is provided to a tree-sitter parser via a `TSInput` struct, which
contains function pointers for seeking to positions in the text, and for reading
chunks of text. The text can be encoded in either UTF8 or UTF16. This interface
allows you to efficiently parse text that is stored in your own data structure.
#### Querying the syntax tree
The `libruntime` API provides a DOM-style interface for inspecting
syntax trees. Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
expose every node in the concrete syntax tree. This is useful for operations
like syntax-highlighting, that operate on a token-by-token basis. You can also
like syntax-highlighting, which operate on a token-by-token basis. You can also
traverse the tree in a more abstract way by using functions like
`ts_node_named_child(node, index)` and `ts_node_next_named_sibling(node)`. These
functions don't expose nodes that were specified in the grammar as anonymous
tokens, like `(` and `+`. This is useful when analyzing the meaning of a document.
```c
// test_parser.c
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "tree_sitter/runtime.h"
// Declare the language constructor that was generated from your grammar.
// Declare the language function that was generated from your grammar.
TSLanguage *ts_language_arithmetic();
int main() {
TSDocument *document = ts_document_make();
ts_document_set_language(document, ts_language_arithmetic());
// Usually, you would use the more general `ts_document_set_input`, which
// takes a struct with function pointers for seeking to positions in the text,
// and reading chunks of text. This allows you to efficiently parse text
// stored in your own data structure.
ts_document_set_input_string(document, "a + b * 5");
ts_document_parse(document);
TSNode root_node = ts_document_root_node(document);
printf(
"Root name: %s, start: %lu, end: %lu\n",
ts_node_name(root_node, document),
ts_node_start_char(root_node),
ts_node_end_char(root_node)
);
assert(!strcmp(ts_node_name(root_node, document), "expression"));
assert(ts_node_named_child_count(root_node) == 1);
TSNode product_node = ts_node_named_child(ts_node_child(root_node, 0), 1);
printf(
"Child name: %s, start: %lu, end: %lu\n",
ts_node_name(product_node, document),
ts_node_start_char(product_node),
ts_node_end_char(product_node)
);
TSNode sum_node = ts_node_named_child(root_node, 0);
assert(!strcmp(ts_node_name(sum_node, document), "sum"));
assert(ts_node_named_child_count(sum_node) == 2);
TSNode product_node = ts_node_child(ts_node_named_child(sum_node, 1), 0);
assert(!strcmp(ts_node_name(product_node, document), "product"));
assert(ts_node_named_child_count(product_node) == 2);
printf("Syntax tree: %s\n", ts_node_string(root_node, document));
ts_document_free(document);
return 0;
}
@ -159,9 +195,12 @@ int main() {
To demo this parser's capabilities, compile this program like this:
```sh
clang \
-I tree-sitter/include -L tree-sitter/out/Debug -l runtime \
arithmetic_parser.c test_parser.c -o test_parser
clang \
-I tree-sitter/include \
-L tree-sitter/out/Debug \
-l runtime \
arithmetic_parser.c test_parser.c \
-o test_parser
./test_parser
```

256
doc/grammar-schema.json Normal file
View file

@ -0,0 +1,256 @@
{
"type": "object",
"required": [
"name",
"rules"
],
"additionalProperties": false,
"properties": {
"name": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*$"
},
"rules": {
"type": "object",
"patternProperties": {
"^[a-zA-Z_]\\w*$": {
"$ref": "#/definitions/rule"
}
},
"additionalProperties": false
},
"extras": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
},
"conflicts": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*$"
}
}
}
},
"definitions": {
"blank-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^BLANK$"
}
},
"required": ["type"]
},
"string-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^STRING$"
},
"value": {
"type": "string"
}
},
"required": ["type", "value"]
},
"pattern-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^PATTERN$"
},
"value": {"type": "string"}
},
"required": ["type", "value"]
},
"symbol-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SYMBOL$"
},
"name": {"type": "string"}
},
"required": ["type", "name"]
},
"seq-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SEQ$"
},
"members": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
}
},
"required": ["type", "members"]
},
"choice-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^CHOICE$"
},
"members": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
}
},
"required": ["type", "members"]
},
"repeat-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^REPEAT$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"repeat1-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^REPEAT1$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"token-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^TOKEN$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"error-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^ERROR$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"prec-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^PREC$"
},
"value": {
"type": "integer"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content", "value"]
},
"prec-left-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^PREC_LEFT$"
},
"value": {
"type": "integer"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content", "value"]
},
"prec-right-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^PREC_RIGHT$"
},
"value": {
"type": "integer"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content", "value"]
},
"rule": {
"oneOf": [
{ "$ref": "#/definitions/blank-rule" },
{ "$ref": "#/definitions/string-rule" },
{ "$ref": "#/definitions/pattern-rule" },
{ "$ref": "#/definitions/symbol-rule" },
{ "$ref": "#/definitions/seq-rule" },
{ "$ref": "#/definitions/choice-rule" },
{ "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/error-rule" },
{ "$ref": "#/definitions/prec-rule" },
{ "$ref": "#/definitions/prec-left-rule" },
{ "$ref": "#/definitions/prec-right-rule" }
]
}
}
}

1
externals/json-parser vendored Submodule

@ -0,0 +1 @@
Subproject commit 70533215eea575e40a0b91a34ae01a779641d73a

View file

@ -1,65 +1,30 @@
#ifndef TREE_SITTER_COMPILER_H_
#define TREE_SITTER_COMPILER_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef __cplusplus
extern "C" {
#endif
namespace tree_sitter {
typedef enum {
TSCompileErrorTypeNone,
TSCompileErrorTypeInvalidGrammar,
TSCompileErrorTypeInvalidRegex,
TSCompileErrorTypeUndefinedSymbol,
TSCompileErrorTypeInvalidUbiquitousToken,
TSCompileErrorTypeLexConflict,
TSCompileErrorTypeParseConflict,
} TSCompileErrorType;
class Rule;
typedef std::shared_ptr<Rule> rule_ptr;
typedef struct {
const char *code;
const char *error_message;
TSCompileErrorType error_type;
} TSCompileResult;
rule_ptr blank();
rule_ptr choice(const std::vector<rule_ptr> &);
rule_ptr repeat(const rule_ptr &);
rule_ptr repeat1(const rule_ptr &);
rule_ptr seq(const std::vector<rule_ptr> &);
rule_ptr sym(const std::string &);
rule_ptr pattern(const std::string &);
rule_ptr str(const std::string &);
rule_ptr err(const rule_ptr &);
rule_ptr prec(int precedence, const rule_ptr &);
rule_ptr prec_left(const rule_ptr &);
rule_ptr prec_left(int precedence, const rule_ptr &);
rule_ptr prec_right(const rule_ptr &);
rule_ptr prec_right(int precedence, const rule_ptr &);
rule_ptr token(const rule_ptr &rule);
TSCompileResult ts_compile_grammar(const char *input);
class Grammar {
const std::vector<std::pair<std::string, rule_ptr>> rules_;
std::vector<rule_ptr> extra_tokens_;
std::vector<std::vector<std::string>> expected_conflicts_;
public:
explicit Grammar(const std::vector<std::pair<std::string, rule_ptr>> &);
Grammar &extra_tokens(const std::vector<rule_ptr> &);
Grammar &expected_conflicts(const std::vector<std::vector<std::string>> &);
const std::vector<std::pair<std::string, rule_ptr>> &rules() const;
const std::vector<rule_ptr> &extra_tokens() const;
const std::vector<std::vector<std::string>> &expected_conflicts() const;
};
enum GrammarErrorType {
GrammarErrorTypeRegex,
GrammarErrorTypeUndefinedSymbol,
GrammarErrorTypeInvalidUbiquitousToken,
GrammarErrorTypeLexConflict,
GrammarErrorTypeParseConflict,
};
class GrammarError {
public:
GrammarError(GrammarErrorType type, std::string message);
bool operator==(const GrammarError &other) const;
GrammarErrorType type;
std::string message;
};
std::pair<std::string, const GrammarError *> compile(const Grammar &,
std::string);
} // namespace tree_sitter
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_COMPILER_H_

View file

@ -94,17 +94,17 @@ struct TSLanguage {
* Lexer Macros
*/
#define START_LEXER() \
lexer->start_fn(lexer, state); \
int32_t lookahead; \
next_state: \
#define START_LEXER() \
lexer->start_fn(lexer, state); \
int32_t lookahead; \
next_state: \
lookahead = lexer->lookahead;
#define START_TOKEN() lexer->start_token_fn(lexer);
#define GO_TO_STATE(state_value) \
{ \
state = state_value; \
state = state_value; \
goto next_state; \
}

View file

@ -8,6 +8,7 @@
'include',
'src',
'externals/utf8proc',
'externals/json-parser',
],
'sources': [
'src/compiler/build_tables/build_lex_table.cc',
@ -24,8 +25,8 @@
'src/compiler/build_tables/rule_can_be_blank.cc',
'src/compiler/compile.cc',
'src/compiler/generate_code/c_code.cc',
'src/compiler/grammar.cc',
'src/compiler/lex_table.cc',
'src/compiler/parse_grammar.cc',
'src/compiler/parse_table.cc',
'src/compiler/precedence_range.cc',
'src/compiler/prepare_grammar/expand_repeats.cc',
@ -58,6 +59,7 @@
'src/compiler/rules/visitor.cc',
'src/compiler/util/string_helpers.cc',
'externals/utf8proc/utf8proc.c',
'externals/json-parser/json.c',
],
'cflags_cc': [
'-std=c++0x',

View file

@ -1,4 +1,5 @@
#include "compiler/compiler_spec_helper.h"
#include "compiler/compile.h"
#include <fstream>
#include <iostream>
@ -29,10 +30,10 @@ describe("compiling the example grammars", []() {
it(("compiles the " + language + " grammar").c_str(), [&]() {
auto result = compile(grammar, language);
string code = result.first;
const GrammarError *error = result.second;
const CompileError error = result.second;
if (error)
AssertThat(error->message, Equals(""));
if (error.type)
AssertThat(error.message, Equals(""));
ofstream file(example_parser_dir + language + ".c");
file << get<0>(result);

View file

@ -1,32 +1,45 @@
#include "compiler/compiler_spec_helper.h"
#include "tree_sitter/compiler.h"
#include "compiler/compile.h"
using namespace rules;
START_TEST
describe("Compile", []() {
describe("compile_grammar", []() {
describe("when the grammar's start symbol is a token", [&]() {
it("does not fail", [&]() {
Grammar grammar({
{ "rule1", str("the-value") }
});
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "the_grammar",
"rules": {
"rule1": {
"type": "STRING",
"value": "hello"
}
}
}
)JSON");
auto result = compile(grammar, "test_grammar");
const GrammarError *error = result.second;
AssertThat(error, Equals<const GrammarError *>(nullptr));
AssertThat(string(result.error_message), IsEmpty());
AssertThat(string(result.code), !IsEmpty());
});
});
describe("when the grammar's start symbol is blank", [&]() {
it("does not fail", [&]() {
Grammar grammar({
{ "rule1", blank() }
});
TSCompileResult result = ts_compile_grammar(R"JSON(
{
"name": "the_grammar",
"rules": {
"rule1": {
"type": "BLANK"
}
}
}
)JSON");
auto result = compile(grammar, "test_grammar");
const GrammarError *error = result.second;
AssertThat(error, Equals<const GrammarError *>(nullptr));
AssertThat(string(result.error_message), IsEmpty());
AssertThat(string(result.code), !IsEmpty());
});
});
});

View file

@ -5,7 +5,7 @@
#include "compiler/helpers/stream_methods.h"
#include "compiler/helpers/equals_pointer.h"
#include "compiler/helpers/rule_helpers.h"
#include "tree_sitter/compiler.h"
#include "compiler/rules.h"
using namespace tree_sitter;
using namespace std;

View file

@ -12,7 +12,7 @@ ostream &operator<<(ostream &stream, const Grammar &grammar) {
stream << string("#<grammar");
stream << string(" rules: {");
bool started = false;
for (auto pair : grammar.rules()) {
for (auto pair : grammar.rules) {
if (started)
stream << string(", ");
stream << pair.first;
@ -23,11 +23,11 @@ ostream &operator<<(ostream &stream, const Grammar &grammar) {
return stream << string("}>");
}
ostream &operator<<(ostream &stream, const GrammarError *error) {
if (error)
return stream << (string("#<grammar-error '") + error->message + "'>");
ostream &operator<<(ostream &stream, const CompileError &error) {
if (error.type)
return stream << (string("#<compile-error '") + error.message + "'>");
else
return stream << string("#<null>");
return stream << string("#<no-compile-error>");
}
ostream &operator<<(ostream &stream, const Rule &rule) {

View file

@ -7,7 +7,8 @@
#include <map>
#include <unordered_set>
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/grammar.h"
#include "compiler/compile_error.h"
using std::cout;
@ -98,7 +99,7 @@ struct ProductionStep;
struct PrecedenceRange;
ostream &operator<<(ostream &, const Grammar &);
ostream &operator<<(ostream &, const GrammarError &);
ostream &operator<<(ostream &, const CompileError &);
ostream &operator<<(ostream &, const Rule &);
ostream &operator<<(ostream &, const rule_ptr &);
ostream &operator<<(ostream &, const Variable &);

View file

@ -20,7 +20,7 @@ describe("expand_tokens", []() {
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
@ -69,7 +69,7 @@ describe("expand_tokens", []() {
auto result = expand_tokens(grammar);
AssertThat(result.second, Equals((const GrammarError *)nullptr));
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, seq({
i_sym(10),
@ -102,7 +102,7 @@ describe("expand_tokens", []() {
auto result = expand_tokens(grammar);
AssertThat(result.second, EqualsPointer(new GrammarError(GrammarErrorTypeRegex, "unmatched open paren")));
AssertThat(result.second, Equals(CompileError(TSCompileErrorTypeInvalidRegex, "unmatched open paren")));
});
});
});

View file

@ -30,9 +30,9 @@ describe("extract_tokens", []() {
InitialSyntaxGrammar &syntax_grammar = get<0>(result);
LexicalGrammar &lexical_grammar = get<1>(result);
const GrammarError *error = get<2>(result);
CompileError error = get<2>(result);
AssertThat(error, Equals<const GrammarError *>(nullptr));
AssertThat(error, Equals(CompileError::none()));
AssertThat(syntax_grammar.variables, Equals(vector<Variable>({
Variable("rule_A", VariableTypeNamed, repeat1(seq({
@ -150,7 +150,7 @@ describe("extract_tokens", []() {
pattern("\\s+"),
}, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(2));
AssertThat(get<1>(result).separators[0], EqualsPointer(str("y")));
@ -167,7 +167,7 @@ describe("extract_tokens", []() {
str("y"),
}, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<1>(result).separators.size(), Equals<size_t>(0));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({ Symbol(1, true) })));
});
@ -181,7 +181,7 @@ describe("extract_tokens", []() {
i_sym(2),
}, {}});
AssertThat(get<2>(result), Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), Equals(CompileError::none()));
AssertThat(get<0>(result).extra_tokens, Equals(set<Symbol>({
{ Symbol(3, true) },
@ -196,9 +196,9 @@ describe("extract_tokens", []() {
Variable("rule_B", VariableTypeNamed, seq({ str("y"), str("z") })),
}, { i_sym(1) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
AssertThat(get<2>(result), !Equals(CompileError::none()));
AssertThat(get<2>(result), Equals(
CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: rule_B")));
});
@ -208,9 +208,9 @@ describe("extract_tokens", []() {
Variable("rule_B", VariableTypeNamed, str("y")),
}, { choice({ i_sym(1), blank() }) }, {}});
AssertThat(get<2>(result), !Equals<const GrammarError *>(nullptr));
AssertThat(get<2>(result), EqualsPointer(
new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
AssertThat(get<2>(result), !Equals(CompileError::none()));
AssertThat(get<2>(result), Equals(
CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: (choice (sym 1) (blank))")));
});
});

View file

@ -10,15 +10,15 @@ using prepare_grammar::intern_symbols;
describe("intern_symbols", []() {
it("replaces named symbols with numerically-indexed symbols", [&]() {
Grammar grammar({
Grammar grammar{{
{ "x", choice({ sym("y"), sym("_z") }) },
{ "y", sym("_z") },
{ "_z", str("stuff") }
});
}, {}, {}};
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.variables, Equals(vector<Variable>({
Variable("x", VariableTypeNamed, choice({ i_sym(1), i_sym(2) })),
Variable("y", VariableTypeNamed, i_sym(2)),
@ -28,26 +28,28 @@ describe("intern_symbols", []() {
describe("when there are symbols that reference undefined rules", [&]() {
it("returns an error", []() {
Grammar grammar({
Grammar grammar{{
{ "x", sym("y") },
});
}, {}, {}};
auto result = intern_symbols(grammar);
AssertThat(result.second->message, Equals("Undefined rule 'y'"));
AssertThat(result.second.message, Equals("Undefined rule 'y'"));
});
});
it("translates the grammar's optional 'extra_tokens' to numerical symbols", [&]() {
auto grammar = Grammar({
Grammar grammar{{
{ "x", choice({ sym("y"), sym("z") }) },
{ "y", sym("z") },
{ "z", str("stuff") }
}).extra_tokens({ sym("z") });
}, {
sym("z")
}, {}};
auto result = intern_symbols(grammar);
AssertThat(result.second, Equals((GrammarError *)nullptr));
AssertThat(result.second, Equals(CompileError::none()));
AssertThat(result.first.extra_tokens.size(), Equals<size_t>(1));
AssertThat(*result.first.extra_tokens.begin(), EqualsPointer(i_sym(2)));
});

View file

@ -222,8 +222,8 @@ describe("parse_regex", []() {
for (auto &row : invalid_inputs) {
it(("handles invalid regexes with " + row.description).c_str(), [&]() {
auto result = parse_regex(row.pattern);
AssertThat(result.second, !Equals((const GrammarError *)nullptr));
AssertThat(result.second->message, Contains(row.message));
AssertThat(result.second.type, Equals(TSCompileErrorTypeInvalidRegex));
AssertThat(result.second.message, Contains(row.message));
});
}
});

View file

@ -3,14 +3,14 @@
namespace tree_sitter_examples {
extern const Grammar anonymous_tokens = Grammar({
extern const Grammar anonymous_tokens{{
{ "program", choice({
str("\n"),
str("\r"),
pattern("\\d"),
str("\"hello\"") }) },
}).extra_tokens({
}, {
pattern("\\s"),
});
}, {}};
} // namespace tree_sitter_examples

View file

@ -3,7 +3,7 @@
namespace tree_sitter_examples {
extern const Grammar arithmetic = Grammar({
extern const Grammar arithmetic{{
{ "program", sym("_expression") },
{ "_expression", choice({
@ -37,9 +37,9 @@ extern const Grammar arithmetic = Grammar({
pattern("[0-9]") })) })) },
{ "comment", pattern("#.*") },
}).extra_tokens({
}, {
sym("comment"),
pattern("\\s"),
});
}, {}};
} // namespace tree_sitter_examples

View file

@ -5,7 +5,7 @@ namespace tree_sitter_examples {
// http://slps.github.io/zoo/c/iso-9899-tc3.html
extern const Grammar c = Grammar({
extern const Grammar c{{
{ "translation_unit", repeat(choice({
sym("preproc_define"),
sym("preproc_call"),
@ -258,13 +258,13 @@ extern const Grammar c = Grammar({
pattern("[^\\*]"),
pattern("\\*[^/]") })),
str("*/") }) })) },
}).extra_tokens({
}, {
sym("comment"),
pattern("[ \t\r\n]"),
}).expected_conflicts({
}, {
{ "_type_specifier", "_expression" },
{ "_type_specifier", "_expression", "macro_type" },
{ "_type_specifier", "macro_type" },
});
}};
} // namespace tree_sitter_examples

View file

@ -5,7 +5,7 @@ namespace tree_sitter_examples {
// http://slps.github.io/zoo/cpp/iso-n2723.html
extern const Grammar cpp = Grammar({
extern const Grammar cpp{{
{ "translation_unit", repeat(sym("_declaration")) },
{ "_declaration", choice({
@ -211,13 +211,13 @@ extern const Grammar cpp = Grammar({
{ "number", pattern("\\d+(\\.\\d+)?") },
{ "comment", pattern("//[^\n]*") },
}).extra_tokens({
}, {
sym("comment"),
pattern("[ \t\r\n]"),
}).expected_conflicts({
}, {
{ "type_specifier", "_expression" },
{ "template_call", "_expression" },
{ "template_call", "relational_expression" },
});
}};
} // namespace tree_sitter_examples

View file

@ -9,7 +9,7 @@ static rule_ptr terminated(rule_ptr rule) {
str(";") }) });
}
extern const Grammar golang = Grammar({
extern const Grammar golang{{
{ "program", seq({
sym("package_directive"),
repeat(sym("imports_block")),
@ -203,10 +203,10 @@ extern const Grammar golang = Grammar({
{ "comment", pattern("//[^\n]*") },
}).extra_tokens({
}, {
sym("comment"),
sym("_line_break"),
pattern("[ \t\r]"),
});
}, {}};
} // namespace tree_sitter_examples

View file

@ -1,4 +1,4 @@
#include "tree_sitter/compiler.h"
#include "compiler/rules.h"
namespace tree_sitter_examples {

View file

@ -1,7 +1,8 @@
#ifndef TREESITTER_EXAMPLES_HELPERS_
#define TREESITTER_EXAMPLES_HELPERS_
#include "tree_sitter/compiler.h"
#include "compiler/rules.h"
#include "compiler/grammar.h"
namespace tree_sitter_examples {

View file

@ -30,7 +30,7 @@ enum {
PREC_ARGS = 16,
};
extern const Grammar javascript = Grammar({
extern const Grammar javascript{{
{ "program", repeat(sym("_statement")) },
/*
@ -349,13 +349,13 @@ extern const Grammar javascript = Grammar({
str(")"),
sym("statement_block") }) },
}).extra_tokens({
}, {
sym("comment"),
sym("_line_break"),
pattern("[ \t\r]"),
}).expected_conflicts({
}, {
{ "for_in_statement", "_expression" },
{ "method_definition", "_expression" },
});
}};
} // namespace tree_sitter_examples

View file

@ -3,7 +3,7 @@
namespace tree_sitter_examples {
extern const Grammar json = Grammar({
extern const Grammar json{{
{ "_value", choice({
sym("object"),
sym("array"),
@ -22,8 +22,8 @@ extern const Grammar json = Grammar({
{ "null", str("null") },
{ "true", str("true") },
{ "false", str("false") },
}).extra_tokens({
}, {
pattern("\\s"),
});
}, {}};
} // namespace tree_sitter_examples

View file

@ -1,7 +1,6 @@
#ifndef COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_
#define COMPILER_BUILD_TABLES_BUILD_LEX_TABLE_H_
#include "tree_sitter/compiler.h"
#include "compiler/lex_table.h"
namespace tree_sitter {

View file

@ -45,7 +45,7 @@ class ParseTableBuilder {
const LexicalGrammar &lex_grammar)
: grammar(grammar), lexical_grammar(lex_grammar) {}
pair<ParseTable, const GrammarError *> build() {
pair<ParseTable, CompileError> build() {
Symbol start_symbol = Symbol(0, grammar.variables.empty());
Production start_production({
ProductionStep(start_symbol, 0, rules::AssociativityNone),
@ -68,9 +68,9 @@ class ParseTableBuilder {
add_shift_actions(item_set, state_id);
if (!conflicts.empty())
return { parse_table, new GrammarError(GrammarErrorTypeParseConflict,
"Unresolved conflict.\n\n" +
*conflicts.begin()) };
return { parse_table,
CompileError(TSCompileErrorTypeParseConflict,
"Unresolved conflict.\n\n" + *conflicts.begin()) };
}
for (ParseStateId state = 0; state < parse_table.states.size(); state++) {
@ -83,7 +83,7 @@ class ParseTableBuilder {
parse_table.symbols.insert({ rules::ERROR(), {} });
return { parse_table, nullptr };
return { parse_table, CompileError::none() };
}
private:
@ -370,7 +370,7 @@ class ParseTableBuilder {
}
};
pair<ParseTable, const GrammarError *> build_parse_table(
pair<ParseTable, CompileError> build_parse_table(
const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) {
return ParseTableBuilder(grammar, lex_grammar).build();
}

View file

@ -4,7 +4,7 @@
#include <utility>
#include <vector>
#include "compiler/parse_table.h"
#include "tree_sitter/compiler.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
@ -13,8 +13,8 @@ struct LexicalGrammar;
namespace build_tables {
std::pair<ParseTable, const GrammarError *> build_parse_table(
const SyntaxGrammar &, const LexicalGrammar &);
std::pair<ParseTable, CompileError> build_parse_table(const SyntaxGrammar &,
const LexicalGrammar &);
} // namespace build_tables
} // namespace tree_sitter

View file

@ -4,6 +4,7 @@
#include "compiler/build_tables/build_parse_table.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
namespace build_tables {
@ -13,11 +14,11 @@ using std::tuple;
using std::vector;
using std::make_tuple;
tuple<ParseTable, LexTable, const GrammarError *> build_tables(
tuple<ParseTable, LexTable, CompileError> build_tables(
const SyntaxGrammar &grammar, const LexicalGrammar &lex_grammar) {
auto parse_table_result = build_parse_table(grammar, lex_grammar);
ParseTable parse_table = parse_table_result.first;
const GrammarError *error = parse_table_result.second;
const CompileError error = parse_table_result.second;
LexTable lex_table = build_lex_table(&parse_table, lex_grammar);
return make_tuple(parse_table, lex_table, error);
}

View file

@ -4,9 +4,9 @@
#include <string>
#include <tuple>
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/parse_table.h"
#include "compiler/lex_table.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
@ -15,7 +15,7 @@ struct LexicalGrammar;
namespace build_tables {
std::tuple<ParseTable, LexTable, const GrammarError *> build_tables(
std::tuple<ParseTable, LexTable, CompileError> build_tables(
const SyntaxGrammar &, const LexicalGrammar &);
} // namespace build_tables

View file

@ -1,7 +1,7 @@
#ifndef COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#define COMPILER_BUILD_TABLES_DOES_MATCH_ANY_LINE_H_
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace build_tables {

View file

@ -2,7 +2,6 @@
#include <set>
#include <vector>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
namespace tree_sitter {

View file

@ -2,7 +2,6 @@
#define COMPILER_BUILD_TABLES_LEX_CONFLICT_MANAGER_H_
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/lexical_grammar.h"
#include "compiler/rules/symbol.h"

View file

@ -2,7 +2,6 @@
#define COMPILER_BUILD_TABLES_PARSE_CONFLICT_MANAGER_H_
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/syntax_grammar.h"
#include "compiler/build_tables/parse_item.h"

View file

@ -1,7 +1,6 @@
#include "compiler/build_tables/parse_item.h"
#include <string>
#include "compiler/syntax_grammar.h"
#include "tree_sitter/compiler.h"
namespace tree_sitter {
namespace build_tables {

View file

@ -1,5 +1,4 @@
#include "compiler/build_tables/rule_can_be_blank.h"
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"

View file

@ -1,7 +1,7 @@
#ifndef COMPILER_BUILD_TABLES_RULE_CAN_BE_BLANK_H_
#define COMPILER_BUILD_TABLES_RULE_CAN_BE_BLANK_H_
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace build_tables {

View file

@ -4,6 +4,8 @@
#include "compiler/generate_code/c_code.h"
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/parse_grammar.h"
#include "json.h"
namespace tree_sitter {
@ -13,14 +15,44 @@ using std::vector;
using std::get;
using std::make_tuple;
pair<string, const GrammarError *> compile(const Grammar &grammar,
std::string name) {
extern "C" TSCompileResult ts_compile_grammar(const char *input) {
ParseGrammarResult parse_result = parse_grammar(string(input));
if (!parse_result.error_message.empty()) {
return { "", strdup(parse_result.error_message.c_str()),
TSCompileErrorTypeInvalidGrammar };
}
auto prepare_grammar_result =
prepare_grammar::prepare_grammar(parse_result.grammar);
const SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result);
const LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result);
CompileError error = get<2>(prepare_grammar_result);
if (error.type) {
return { "", strdup(error.message.c_str()), error.type };
}
auto table_build_result =
build_tables::build_tables(syntax_grammar, lexical_grammar);
const ParseTable &parse_table = get<0>(table_build_result);
const LexTable &lex_table = get<1>(table_build_result);
error = get<2>(table_build_result);
if (error.type) {
return { "", strdup(error.message.c_str()), error.type };
}
string code = generate_code::c_code(parse_result.name, parse_table, lex_table,
syntax_grammar, lexical_grammar);
return { strdup(code.c_str()), "", TSCompileErrorTypeNone };
}
pair<string, const CompileError> compile(const Grammar &grammar,
std::string name) {
auto prepare_grammar_result = prepare_grammar::prepare_grammar(grammar);
const SyntaxGrammar &syntax_grammar = get<0>(prepare_grammar_result);
const LexicalGrammar &lexical_grammar = get<1>(prepare_grammar_result);
const GrammarError *error = get<2>(prepare_grammar_result);
if (error)
CompileError error = get<2>(prepare_grammar_result);
if (error.type)
return { "", error };
auto table_build_result =
@ -28,14 +60,13 @@ pair<string, const GrammarError *> compile(const Grammar &grammar,
const ParseTable &parse_table = get<0>(table_build_result);
const LexTable &lex_table = get<1>(table_build_result);
error = get<2>(table_build_result);
if (error)
if (error.type)
return { "", error };
string code = generate_code::c_code(name, parse_table, lex_table,
syntax_grammar, lexical_grammar);
return { code, nullptr };
return { code, CompileError::none() };
}
} // namespace tree_sitter

16
src/compiler/compile.h Normal file
View file

@ -0,0 +1,16 @@
#ifndef COMPILER_COMPILE_H_
#define COMPILER_COMPILE_H_

#include <string>
#include <utility>

#include "compiler/compile_error.h"

namespace tree_sitter {

struct Grammar;

// Compiles `grammar` into generated C parser source for the language named
// by the second argument. On success the returned string holds the generated
// code and the error is CompileError::none(); on failure the string is empty
// and the error's `type`/`message` describe the problem.
std::pair<std::string, CompileError> compile(const Grammar &, std::string);

} // namespace tree_sitter

#endif // COMPILER_COMPILE_H_

View file

@ -0,0 +1,28 @@
#ifndef COMPILER_COMPILE_ERROR_H_
#define COMPILER_COMPILE_ERROR_H_

#include <string>

#include "tree_sitter/compiler.h"

namespace tree_sitter {

// Value type describing the outcome of a compilation step. Replaces the old
// heap-allocated `const GrammarError *`, removing the need for callers to
// manage its lifetime. Callers test `error.type` to detect failure.
class CompileError {
 public:
  CompileError(TSCompileErrorType type, std::string message)
      : type(type), message(message) {}

  // The "no error" sentinel: type is TSCompileErrorTypeNone and the
  // message is empty.
  static CompileError none() {
    return CompileError(TSCompileErrorTypeNone, "");
  }

  bool operator==(const CompileError &other) const {
    return type == other.type && message == other.message;
  }

  TSCompileErrorType type;
  std::string message;
};

} // namespace tree_sitter

#endif // COMPILER_COMPILE_ERROR_H_

View file

@ -188,7 +188,9 @@ class CCodeGenerator {
}
void add_lex_function() {
line("static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) {");
line(
"static TSTree *ts_lex(TSLexer *lexer, TSStateId state, bool error_mode) "
"{");
indent([&]() {
line("START_LEXER();");
_switch("state", [&]() {

View file

@ -1,44 +0,0 @@
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
using std::ostream;
using std::pair;
using std::string;
using std::vector;
Grammar::Grammar(const vector<pair<string, rule_ptr>> &rules)
: rules_(rules), extra_tokens_({}) {}
const vector<pair<string, rule_ptr>> &Grammar::rules() const {
return rules_;
}
const vector<rule_ptr> &Grammar::extra_tokens() const {
return extra_tokens_;
}
const vector<vector<string>> &Grammar::expected_conflicts() const {
return expected_conflicts_;
}
Grammar &Grammar::extra_tokens(const vector<rule_ptr> &extra_tokens) {
extra_tokens_ = extra_tokens;
return *this;
}
Grammar &Grammar::expected_conflicts(
const vector<vector<string>> &expected_conflicts) {
expected_conflicts_ = expected_conflicts;
return *this;
}
GrammarError::GrammarError(GrammarErrorType type, string message)
: type(type), message(message) {}
bool GrammarError::operator==(const GrammarError &other) const {
return type == other.type && message == other.message;
}
} // namespace tree_sitter

19
src/compiler/grammar.h Normal file
View file

@ -0,0 +1,19 @@
#ifndef COMPILER_GRAMMAR_H_
#define COMPILER_GRAMMAR_H_

#include <vector>
#include <string>
#include <utility>

#include "compiler/rule.h"

namespace tree_sitter {

// Plain-data input grammar, built by parse_grammar() from JSON (or
// constructed directly) and consumed by prepare_grammar(). Replaces the
// previous Grammar class that exposed these fields via accessor methods.
struct Grammar {
  // Named rules in definition order; symbol indices produced during
  // interning refer to positions in this vector.
  std::vector<std::pair<std::string, rule_ptr>> rules;

  // Tokens allowed to appear anywhere between other tokens
  // (e.g. whitespace).
  std::vector<rule_ptr> extra_tokens;

  // Groups of rule names whose parse conflicts are expected and should not
  // be reported as errors.
  std::vector<std::vector<std::string>> expected_conflicts;
};

} // namespace tree_sitter

#endif // COMPILER_GRAMMAR_H_

View file

@ -4,7 +4,7 @@
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
#include "compiler/variable.h"
namespace tree_sitter {

View file

@ -0,0 +1,326 @@
#include "compiler/parse_grammar.h"
#include <string>
#include <vector>
#include <utility>
#include "json.h"
#include "compiler/rule.h"
#include "compiler/rules.h"
namespace tree_sitter {
using std::string;
using std::vector;
using std::pair;
// Result of parse_rule(). On success `rule` is non-null and `error_message`
// is empty; on failure `rule` is null and `error_message` describes the
// problem (callers test `result.rule.get()`).
struct ParseRuleResult {
  rule_ptr rule;
  string error_message;
};
ParseRuleResult parse_rule(json_value *rule_json) {
string error_message;
json_value rule_type_json;
string type;
if (!rule_json) {
error_message = "Rule cannot be null";
goto error;
}
if (rule_json->type != json_object) {
error_message = "Rule type must be an object";
goto error;
}
rule_type_json = rule_json->operator[]("type");
if (rule_type_json.type != json_string) {
error_message = "Rule type must be a string";
goto error;
}
type = rule_type_json.u.string.ptr;
if (type == "BLANK") {
return { blank(), "" };
}
if (type == "CHOICE") {
json_value members_json = rule_json->operator[]("members");
if (members_json.type != json_array) {
error_message = "Choice members must be an array";
goto error;
}
vector<rule_ptr> members;
for (size_t i = 0, length = members_json.u.array.length; i < length; i++) {
json_value *member_json = members_json.u.array.values[i];
ParseRuleResult member = parse_rule(member_json);
if (member.rule.get()) {
members.push_back(member.rule);
} else {
error_message = "Invalid choice member: " + member.error_message;
goto error;
}
}
return { choice(members), "" };
}
if (type == "SEQ") {
json_value members_json = rule_json->operator[]("members");
if (members_json.type != json_array) {
error_message = "Seq members must be an array";
goto error;
}
vector<rule_ptr> members;
for (size_t i = 0, length = members_json.u.array.length; i < length; i++) {
json_value *member_json = members_json.u.array.values[i];
ParseRuleResult member = parse_rule(member_json);
if (member.rule.get()) {
members.push_back(member.rule);
} else {
error_message = "Invalid seq member: " + member.error_message;
goto error;
}
}
return { seq(members), "" };
}
if (type == "ERROR") {
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (content.rule.get()) {
return { err(content.rule), "" };
} else {
error_message = "Invalid error content: " + content.error_message;
goto error;
}
}
if (type == "REPEAT") {
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (content.rule.get()) {
return { repeat(content.rule), "" };
} else {
error_message = "Invalid repeat content: " + content.error_message;
goto error;
}
}
if (type == "REPEAT1") {
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (content.rule.get()) {
return { repeat1(content.rule), "" };
} else {
error_message = "Invalid repeat1 content: " + content.error_message;
goto error;
}
}
if (type == "TOKEN") {
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (content.rule.get()) {
return { token(content.rule), "" };
} else {
error_message = "Invalid token content: " + content.error_message;
goto error;
}
}
if (type == "PATTERN") {
json_value value_json = rule_json->operator[]("value");
if (value_json.type == json_string) {
return { pattern(value_json.u.string.ptr), "" };
} else {
error_message = "Pattern value must be a string";
goto error;
}
}
if (type == "STRING") {
json_value value_json = rule_json->operator[]("value");
if (value_json.type == json_string) {
return { str(value_json.u.string.ptr), "" };
} else {
error_message = "String rule value must be a string";
goto error;
}
}
if (type == "SYMBOL") {
json_value value_json = rule_json->operator[]("name");
if (value_json.type == json_string) {
return { sym(value_json.u.string.ptr), "" };
} else {
error_message = "Symbol value must be a string";
goto error;
}
}
if (type == "PREC") {
json_value precedence_json = rule_json->operator[]("value");
if (precedence_json.type != json_integer) {
error_message = "Precedence value must be an integer";
goto error;
}
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (!content.rule.get()) {
error_message = "Invalid precedence content: " + content.error_message;
goto error;
}
return { prec(precedence_json.u.integer, content.rule), "" };
}
if (type == "PREC_LEFT") {
json_value precedence_json = rule_json->operator[]("value");
if (precedence_json.type != json_integer) {
error_message = "Precedence value must be an integer";
goto error;
}
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (!content.rule.get()) {
error_message = "Invalid precedence content: " + content.error_message;
goto error;
}
return { prec_left(precedence_json.u.integer, content.rule), "" };
}
if (type == "PREC_RIGHT") {
json_value precedence_json = rule_json->operator[]("value");
if (precedence_json.type != json_integer) {
error_message = "Precedence value must be an integer";
goto error;
}
json_value content_json = rule_json->operator[]("content");
ParseRuleResult content = parse_rule(&content_json);
if (!content.rule.get()) {
error_message = "Invalid precedence content: " + content.error_message;
goto error;
}
return { prec_right(precedence_json.u.integer, content.rule), "" };
}
error_message = "Unknown rule type " + type;
error:
return { rule_ptr(), error_message };
}
// Parses a complete JSON grammar document into a ParseGrammarResult.
// On success `error_message` is empty and `name`/`grammar` are populated;
// on failure only `error_message` is meaningful.
//
// Fix: the json-parser tree (`grammar_json`) was previously freed only on
// the error path, leaking it on every successful parse. It is now freed on
// both paths. This is safe because every piece of JSON string data is
// copied into std::strings / rules before the success return.
ParseGrammarResult parse_grammar(const string &input) {
  string error_message;
  string name;
  Grammar grammar;

  // These are declared before the first `goto error` so that no jump
  // crosses a variable's initialization.
  json_value name_json, rules_json, extras_json, conflicts_json;
  json_settings settings = { 0, 0, 0, 0, 0, 0 };
  char parse_error[json_error_max];
  json_value *grammar_json =
    json_parse_ex(&settings, input.c_str(), input.size(), parse_error);

  if (!grammar_json) {
    error_message = string("Invalid JSON at ") + parse_error;
    goto error;
  }

  if (grammar_json->type != json_object) {
    error_message = "Body must be an object";
    goto error;
  }

  // "name" is required and names the generated language.
  name_json = grammar_json->operator[]("name");
  if (name_json.type != json_string) {
    error_message = "Name must be a string";
    goto error;
  }
  name = name_json.u.string.ptr;

  // "rules" is required: an object mapping rule names to rule bodies.
  rules_json = grammar_json->operator[]("rules");
  if (rules_json.type != json_object) {
    error_message = "Rules must be an object";
    goto error;
  }
  for (size_t i = 0, length = rules_json.u.object.length; i < length; i++) {
    json_object_entry entry_json = rules_json.u.object.values[i];
    ParseRuleResult entry = parse_rule(entry_json.value);
    if (!entry.rule.get()) {
      error_message =
        string("Invalid rule '") + entry_json.name + "' " + entry.error_message;
      goto error;
    }
    grammar.rules.push_back({ string(entry_json.name), entry.rule });
  }

  // "extras" is optional; when present it must be an array of token rules.
  extras_json = grammar_json->operator[]("extras");
  if (extras_json.type != json_none) {
    if (extras_json.type != json_array) {
      error_message = "Extras must be an array";
      goto error;
    }
    for (size_t i = 0, length = extras_json.u.array.length; i < length; i++) {
      json_value *extra_json = extras_json.u.array.values[i];
      ParseRuleResult extra = parse_rule(extra_json);
      if (!extra.rule.get()) {
        error_message = string("Invalid extra token: ") + extra.error_message;
        goto error;
      }
      grammar.extra_tokens.push_back(extra.rule);
    }
  }

  // "conflicts" is optional; when present it must be an array of arrays of
  // rule-name strings.
  conflicts_json = grammar_json->operator[]("conflicts");
  if (conflicts_json.type != json_none) {
    if (conflicts_json.type != json_array) {
      error_message = "Conflicts must be an array";
      goto error;
    }
    for (size_t i = 0, length = conflicts_json.u.array.length; i < length; i++) {
      json_value *conflict_json = conflicts_json.u.array.values[i];
      if (conflict_json->type != json_array) {
        error_message = "Each conflict entry must be an array";
        goto error;
      }
      vector<string> conflict;
      for (size_t j = 0, conflict_length = conflict_json->u.array.length;
           j < conflict_length; j++) {
        json_value *conflict_entry_json = conflict_json->u.array.values[j];
        if (conflict_entry_json->type != json_string) {
          error_message = "Each conflict entry must be an array of strings";
          goto error;
        }
        conflict.push_back(string(conflict_entry_json->u.string.ptr));
      }
      grammar.expected_conflicts.push_back(conflict);
    }
  }

  // Success: release the JSON tree (previously leaked on this path).
  json_value_free(grammar_json);
  return { name, grammar, "" };

error:
  if (grammar_json) {
    json_value_free(grammar_json);
  }
  return { "", Grammar{}, error_message };
}
} // namespace tree_sitter

View file

@ -0,0 +1,20 @@
#ifndef COMPILER_GRAMMAR_JSON_H_
#define COMPILER_GRAMMAR_JSON_H_

#include <string>

#include "tree_sitter/compiler.h"
#include "compiler/grammar.h"

namespace tree_sitter {

// Result of parse_grammar(): on success `error_message` is empty and
// `name`/`grammar` hold the parsed language name and rules; on failure only
// `error_message` is meaningful.
struct ParseGrammarResult {
  std::string name;
  Grammar grammar;
  std::string error_message;
};

// Parses a JSON grammar document into a Grammar plus its language name.
// Errors are reported through `error_message`, not exceptions.
// NOTE(review): the include guard says GRAMMAR_JSON but the file is
// parse_grammar.h — consider renaming the guard for consistency.
ParseGrammarResult parse_grammar(const std::string &);

} // namespace tree_sitter

#endif // COMPILER_GRAMMAR_JSON_H_

View file

@ -3,6 +3,7 @@
#include <string>
#include <utility>
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/rule.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/seq.h"
#include "compiler/rules/symbol.h"

View file

@ -1,7 +1,7 @@
#ifndef COMPILER_PREPARE_GRAMMAR_EXPAND_REPEATS_H_
#define COMPILER_PREPARE_GRAMMAR_EXPAND_REPEATS_H_
#include "tree_sitter/compiler.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
namespace tree_sitter {
namespace prepare_grammar {

View file

@ -53,36 +53,35 @@ class ExpandTokens : public rules::IdentityRuleFn {
rule_ptr apply_to(const Pattern *rule) {
auto pair = parse_regex(rule->value);
if (!error)
if (!error.type)
error = pair.second;
return pair.first;
}
public:
const GrammarError *error;
ExpandTokens() : error(nullptr) {}
CompileError error;
ExpandTokens() : error(CompileError::none()) {}
};
pair<LexicalGrammar, const GrammarError *> expand_tokens(
const LexicalGrammar &grammar) {
pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &grammar) {
LexicalGrammar result;
ExpandTokens expander;
for (const Variable &variable : grammar.variables) {
auto rule = expander.apply(variable.rule);
if (expander.error)
if (expander.error.type)
return { result, expander.error };
result.variables.push_back(Variable(variable.name, variable.type, rule));
}
for (auto &sep : grammar.separators) {
auto rule = expander.apply(sep);
if (expander.error)
if (expander.error.type)
return { result, expander.error };
result.separators.push_back(rule);
}
return { result, nullptr };
return { result, CompileError::none() };
}
} // namespace prepare_grammar

View file

@ -2,7 +2,7 @@
#define COMPILER_PREPARE_GRAMMAR_EXPAND_TOKENS_H_
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
@ -10,8 +10,7 @@ struct LexicalGrammar;
namespace prepare_grammar {
std::pair<LexicalGrammar, const GrammarError *> expand_tokens(
const LexicalGrammar &);
std::pair<LexicalGrammar, CompileError> expand_tokens(const LexicalGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -2,7 +2,7 @@
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_CHOICES_H_
#include <vector>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace prepare_grammar {

View file

@ -90,12 +90,12 @@ class TokenExtractor : public rules::IdentityRuleFn {
vector<Variable> tokens;
};
static const GrammarError *ubiq_token_err(const string &message) {
return new GrammarError(GrammarErrorTypeInvalidUbiquitousToken,
"Not a token: " + message);
static CompileError ubiq_token_err(const string &message) {
return CompileError(TSCompileErrorTypeInvalidUbiquitousToken,
"Not a token: " + message);
}
tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens(
tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
const InternedGrammar &grammar) {
InitialSyntaxGrammar syntax_grammar;
LexicalGrammar lexical_grammar;
@ -186,7 +186,7 @@ tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *> extract_tokens
syntax_grammar.extra_tokens.insert(new_symbol);
}
return make_tuple(syntax_grammar, lexical_grammar, nullptr);
return make_tuple(syntax_grammar, lexical_grammar, CompileError::none());
}
} // namespace prepare_grammar

View file

@ -2,7 +2,7 @@
#define COMPILER_PREPARE_GRAMMAR_EXTRACT_TOKENS_H_
#include <tuple>
#include "tree_sitter/compiler.h"
#include "compiler/compile_error.h"
#include "compiler/lexical_grammar.h"
#include "compiler/prepare_grammar/initial_syntax_grammar.h"
#include "compiler/prepare_grammar/interned_grammar.h"
@ -10,8 +10,8 @@
namespace tree_sitter {
namespace prepare_grammar {
std::tuple<InitialSyntaxGrammar, LexicalGrammar, const GrammarError *>
extract_tokens(const InternedGrammar &);
std::tuple<InitialSyntaxGrammar, LexicalGrammar, CompileError> extract_tokens(
const InternedGrammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -3,6 +3,7 @@
#include <vector>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/grammar.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/named_symbol.h"
@ -31,8 +32,8 @@ class InternSymbols : public rules::IdentityRuleFn {
public:
std::shared_ptr<rules::Symbol> symbol_for_rule_name(string rule_name) {
for (size_t i = 0; i < grammar.rules().size(); i++)
if (grammar.rules()[i].first == rule_name)
for (size_t i = 0; i < grammar.rules.size(); i++)
if (grammar.rules[i].first == rule_name)
return make_shared<rules::Symbol>(i);
return nullptr;
}
@ -42,16 +43,16 @@ class InternSymbols : public rules::IdentityRuleFn {
string missing_rule_name;
};
const GrammarError *missing_rule_error(string rule_name) {
return new GrammarError(GrammarErrorTypeUndefinedSymbol,
"Undefined rule '" + rule_name + "'");
CompileError missing_rule_error(string rule_name) {
return CompileError(TSCompileErrorTypeUndefinedSymbol,
"Undefined rule '" + rule_name + "'");
}
pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &grammar) {
pair<InternedGrammar, CompileError> intern_symbols(const Grammar &grammar) {
InternedGrammar result;
InternSymbols interner(grammar);
for (auto &pair : grammar.rules()) {
for (auto &pair : grammar.rules) {
auto new_rule = interner.apply(pair.second);
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
@ -61,14 +62,14 @@ pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &gramma
new_rule));
}
for (auto &rule : grammar.extra_tokens()) {
for (auto &rule : grammar.extra_tokens) {
auto new_rule = interner.apply(rule);
if (!interner.missing_rule_name.empty())
return { result, missing_rule_error(interner.missing_rule_name) };
result.extra_tokens.push_back(new_rule);
}
for (auto &names : grammar.expected_conflicts()) {
for (auto &names : grammar.expected_conflicts) {
set<rules::Symbol> entry;
for (auto &name : names) {
auto symbol = interner.symbol_for_rule_name(name);
@ -78,7 +79,7 @@ pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &gramma
result.expected_conflicts.insert(entry);
}
return { result, nullptr };
return { result, CompileError::none() };
}
} // namespace prepare_grammar

View file

@ -3,13 +3,16 @@
#include <utility>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/compile_error.h"
#include "compiler/prepare_grammar/interned_grammar.h"
namespace tree_sitter {
struct Grammar;
namespace prepare_grammar {
std::pair<InternedGrammar, const GrammarError *> intern_symbols(const Grammar &);
std::pair<InternedGrammar, CompileError> intern_symbols(const Grammar &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -1,7 +1,7 @@
#ifndef COMPILER_PREPARE_GRAMMAR_IS_TOKEN_H_
#define COMPILER_PREPARE_GRAMMAR_IS_TOKEN_H_
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace prepare_grammar {

View file

@ -32,7 +32,7 @@ class PatternParser {
next();
}
pair<rule_ptr, const GrammarError *> rule(bool nested) {
pair<rule_ptr, CompileError> rule(bool nested) {
vector<rule_ptr> choices = {};
do {
if (!choices.empty()) {
@ -42,17 +42,17 @@ class PatternParser {
break;
}
auto pair = term(nested);
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
choices.push_back(pair.first);
} while (has_more_input());
auto rule =
(choices.size() > 1) ? make_shared<Choice>(choices) : choices.front();
return { rule, nullptr };
return { rule, CompileError::none() };
}
private:
pair<rule_ptr, const GrammarError *> term(bool nested) {
pair<rule_ptr, CompileError> term(bool nested) {
rule_ptr result = Blank::build();
do {
if (peek() == '|')
@ -60,16 +60,16 @@ class PatternParser {
if (nested && peek() == ')')
break;
auto pair = factor();
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
result = Seq::build({ result, pair.first });
} while (has_more_input());
return { result, nullptr };
return { result, CompileError::none() };
}
pair<rule_ptr, const GrammarError *> factor() {
pair<rule_ptr, CompileError> factor() {
auto pair = atom();
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
rule_ptr result = pair.first;
if (has_more_input()) {
@ -88,30 +88,30 @@ class PatternParser {
break;
}
}
return { result, nullptr };
return { result, CompileError::none() };
}
pair<rule_ptr, const GrammarError *> atom() {
pair<rule_ptr, CompileError> atom() {
switch (peek()) {
case '(': {
next();
auto pair = rule(true);
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
if (peek() != ')')
return error("unmatched open paren");
next();
return { pair.first, nullptr };
return { pair.first, CompileError::none() };
}
case '[': {
next();
auto pair = char_set();
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
if (peek() != ']')
return error("unmatched open square bracket");
next();
return { pair.first.copy(), nullptr };
return { pair.first.copy(), CompileError::none() };
}
case ')': {
return error("unmatched close paren");
@ -121,18 +121,19 @@ class PatternParser {
}
case '.': {
next();
return { CharacterSet().include_all().exclude('\n').copy(), nullptr };
return { CharacterSet().include_all().exclude('\n').copy(),
CompileError::none() };
}
default: {
auto pair = single_char();
if (pair.second)
if (pair.second.type)
return { Blank::build(), pair.second };
return { pair.first.copy(), nullptr };
return { pair.first.copy(), CompileError::none() };
}
}
}
pair<CharacterSet, const GrammarError *> char_set() {
pair<CharacterSet, CompileError> char_set() {
CharacterSet result;
bool is_affirmative = true;
if (peek() == '^') {
@ -143,7 +144,7 @@ class PatternParser {
while (has_more_input() && (peek() != ']')) {
auto pair = single_char();
if (pair.second)
if (pair.second.type)
return { CharacterSet(), pair.second };
if (is_affirmative)
result.add_set(pair.first);
@ -151,10 +152,10 @@ class PatternParser {
result.remove_set(pair.first);
}
return { result, nullptr };
return { result, CompileError::none() };
}
pair<CharacterSet, const GrammarError *> single_char() {
pair<CharacterSet, CompileError> single_char() {
CharacterSet value;
switch (peek()) {
case '\\':
@ -173,7 +174,7 @@ class PatternParser {
value = CharacterSet().include(first_char);
}
}
return { value, nullptr };
return { value, CompileError::none() };
}
CharacterSet escaped_char(uint32_t value) {
@ -217,8 +218,8 @@ class PatternParser {
return lookahead && iter <= end;
}
pair<rule_ptr, const GrammarError *> error(string msg) {
return { Blank::build(), new GrammarError(GrammarErrorTypeRegex, msg) };
pair<rule_ptr, CompileError> error(string msg) {
return { Blank::build(), CompileError(TSCompileErrorTypeInvalidRegex, msg) };
}
string input;
@ -227,7 +228,7 @@ class PatternParser {
int32_t lookahead;
};
pair<rule_ptr, const GrammarError *> parse_regex(const std::string &input) {
pair<rule_ptr, CompileError> parse_regex(const std::string &input) {
return PatternParser(input.c_str()).rule(false);
}

View file

@ -3,12 +3,13 @@
#include <string>
#include <utility>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
namespace prepare_grammar {
std::pair<rule_ptr, const GrammarError *> parse_regex(const std::string &);
std::pair<rule_ptr, CompileError> parse_regex(const std::string &);
} // namespace prepare_grammar
} // namespace tree_sitter

View file

@ -16,14 +16,14 @@ using std::tuple;
using std::get;
using std::make_tuple;
tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
tuple<SyntaxGrammar, LexicalGrammar, CompileError> prepare_grammar(
const Grammar &input_grammar) {
/*
* Convert all string-based `NamedSymbols` into numerical `Symbols`
*/
auto intern_result = intern_symbols(input_grammar);
const GrammarError *error = intern_result.second;
if (error)
CompileError error = intern_result.second;
if (error.type)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
/*
@ -31,7 +31,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
*/
auto extract_result = extract_tokens(intern_result.first);
error = get<2>(extract_result);
if (error)
if (error.type)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
/*
@ -45,7 +45,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
auto expand_tokens_result = expand_tokens(get<1>(extract_result));
LexicalGrammar lex_grammar = expand_tokens_result.first;
error = expand_tokens_result.second;
if (error)
if (error.type)
return make_tuple(SyntaxGrammar(), LexicalGrammar(), error);
/*
@ -58,7 +58,7 @@ tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
*/
lex_grammar = normalize_rules(lex_grammar);
return make_tuple(syntax_grammar, lex_grammar, nullptr);
return make_tuple(syntax_grammar, lex_grammar, CompileError::none());
}
} // namespace prepare_grammar

View file

@ -4,15 +4,15 @@
#include <tuple>
#include "compiler/syntax_grammar.h"
#include "compiler/lexical_grammar.h"
#include "compiler/compile_error.h"
namespace tree_sitter {
class Grammar;
class GrammarError;
struct Grammar;
namespace prepare_grammar {
std::tuple<SyntaxGrammar, LexicalGrammar, const GrammarError *> prepare_grammar(
std::tuple<SyntaxGrammar, LexicalGrammar, CompileError> prepare_grammar(
const Grammar &);
} // namespace prepare_grammar

View file

@ -1,5 +1,4 @@
#include "compiler/prepare_grammar/token_description.h"
#include "tree_sitter/compiler.h"
#include "compiler/rules/visitor.h"
#include "compiler/rules/pattern.h"
#include "compiler/rules/seq.h"

View file

@ -2,7 +2,7 @@
#define COMPILER_PREPARE_GRAMMAR_TOKEN_DESCRIPTION_H_
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {
namespace prepare_grammar {

29
src/compiler/rules.h Normal file
View file

@ -0,0 +1,29 @@
#ifndef COMPILER_RULES_H_
#define COMPILER_RULES_H_

#include <string>
#include <vector>
#include <memory>

#include "compiler/rule.h"

namespace tree_sitter {

// Free-function constructors for grammar rules. These are the building
// blocks used by parse_grammar() when converting JSON rule objects into
// rule_ptrs.

rule_ptr blank();
rule_ptr choice(const std::vector<rule_ptr> &);
rule_ptr repeat(const rule_ptr &);
rule_ptr repeat1(const rule_ptr &);
rule_ptr seq(const std::vector<rule_ptr> &);
rule_ptr sym(const std::string &);      // reference to a named rule
rule_ptr pattern(const std::string &);  // regex-based token
rule_ptr str(const std::string &);      // literal string token
rule_ptr err(const rule_ptr &);
rule_ptr prec(int precedence, const rule_ptr &);
rule_ptr prec_left(const rule_ptr &);
rule_ptr prec_left(int precedence, const rule_ptr &);
rule_ptr prec_right(const rule_ptr &);
rule_ptr prec_right(int precedence, const rule_ptr &);
rule_ptr token(const rule_ptr &rule);

} // namespace tree_sitter

#endif // COMPILER_RULES_H_

View file

@ -3,7 +3,6 @@
#include <string>
#include <map>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
namespace tree_sitter {

View file

@ -2,8 +2,8 @@
#include <map>
#include <set>
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
#include "compiler/rules.h"
#include "compiler/rules/blank.h"
#include "compiler/rules/named_symbol.h"
#include "compiler/rules/choice.h"

View file

@ -4,7 +4,6 @@
#include <vector>
#include <string>
#include <set>
#include "tree_sitter/compiler.h"
#include "compiler/rules/symbol.h"
#include "compiler/rules/metadata.h"
#include "compiler/variable.h"

View file

@ -2,7 +2,7 @@
#define COMPILER_VARIABLE_H_
#include <string>
#include "tree_sitter/compiler.h"
#include "compiler/rule.h"
#include "compiler/rules/symbol.h"
namespace tree_sitter {