Update README to reflect new compiler API
This commit is contained in:
parent
96dd5c820e
commit
7121689cfe
1 changed file with 125 additions and 86 deletions
211
README.md
211
README.md
|
|
@ -2,9 +2,11 @@
|
|||
|
||||
[Build Status](https://travis-ci.org/maxbrunsfeld/tree-sitter)
|
||||
|
||||
Tree-sitter is an incremental parsing library in C and C++, intended to be used via [bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
|
||||
languages. It allows documents to be efficiently re-parsed after localized
|
||||
edits, making it suitable for use in performance-intensive text-editing programs.
|
||||
Tree-sitter is a C library for incremental parsing, intended to be used via
|
||||
[bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
|
||||
languages. It can be used to build a concrete syntax tree for a program and
|
||||
efficiently update the syntax tree as the program is edited. This makes it suitable
|
||||
for use in text-editing programs.
|
||||
|
||||
Tree-sitter uses a sentential-form incremental [LR parsing](https://en.wikipedia.org/wiki/LR_parser)
|
||||
algorithm, as described in the paper *[Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.ps.gz)*
|
||||
|
|
@ -15,142 +17,176 @@ This allows it to generate a fast parser for any context-free grammar.
|
|||
### Installation
|
||||
|
||||
```sh
|
||||
script/configure.sh # Generate a Makefile using gyp
|
||||
script/configure.sh # Generate a Makefile
|
||||
make # Build static libraries for the compiler and runtime
|
||||
```
|
||||
|
||||
### Overview
|
||||
|
||||
Tree-sitter consists of two libraries. The first library, `libcompiler`, can be
|
||||
used to generate a parser for a language by supplying a [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) describing the
|
||||
language. Once the parser has been generated, `libcompiler` is no longer needed.
|
||||
|
||||
The second library, `libruntime`, is used in combination with the parsers
|
||||
generated by `libcompiler` to produce syntax trees from text documents, and to keep the
|
||||
syntax trees up-to-date as changes are made to the documents.
|
||||
|
||||
|
||||
### Writing a grammar
|
||||
|
||||
Tree-sitter's interface for creating grammars is a C++ library, `libcompiler`.
|
||||
This allows grammars and rules to be defined, manipulated and
|
||||
extended as simple values in high-level languages like [javascript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler),
|
||||
and then converted into tree-sitter's native representation and compiled to C
|
||||
parsers. These parsers can then be used from any language that has a binding to
|
||||
tree-sitter's runtime library, `libruntime`.
|
||||
Tree-sitter's grammars are specified as JSON strings. This format allows them
|
||||
to be easily created and manipulated in high-level languages like [JavaScript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler).
|
||||
The structure of a grammar is formally specified by [this JSON schema](./doc/grammar-schema.json).
|
||||
You can generate a parser for a grammar using the `ts_compile_grammar` function
|
||||
provided by `libcompiler`.
|
||||
|
||||
Here's a simple example that uses `libcompiler` directly:
|
||||
Here's a simple example of using `ts_compile_grammar` to create a parser for basic
|
||||
arithmetic expressions. It uses a C++11 raw string literal so that the JSON can span multiple lines.
|
||||
|
||||
```cpp
|
||||
// arithmetic_grammar.cc
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "tree_sitter/compiler.h"
|
||||
|
||||
using namespace tree_sitter;
|
||||
|
||||
int main() {
|
||||
auto arithmetic_grammar = Grammar({
|
||||
TSCompileResult result = ts_compile_grammar(R"JSON(
|
||||
{
|
||||
"name": "arithmetic",
|
||||
|
||||
// The first rule listed in a grammar becomes the 'start rule'.
|
||||
{ "expression", choice({
|
||||
sym("sum"),
|
||||
sym("product"),
|
||||
sym("number"),
|
||||
sym("variable"),
|
||||
"extras": [
|
||||
{"type": "PATTERN", "value": "\\s"}
|
||||
],
|
||||
|
||||
// Error recovery is controlled by wrapping rule subtrees with `err`.
|
||||
seq({
|
||||
str("("),
|
||||
err(sym("expression")),
|
||||
str(")") }) }) },
|
||||
"rules": {
|
||||
"expression": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "sum"},
|
||||
{"type": "SYMBOL", "name": "product"},
|
||||
{"type": "SYMBOL", "name": "number"},
|
||||
{"type": "SYMBOL", "name": "variable"},
|
||||
{
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "STRING", "value": "("},
|
||||
{"type": "SYMBOL", "name": "expression"},
|
||||
{"type": "STRING", "value": ")"}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
// Tokens like '+' and '*' are described directly within the grammar's rules,
|
||||
// as opposed to in a separate lexer description.
|
||||
{ "sum", prec_left(1, seq({
|
||||
sym("expression"),
|
||||
str("+"),
|
||||
sym("expression") })) },
|
||||
"sum": {
|
||||
"type": "PREC_LEFT",
|
||||
"value": 1,
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "expression"},
|
||||
{"type": "STRING", "value": "+"},
|
||||
{"type": "SYMBOL", "name": "expression"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Ambiguities can be resolved at compile time by assigning precedence
|
||||
// values to rule subtrees.
|
||||
{ "product", prec_left(2, seq({
|
||||
sym("expression"),
|
||||
str("*"),
|
||||
sym("expression") })) },
|
||||
"product": {
|
||||
"type": "PREC_LEFT",
|
||||
"value": 2,
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{"type": "SYMBOL", "name": "expression"},
|
||||
{"type": "STRING", "value": "*"},
|
||||
{"type": "SYMBOL", "name": "expression"}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Tokens can be specified using ECMAScript regexps.
|
||||
{ "number", pattern("\\d+") },
|
||||
{ "variable", pattern("[a-zA-Z]+\\w*") },
|
||||
{ "comment", pattern("//.*") },
|
||||
"number": {"type": "PATTERN", "value": "\\d+"},
|
||||
"variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
|
||||
}
|
||||
}
|
||||
)JSON");
|
||||
|
||||
}).extra_tokens({
|
||||
if (result.error_type != TSCompileErrorTypeNone) {
|
||||
fprintf(stderr, "Compilation failed: %s\n", result.error_message);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Things that can appear anywhere in the language are expressed as
|
||||
// 'extra tokens'.
|
||||
sym("comment"),
|
||||
pattern("\\s+")
|
||||
});
|
||||
|
||||
// Generate C code for parsing this language.
|
||||
auto output = compile(arithmetic_grammar, "arithmetic");
|
||||
std::string c_code = output.first;
|
||||
const GrammarError *error = output.second;
|
||||
|
||||
assert(!error);
|
||||
puts(c_code.c_str());
|
||||
puts(result.code);
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
To create a parser for this language, compile and run this grammar like this:
|
||||
To create the parser, compile this file like this:
|
||||
|
||||
```sh
|
||||
clang++ -stdlib=libc++ -std=c++11 \
|
||||
-I tree-sitter/include -L tree-sitter/out/Debug -l compiler \
|
||||
arithmetic_grammar.cc -o arithmetic_grammar
|
||||
clang++ -std=c++11 \
|
||||
-I tree-sitter/include \
|
||||
-L tree-sitter/out/Release \
|
||||
-l compiler \
|
||||
arithmetic_grammar.cc \
|
||||
-o arithmetic_grammar
|
||||
```
|
||||
|
||||
Then run the executable to print out the C code for the parser:
|
||||
|
||||
```sh
|
||||
./arithmetic_grammar > arithmetic_parser.c
|
||||
```
|
||||
|
||||
### Using the parser
|
||||
|
||||
The `tree_sitter/runtime` C library exposes a DOM-style interface for inspecting
|
||||
documents.
|
||||
#### Providing the text to parse
|
||||
|
||||
Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
|
||||
Text input is provided to a tree-sitter parser via a `TSInput` struct, which
|
||||
contains function pointers for seeking to positions in the text, and for reading
|
||||
chunks of text. The text can be encoded in either UTF-8 or UTF-16. This interface
|
||||
allows you to efficiently parse text that is stored in your own data structure.
|
||||
|
||||
#### Querying the syntax tree
|
||||
|
||||
The `libruntime` API provides a DOM-style interface for inspecting
|
||||
syntax trees. Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
|
||||
expose every node in the concrete syntax tree. This is useful for operations
|
||||
like syntax-highlighting, that operate on a token-by-token basis. You can also
|
||||
like syntax-highlighting, which operate on a token-by-token basis. You can also
|
||||
traverse the tree in a more abstract way by using functions like
|
||||
`ts_node_named_child(node, index)` and `ts_node_next_named_sibling(node)`. These
|
||||
functions don't expose nodes that were specified in the grammar as anonymous
|
||||
tokens, like `(` and `+`. This is useful when analyzing the meaning of a document.
|
||||
|
||||
```c
|
||||
// test_parser.c
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "tree_sitter/runtime.h"
|
||||
|
||||
// Declare the language constructor that was generated from your grammar.
|
||||
// Declare the language function that was generated from your grammar.
|
||||
TSLanguage *ts_language_arithmetic();
|
||||
|
||||
int main() {
|
||||
TSDocument *document = ts_document_make();
|
||||
ts_document_set_language(document, ts_language_arithmetic());
|
||||
|
||||
// Usually, you would use the more general `ts_document_set_input`, which
|
||||
// takes a struct with function pointers for seeking to positions in the text,
|
||||
// and reading chunks of text. This allows you to efficiently parse text
|
||||
// stored in your own data structure.
|
||||
ts_document_set_input_string(document, "a + b * 5");
|
||||
ts_document_parse(document);
|
||||
|
||||
TSNode root_node = ts_document_root_node(document);
|
||||
printf(
|
||||
"Root name: %s, start: %lu, end: %lu\n",
|
||||
ts_node_name(root_node, document),
|
||||
ts_node_start_char(root_node),
|
||||
ts_node_end_char(root_node)
|
||||
);
|
||||
assert(!strcmp(ts_node_name(root_node, document), "expression"));
|
||||
assert(ts_node_named_child_count(root_node) == 1);
|
||||
|
||||
TSNode product_node = ts_node_named_child(ts_node_child(root_node, 0), 1);
|
||||
printf(
|
||||
"Child name: %s, start: %lu, end: %lu\n",
|
||||
ts_node_name(product_node, document),
|
||||
ts_node_start_char(product_node),
|
||||
ts_node_end_char(product_node)
|
||||
);
|
||||
TSNode sum_node = ts_node_named_child(root_node, 0);
|
||||
assert(!strcmp(ts_node_name(sum_node, document), "sum"));
|
||||
assert(ts_node_named_child_count(sum_node) == 2);
|
||||
|
||||
TSNode product_node = ts_node_child(ts_node_named_child(sum_node, 1), 0);
|
||||
assert(!strcmp(ts_node_name(product_node, document), "product"));
|
||||
assert(ts_node_named_child_count(product_node) == 2);
|
||||
|
||||
printf("Syntax tree: %s\n", ts_node_string(root_node, document));
|
||||
ts_document_free(document);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -159,9 +195,12 @@ int main() {
|
|||
To demo this parser's capabilities, compile this program like this:
|
||||
|
||||
```sh
|
||||
clang \
|
||||
-I tree-sitter/include -L tree-sitter/out/Debug -l runtime \
|
||||
arithmetic_parser.c test_parser.c -o test_parser
|
||||
clang \
|
||||
-I tree-sitter/include \
|
||||
-L tree-sitter/out/Debug \
|
||||
-l runtime \
|
||||
arithmetic_parser.c test_parser.c \
|
||||
-o test_parser
|
||||
|
||||
./test_parser
|
||||
```
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue