diff --git a/README.md b/README.md index 96b26a09..aa3c54f1 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,11 @@ [![Build Status](https://travis-ci.org/maxbrunsfeld/tree-sitter.png?branch=master)](https://travis-ci.org/maxbrunsfeld/tree-sitter) -Tree-sitter is an incremental parsing library in C and C++, intended to be used via [bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level -languages. It allows documents to be efficiently re-parsed after localized -edits, making it suitable for use in performance-intensive text-editing programs. +Tree-sitter is a C library for incremental parsing, intended to be used via +[bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level +languages. It can be used to build a concrete syntax tree for a program and +efficiently update the syntax tree as the program is edited. This makes it suitable +for use in text-editing programs. Tree-sitter uses a sentential-form incremental [LR parsing](https://en.wikipedia.org/wiki/LR_parser) algorithm, as described in the paper *[Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.ps.gz)* @@ -15,142 +17,176 @@ This allows it to generate a fast parser for any context-free grammar. ### Installation ```sh -script/configure.sh # Generate a Makefile using gyp +script/configure.sh # Generate a Makefile make # Build static libraries for the compiler and runtime ``` +### Overview + +Tree-sitter consists of two libraries. The first library, `libcompiler`, can be +used to generate a parser for a language by supplying a [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) describing the +language. Once the parser has been generated, `libcompiler` is no longer needed. 
+ +The second library, `libruntime`, is used in combination with the parsers +generated by `libcompiler`, to generate syntax trees based on text documents, and keep the +syntax trees up-to-date as changes are made to the documents. + + ### Writing a grammar -Tree-sitter's interface for creating grammars is a C++ library, `libcompiler`. -This allows grammars and rules to be defined, manipulated and -extended as simple values in high-level languages like [javascript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler), -and then converted into tree-sitter's native representation and compiled to C -parsers. These parsers can then be used from any language that has a binding to -tree-sitter's runtime library, `libruntime`. +Tree-sitter's grammars are specified as JSON strings. This format allows them +to be easily created and manipulated in high-level languages like [JavaScript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler). +The structure of a grammar is formally specified by [this JSON schema](./doc/grammar-schema.json). +You can generate a parser for a grammar using the `ts_compile_grammar` function +provided by `libcompiler`. -Here's a simple example that uses `libcompiler` directly: +Here's a simple example of using `ts_compile_grammar` to create a parser for basic +arithmetic expressions. It uses C++11 raw string literals for readability. ```cpp // arithmetic_grammar.cc -#include <assert.h> #include <stdio.h> #include "tree_sitter/compiler.h" -using namespace tree_sitter; - int main() { - auto arithmetic_grammar = Grammar({ + TSCompileResult result = ts_compile_grammar(R"JSON( + { + "name": "arithmetic", - // The first rule listed in a grammar becomes the 'start rule'. - { "expression", choice({ - sym("sum"), - sym("product"), - sym("number"), - sym("variable"), + "extras": [ + {"type": "PATTERN", "value": "\\s"} + ], - // Error recovery is controlled by wrapping rule subtrees with `err`. 
- seq({ - str("("), - err(sym("expression")), - str(")") }) }) }, + "rules": { + "expression": { + "type": "CHOICE", + "members": [ + {"type": "SYMBOL", "name": "sum"}, + {"type": "SYMBOL", "name": "product"}, + {"type": "SYMBOL", "name": "number"}, + {"type": "SYMBOL", "name": "variable"}, + { + "type": "SEQ", + "members": [ + {"type": "STRING", "value": "("}, + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": ")"} + ] + } + ] + }, - // Tokens like '+' and '*' are described directly within the grammar's rules, - // as opposed to in a seperate lexer description. - { "sum", prec_left(1, seq({ - sym("expression"), - str("+"), - sym("expression") })) }, + "sum": { + "type": "PREC_LEFT", + "value": 1, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "+"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, - // Ambiguities can be resolved at compile time by assigning precedence - // values to rule subtrees. - { "product", prec_left(2, seq({ - sym("expression"), - str("*"), - sym("expression") })) }, + "product": { + "type": "PREC_LEFT", + "value": 2, + "content": { + "type": "SEQ", + "members": [ + {"type": "SYMBOL", "name": "expression"}, + {"type": "STRING", "value": "*"}, + {"type": "SYMBOL", "name": "expression"} + ] + } + }, - // Tokens can be specified using ECMAScript regexps. - { "number", pattern("\\d+") }, - { "variable", pattern("[a-zA-Z]+\\w*") }, - { "comment", pattern("//.*") }, + "number": {"type": "PATTERN", "value": "\\d+"}, + "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"} + } + } + )JSON"); - }).extra_tokens({ + if (result.error_type != TSCompileErrorTypeNone) { + fprintf(stderr, "Compilation failed: %s\n", result.error_message); + return 1; + } - // Things that can appear anywhere in the language are expressed as - // 'extra tokens'. - sym("comment"), - pattern("\\s+") - }); - - // Generate C code for parsing this language. 
- auto output = compile(arithmetic_grammar, "arithmetic"); - std::string c_code = output.first; - const GrammarError *error = output.second; - - assert(!error); - puts(c_code.c_str()); + puts(result.code); return 0; } ``` -To create a parser for this language, compile and run this grammar like this: +To create the parser, compile this file like this: ```sh -clang++ -stdlib=libc++ -std=c++11 \ - -I tree-sitter/include -L tree-sitter/out/Debug -l compiler \ - arithmetic_grammar.cc -o arithmetic_grammar +clang++ -std=c++11 \ + -I tree-sitter/include \ + -L tree-sitter/out/Release \ + -l compiler \ + arithmetic_grammar.cc \ + -o arithmetic_grammar +``` +Then run the executable to print out the C code for the parser: + +```sh ./arithmetic_grammar > arithmetic_parser.c ``` ### Using the parser -The `tree_sitter/runtime` C library exposes a DOM-style interface for inspecting -documents. +#### Providing the text to parse -Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)` +Text input is provided to a tree-sitter parser via a `TSInput` struct, which +contains function pointers for seeking to positions in the text, and for reading +chunks of text. The text can be encoded in either UTF-8 or UTF-16. This interface +allows you to efficiently parse text that is stored in your own data structure. + +#### Querying the syntax tree + +The `libruntime` API provides a DOM-style interface for inspecting +syntax trees. Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)` expose every node in the concrete syntax tree. This is useful for operations -like syntax-highlighting, that operate on a token-by-token basis. You can also +like syntax-highlighting, which operate on a token-by-token basis. You can also traverse the tree in a more abstract way by using functions like `ts_node_named_child(node, index)` and `ts_node_next_named_sibling(node)`. 
These functions don't expose nodes that were specified in the grammar as anonymous tokens, like `(` and `+`. This is useful when analyzing the meaning of a document. ```c +// test_parser.c + +#include <assert.h> +#include <string.h> #include <stdio.h> #include "tree_sitter/runtime.h" -// Declare the language constructor that was generated from your grammar. +// Declare the language function that was generated from your grammar. TSLanguage *ts_language_arithmetic(); int main() { TSDocument *document = ts_document_make(); ts_document_set_language(document, ts_language_arithmetic()); - - // Usually, you would use the more general `ts_document_set_input`, which - // takes a struct with function pointers for seeking to positions in the text, - // and reading chunks of text. This allows you to efficiently parse text - // stored in your own data structure. ts_document_set_input_string(document, "a + b * 5"); ts_document_parse(document); TSNode root_node = ts_document_root_node(document); - printf( - "Root name: %s, start: %lu, end: %lu\n", - ts_node_name(root_node, document), - ts_node_start_char(root_node), - ts_node_end_char(root_node) - ); + assert(!strcmp(ts_node_name(root_node, document), "expression")); + assert(ts_node_named_child_count(root_node) == 1); - TSNode product_node = ts_node_named_child(ts_node_child(root_node, 0), 1); - printf( - "Child name: %s, start: %lu, end: %lu\n", - ts_node_name(product_node, document), - ts_node_start_char(product_node), - ts_node_end_char(product_node) - ); + TSNode sum_node = ts_node_named_child(root_node, 0); + assert(!strcmp(ts_node_name(sum_node, document), "sum")); + assert(ts_node_named_child_count(sum_node) == 2); + TSNode product_node = ts_node_child(ts_node_named_child(sum_node, 1), 0); + assert(!strcmp(ts_node_name(product_node, document), "product")); + assert(ts_node_named_child_count(product_node) == 2); + + printf("Syntax tree: %s\n", ts_node_string(root_node, document)); ts_document_free(document); return 0; } @@ -159,9 +195,12 @@ int main() { 
To demo this parser's capabilities, compile this program like this: ```sh -clang \ - -I tree-sitter/include -L tree-sitter/out/Debug -l runtime \ - arithmetic_parser.c test_parser.c -o test_parser +clang \ + -I tree-sitter/include \ + -L tree-sitter/out/Debug \ + -l runtime \ + arithmetic_parser.c test_parser.c \ + -o test_parser ./test_parser ```