Update README to reflect new compiler API

2016-01-13 20:52:58 -08:00 · 2016-01-13 20:52:58 -08:00 · 7121689cfe
commit 7121689cfe
parent 96dd5c820e
1 changed files with 125 additions and 86 deletions
--- a/README.md
+++ b/README.md
@@ -2,9 +2,11 @@

 [![Build Status](https://travis-ci.org/maxbrunsfeld/tree-sitter.png?branch=master)](https://travis-ci.org/maxbrunsfeld/tree-sitter)

-Tree-sitter is an incremental parsing library in C and C++, intended to be used via [bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
-languages. It allows documents to be efficiently re-parsed after localized
-edits, making it suitable for use in performance-intensive text-editing programs.
+Tree-sitter is a C library for incremental parsing, intended to be used via
+[bindings](https://github.com/maxbrunsfeld/node-tree-sitter) to higher-level
+languages. It can be used to build a concrete syntax tree for a program and
+efficiently update the syntax tree as the program is edited. This makes it suitable
+for use in text-editing programs.

 Tree-sitter uses a sentential-form incremental [LR parsing](https://en.wikipedia.org/wiki/LR_parser)
 algorithm, as described in the paper *[Efficient and Flexible Incremental Parsing](http://harmonia.cs.berkeley.edu/papers/twagner-parsing.ps.gz)*
@@ -15,142 +17,176 @@ This allows it to generate a fast parser for any context-free grammar.
 ### Installation

 ```sh
-script/configure.sh  # Generate a Makefile using gyp
+script/configure.sh  # Generate a Makefile
 make                 # Build static libraries for the compiler and runtime
 ```

+### Overview
+
+Tree-sitter consists of two libraries. The first library, `libcompiler`, can be
+used to generate a parser for a language by supplying a [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) describing the
+language. Once the parser has been generated, `libcompiler` is no longer needed.
+
+The second library, `libruntime`, is used in combination with the parsers
+generated by `libcompiler` to produce syntax trees from text documents and to keep
+those syntax trees up to date as changes are made to the documents.
+
+
 ### Writing a grammar

-Tree-sitter's interface for creating grammars is a C++ library, `libcompiler`.
-This allows grammars and rules to be defined, manipulated and
-extended as simple values in high-level languages like [javascript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler),
-and then converted into tree-sitter's native representation and compiled to C
-parsers. These parsers can then be used from any language that has a binding to
-tree-sitter's runtime library, `libruntime`.
+Tree-sitter's grammars are specified as JSON strings. This format allows them
+to be easily created and manipulated in high-level languages like [JavaScript](https://github.com/maxbrunsfeld/node-tree-sitter-compiler).
+The structure of a grammar is formally specified by [this JSON schema](./doc/grammar-schema.json).
+You can generate a parser for a grammar using the `ts_compile_grammar` function
+provided by `libcompiler`.

-Here's a simple example that uses `libcompiler` directly:
+Here's a simple example of using `ts_compile_grammar` to create a parser for basic
+arithmetic expressions. It uses a C++11 raw string literal so that the JSON grammar can span multiple lines without escaping its quotes.

 ```cpp
 // arithmetic_grammar.cc

-#include <assert.h>
 #include <stdio.h>
 #include "tree_sitter/compiler.h"

-using namespace tree_sitter;
-
 int main() {
-  auto arithmetic_grammar = Grammar({
+  TSCompileResult result = ts_compile_grammar(R"JSON(
+    {
+      "name": "arithmetic",

-    // The first rule listed in a grammar becomes the 'start rule'.
-    { "expression", choice({
-      sym("sum"),
-      sym("product"),
-      sym("number"),
-      sym("variable"),
+      "extras": [
+        {"type": "PATTERN", "value": "\\s"}
+      ],

-      // Error recovery is controlled by wrapping rule subtrees with `err`.
-      seq({
-        str("("),
-        err(sym("expression")),
-        str(")") }) }) },
+      "rules": {
+        "expression": {
+          "type": "CHOICE",
+          "members": [
+            {"type": "SYMBOL", "name": "sum"},
+            {"type": "SYMBOL", "name": "product"},
+            {"type": "SYMBOL", "name": "number"},
+            {"type": "SYMBOL", "name": "variable"},
+            {
+              "type": "SEQ",
+              "members": [
+                {"type": "STRING", "value": "("},
+                {"type": "SYMBOL", "name": "expression"},
+                {"type": "STRING", "value": ")"}
+              ]
+            }
+          ]
+        },

-    // Tokens like '+' and '*' are described directly within the grammar's rules,
-    // as opposed to in a seperate lexer description.
-    { "sum", prec_left(1, seq({
-      sym("expression"),
-      str("+"),
-      sym("expression") })) },
+        "sum": {
+          "type": "PREC_LEFT",
+          "value": 1,
+          "content": {
+            "type": "SEQ",
+            "members": [
+              {"type": "SYMBOL", "name": "expression"},
+              {"type": "STRING", "value": "+"},
+              {"type": "SYMBOL", "name": "expression"}
+            ]
+          }
+        },

-    // Ambiguities can be resolved at compile time by assigning precedence
-    // values to rule subtrees.
-    { "product", prec_left(2, seq({
-      sym("expression"),
-      str("*"),
-      sym("expression") })) },
+        "product": {
+          "type": "PREC_LEFT",
+          "value": 2,
+          "content": {
+            "type": "SEQ",
+            "members": [
+              {"type": "SYMBOL", "name": "expression"},
+              {"type": "STRING", "value": "*"},
+              {"type": "SYMBOL", "name": "expression"}
+            ]
+          }
+        },

-    // Tokens can be specified using ECMAScript regexps.
-    { "number", pattern("\\d+") },
-    { "variable", pattern("[a-zA-Z]+\\w*") },
-    { "comment", pattern("//.*") },
+        "number": {"type": "PATTERN", "value": "\\d+"},
+        "variable": {"type": "PATTERN", "value": "[a-zA-Z]\\w*"}
+      }
+    }
+  )JSON");

-  }).extra_tokens({
+  if (result.error_type != TSCompileErrorTypeNone) {
+    fprintf(stderr, "Compilation failed: %s\n", result.error_message);
+    return 1;
+  }

-    // Things that can appear anywhere in the language are expressed as
-    // 'extra tokens'.
-    sym("comment"),
-    pattern("\\s+")
-  });
-
-  // Generate C code for parsing this language.
-  auto output = compile(arithmetic_grammar, "arithmetic");
-  std::string c_code = output.first;
-  const GrammarError *error = output.second;
-
-  assert(!error);
-  puts(c_code.c_str());
+  puts(result.code);

  return 0;
 }
 ```

-To create a parser for this language, compile and run this grammar like this:
+To create the parser, compile this file like this:

 ```sh
-clang++ -stdlib=libc++ -std=c++11                             \
-  -I tree-sitter/include -L tree-sitter/out/Debug -l compiler \
-  arithmetic_grammar.cc -o arithmetic_grammar
+clang++ -std=c++11 \
+  -I tree-sitter/include \
+  -L tree-sitter/out/Release \
+  -l compiler \
+  arithmetic_grammar.cc \
+  -o arithmetic_grammar
+```

+Then run the executable to print out the C code for the parser:
+
+```sh
 ./arithmetic_grammar > arithmetic_parser.c
 ```

 ### Using the parser

-The `tree_sitter/runtime` C library exposes a DOM-style interface for inspecting
-documents.
+#### Providing the text to parse

-Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
+Text input is provided to a tree-sitter parser via a `TSInput` struct, which
+contains function pointers for seeking to positions in the text, and for reading
+chunks of text. The text can be encoded in either UTF-8 or UTF-16. This interface
+allows you to efficiently parse text that is stored in your own data structure.
+
+#### Querying the syntax tree
+
+The `libruntime` API provides a DOM-style interface for inspecting
+syntax trees. Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)`
 expose every node in the concrete syntax tree. This is useful for operations
-like syntax-highlighting, that operate on a token-by-token basis. You can also
+like syntax-highlighting, which operate on a token-by-token basis. You can also
 traverse the tree in a more abstract way by using functions like
 `ts_node_named_child(node, index)` and `ts_node_next_named_sibling(node)`. These
 functions don't expose nodes that were specified in the grammar as anonymous
 tokens, like `(` and `+`. This is useful when analyzing the meaning of a document.

 ```c
+// test_parser.c
+
+#include <assert.h>
+#include <string.h>
 #include <stdio.h>
 #include "tree_sitter/runtime.h"

-// Declare the language constructor that was generated from your grammar.
+// Declare the language function that was generated from your grammar.
 TSLanguage *ts_language_arithmetic();

 int main() {
  TSDocument *document = ts_document_make();
  ts_document_set_language(document, ts_language_arithmetic());
-
-  // Usually, you would use the more general `ts_document_set_input`, which
-  // takes a struct with function pointers for seeking to positions in the text,
-  // and reading chunks of text. This allows you to efficiently parse text
-  // stored in your own data structure.
  ts_document_set_input_string(document, "a + b * 5");
  ts_document_parse(document);

  TSNode root_node = ts_document_root_node(document);
-  printf(
-    "Root name: %s, start: %lu, end: %lu\n",
-    ts_node_name(root_node, document),
-    ts_node_start_char(root_node),
-    ts_node_end_char(root_node)
-  );
+  assert(!strcmp(ts_node_name(root_node, document), "expression"));
+  assert(ts_node_named_child_count(root_node) == 1);

-  TSNode product_node = ts_node_named_child(ts_node_child(root_node, 0), 1);
-  printf(
-    "Child name: %s, start: %lu, end: %lu\n",
-    ts_node_name(product_node, document),
-    ts_node_start_char(product_node),
-    ts_node_end_char(product_node)
-  );
+  TSNode sum_node = ts_node_named_child(root_node, 0);
+  assert(!strcmp(ts_node_name(sum_node, document), "sum"));
+  assert(ts_node_named_child_count(sum_node) == 2);

+  TSNode product_node = ts_node_child(ts_node_named_child(sum_node, 1), 0);
+  assert(!strcmp(ts_node_name(product_node, document), "product"));
+  assert(ts_node_named_child_count(product_node) == 2);
+
+  printf("Syntax tree: %s\n", ts_node_string(root_node, document));
  ts_document_free(document);
  return 0;
 }
@@ -159,9 +195,12 @@ int main() {
 To demo this parser's capabilities, compile this program like this:

 ```sh
-clang                                                        \
-  -I tree-sitter/include -L tree-sitter/out/Debug -l runtime \
-  arithmetic_parser.c test_parser.c -o test_parser
+clang \
+  -I tree-sitter/include \
+  -L tree-sitter/out/Debug \
+  -l runtime \
+  arithmetic_parser.c test_parser.c \
+  -o test_parser

 ./test_parser
 ```