Merge branch 'master' into pike-doc-extras-default

This commit is contained in:
Max Brunsfeld 2018-06-13 09:42:22 -07:00
commit 0895ca237d
89 changed files with 5484 additions and 4095 deletions

View file

@ -1 +1,2 @@
markdown: kramdown
theme: jekyll-theme-cayman

133
docs/_layouts/default.html Normal file
View file

@ -0,0 +1,133 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Zoom is intentionally left enabled (no maximum-scale / user-scalable=no). -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <!-- The build revision is appended as a query string so a new deploy busts cached CSS. -->
  <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}" media="screen">
  <!-- Separate the site name from the page title; omit the separator when a page has no title. -->
  <title>Tree-sitter{% if page.title %} | {{ page.title }}{% endif %}</title>
</head>
<body class="sidebar-hidden">
<script>
  // Runs inline, before the rest of the body is parsed: if the stored
  // 'sidebar-active' flag is 'true', drop the default "sidebar-hidden" class.
  var sidebarWasActive = localStorage.getItem('sidebar-active') === 'true';
  if (sidebarWasActive) {
    document.body.classList.remove('sidebar-hidden');
  }
</script>
<div id="container">
  <div id="sidebar">
    <nav id="table-of-contents" aria-label="Documentation">
      <a class="logo table-of-contents-section" href="https://github.com/tree-sitter/tree-sitter">
        <img src="{{ '/assets/images/tree-sitter-small.png' | relative_url }}" width="200" height="200" alt="Tree-sitter on GitHub">
      </a>
      <!-- One entry per site page; the current page's entry also carries its heading outline. -->
      <ul>
        {% for other_page in site.html_pages %}
          {% if page.title == other_page.title %}
            <li class="table-of-contents-section active">
              <a class="nav-link" href="{{ other_page.url | relative_url }}">
                {{ other_page.title }}
              </a>
              <div id="current-page-table-of-contents">
                {% comment %}
                  Build a Markdown bullet list from the headings found in the rendered
                  page content, then convert it to HTML with markdownify. Everything in
                  the outer capture is discarded; only my_toc is emitted.
                {% endcomment %}
                {% capture whitespace %}
                  {% assign min_header = 2 %}
                  {% assign max_header = 6 %}
                  {% assign nodes = content | split: "<h" %}
                  {% assign first_header = true %}
                  {% for node in nodes %}
                    {% if node == "" %}
                      {% continue %}
                    {% endif %}
                    {% comment %} First character after "<h", coerced to a number (non-digits become 0). {% endcomment %}
                    {% assign header_level = node | replace: '"', '' | slice: 0, 1 | times: 1 %}
                    {% if header_level < min_header or header_level > max_header %}
                      {% continue %}
                    {% endif %}
                    {% comment %} The first accepted heading defines the shallowest level of the outline. {% endcomment %}
                    {% if first_header %}
                      {% assign first_header = false %}
                      {% assign min_header = header_level %}
                    {% endif %}
                    {% assign indent_level = header_level | minus: min_header | add: 1 %}
                    {% assign header_content = node | split: '</h' %}
                    {% assign header_content = header_content[0] %}
                    {% assign html_id = header_content | split: 'id="' %}
                    {% assign html_id = html_id[1] | split: '"' %}
                    {% assign html_id = html_id[0] %}
                    {% capture header_attrs_to_strip %}{{ header_content | split: '>' | first }}>{% endcapture %}
                    {% assign header = header_content | replace: header_attrs_to_strip, '' %}
                    {% assign space = '' %}
                    {% for i in (1..indent_level) %}
                      {% assign space = space | prepend: ' ' %}
                    {% endfor %}
                    {% comment %} The list item must start at column 0: its indentation comes only from "space". {% endcomment %}
                    {% capture my_toc %}{{ my_toc }}
{{ space }}- [{{ header }}](#{{ html_id }}){: .nav-link}{% endcapture %}
                  {% endfor %}
                {% endcapture %}
                {{ my_toc | strip | markdownify | strip }}
              </div>
            </li>
          {% else %}
            <li class="table-of-contents-section">
              <a class="nav-link" href="{{ other_page.url | relative_url }}">
                {{ other_page.title }}
              </a>
            </li>
          {% endif %}
        {% endfor %}
      </ul>
    </nav>
  </div>
  <!-- Click handling for this control is bound by id in the page script. -->
  <a id="sidebar-toggle-link" href="#" role="button" aria-label="Toggle sidebar">&#9776;</a>
  <main id="main-content">
    {{ content }}
  </main>
</div>
<!-- Scripts live at the end of <body>; markup after </html> is invalid. -->
<script
  src="https://code.jquery.com/jquery-3.3.1.min.js"
  crossorigin="anonymous">
</script>
<script
  src="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.0/js/bootstrap.bundle.min.js">
</script>
<script>
  // Toggle the sidebar and persist the resulting state. The stored flag is
  // derived from the class actually on <body>, so the two cannot drift apart.
  $('#sidebar-toggle-link').click(function(e) {
    e.preventDefault();
    var $body = $(document.body);
    $body.toggleClass('sidebar-hidden');
    localStorage.setItem(
      'sidebar-active',
      $body.hasClass('sidebar-hidden') ? 'false' : 'true'
    );
  });

  // Only enable scrollspy when the page is more than 500px taller than the window.
  if (document.body.scrollHeight > window.innerHeight + 500) {
    $(document.body).scrollspy({
      target: '#current-page-table-of-contents',
      offset: 40
    });
  }

  // Any horizontal scroll hides the sidebar and records it as inactive.
  $(document).scroll(function() {
    if ($(document).scrollLeft() > 0) {
      localStorage.setItem('sidebar-active', 'false');
      $(document.body).addClass('sidebar-hidden');
    }
  });

  // Turn every heading that has an id into a self-link. The anchor is built
  // with .attr()/.text() so the id and heading text are never parsed as HTML.
  $('h1, h2, h3, h4, h5, h6').filter('[id]').each(function() {
    var $heading = $(this);
    var $anchor = $('<a>')
      .attr('href', '#' + $heading.attr('id'))
      .text($heading.text());
    $heading.empty().append($anchor);
  });
</script>
</body>
</html>

View file

@ -1,74 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Zoom is intentionally left enabled (no maximum-scale / user-scalable=no). -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" media="screen">
  <link rel="stylesheet" href="{{ site.baseurl }}/css/style.css" media="screen">
  <title>{{ page.title }}</title>
</head>
<body data-spy="scroll" data-target="#table-of-contents" data-offset="40">
<!-- Fixed sidebar: a zero-height grid overlay whose first column holds the #table-of-contents nav -->
<div style="position: fixed; width: 100%;">
  <div class="container" style="height: 0;">
    <div class="row" style="height: 0;">
      <div class="col-md-3">
        <nav class="nav navbar navbar-light" id="table-of-contents"></nav>
      </div>
    </div>
  </div>
</div>

<!-- Main content: an empty first column mirrors the sidebar column; the page goes in the second -->
<div class="container">
  <div class="row">
    <div class="col-md-3" style="pointer-events: none;">
    </div>
    <div class="col-md-9 content">
      <div id="main-content">
        {{ content }}
      </div>
    </div>
  </div>
</div>
<!-- Generate a table of contents based on header elements -->
<script>
  var mainContent = document.getElementById('main-content');
  var tableOfContents = document.getElementById('table-of-contents');
  var headers = mainContent.querySelectorAll('h2, h3');

  // Sub-list of the most recent h2 entry; stays undefined until an h2 is seen.
  var lastSubnav;

  for (var i = 0, length = headers.length; i < length; i++) {
    var header = headers[i];

    // Headings without an id cannot be linked to, so they are left out.
    if (!header.id) continue;

    var li = document.createElement('li');
    li.className = 'navbar-item';

    var link = document.createElement('a');
    link.href = '#' + header.id;
    link.innerText = header.innerText;
    link.className = 'nav-link';
    li.appendChild(link);

    if (header.tagName === 'H2') {
      // Each h2 starts a new top-level entry with its own nested list.
      lastSubnav = document.createElement('ul');
      lastSubnav.className = 'nav navbar';
      li.appendChild(lastSubnav);
      tableOfContents.appendChild(li);
    } else if (lastSubnav) {
      lastSubnav.appendChild(li);
    } else {
      // An h3 that appears before any h2 has no parent entry; list it at the
      // top level instead of dereferencing an undefined sub-list.
      tableOfContents.appendChild(li);
    }
  }
</script>
<!-- Scripts live at the end of <body>; markup after </html> is invalid. -->
<script
  src="https://code.jquery.com/jquery-3.3.1.min.js"
  crossorigin="anonymous"></script>
<script
  src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.bundle.min.js"></script>
</body>
</html>

118
docs/assets/css/style.scss Normal file
View file

@ -0,0 +1,118 @@
---
---
// Base theme; the rules below layer the sidebar layout on top of it.
@import 'jekyll-theme-cayman';

// Shared layout constants.
$padding: 20px;
$sidebar-width: 300px;
$sidebar-transition: left 0.25s;
$container-width: 1024px;

body {
  overflow: scroll;
}

// Centered page wrapper.
#container {
  margin: 0 auto;
  max-width: $container-width;
  position: relative;
}

#main-content,
#sidebar {
  padding: $padding 0;
}

// Full-height panel pinned to the viewport.
#sidebar {
  position: fixed;
  top: 0;
  bottom: 0;
  z-index: 1;
  width: $sidebar-width;
  overflow-y: auto;
  background: white;
  border-right: 1px solid #ccc;
}

// Hidden by default; the narrow-viewport media query switches it on.
#sidebar-toggle-link {
  display: none;
  position: fixed;
  left: $sidebar-width;
  z-index: 100;
  padding: 5px 10px;
  font-size: 24px;
  color: #aaa;
  text-decoration: none !important;
  background-color: white;
  box-shadow: 1px 1px 5px #aaa;
  opacity: 0.75;
}

// Must follow the shared "#main-content, #sidebar" rule so this padding wins;
// the left padding leaves room for the fixed sidebar.
#main-content {
  position: relative;
  padding: $padding $padding $padding ($sidebar-width + $padding);
}

.nav-link.active {
  text-decoration: underline;
}

.table-of-contents-section {
  padding: 10px 20px;
  border-bottom: 1px solid #ccc;

  &.active {
    background-color: #edffcb;
  }
}

.logo {
  display: block;
}

#table-of-contents ul {
  margin: 0;
  padding: 0;
}

#table-of-contents li {
  display: block;
  padding: 5px 20px;
}
// At 900px and below the sidebar slides in from the left; the "sidebar-hidden"
// class on <body> moves it (and the toggle link) back to the left edge.
@media (max-width: 900px) {
  #sidebar,
  #sidebar-toggle-link,
  #main-content {
    transition: $sidebar-transition;
  }

  #sidebar {
    left: 0;

    body.sidebar-hidden & {
      left: -$sidebar-width;
    }
  }

  #sidebar-toggle-link {
    display: block;

    body.sidebar-hidden & {
      left: 0;
    }
  }

  #main-content {
    left: $sidebar-width;
    padding-left: $padding;

    body.sidebar-hidden & {
      left: 0;
    }
  }
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

View file

@ -1,13 +0,0 @@
/* Shared top offset for the content column and the table of contents. */
#main-content,
#table-of-contents {
  margin-top: 20px;
}

/* Bordered, rounded box around the table of contents. */
#table-of-contents {
  border: 1px solid #ddd;
  border-radius: 10px;
  padding: 10px;
}

/* Underline links carrying both the nav-link and active classes. */
.nav-link.active {
  text-decoration: underline;
}

View file

@ -1,10 +1,62 @@
Tree-sitter is a library for parsing source code. It aims to be:
---
title: Introduction
---
# Introduction
Tree-sitter is an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be:
* **General** enough to parse any programming language
* **Dependency-free** and written in pure C so that it can be embedded in any application
* **Fast** and incremental so that it can be used in a text editor
* **Robust** enough to provide useful results even in the presence of syntax errors
* **Fast** enough to parse on every keystroke in a text editor
* **Robust** enough to provide useful results even in the presence of syntax errors
* **Dependency-free** (and written in pure C) so that it can be embedded in any application
## Table of contents
### Language Bindings
1. [Creating parsers](creating-parsers.md)
There are currently bindings that allow Tree-sitter to be used from the following languages:
* [JavaScript](https://github.com/tree-sitter/node-tree-sitter)
* [Rust](https://github.com/tree-sitter/rust-tree-sitter)
* [Haskell](https://github.com/tree-sitter/haskell-tree-sitter)
* [Ruby](https://github.com/tree-sitter/ruby-tree-sitter)
### Available Parsers
Parsers for these languages are fairly complete:
* [Bash](https://github.com/tree-sitter/tree-sitter-bash)
* [C](https://github.com/tree-sitter/tree-sitter-c)
* [C++](https://github.com/tree-sitter/tree-sitter-cpp)
* [Go](https://github.com/tree-sitter/tree-sitter-go)
* [HTML](https://github.com/tree-sitter/tree-sitter-html)
* [JavaScript](https://github.com/tree-sitter/tree-sitter-javascript)
* [PHP](https://github.com/tree-sitter/tree-sitter-php)
* [Python](https://github.com/tree-sitter/tree-sitter-python)
* [Ruby](https://github.com/tree-sitter/tree-sitter-ruby)
* [Rust](https://github.com/tree-sitter/tree-sitter-rust)
* [TypeScript](https://github.com/tree-sitter/tree-sitter-typescript)
Parsers for these languages are in development:
* [Haskell](https://github.com/tree-sitter/tree-sitter-haskell)
* [Java](https://github.com/tree-sitter/tree-sitter-java)
* [OCaml](https://github.com/tree-sitter/tree-sitter-ocaml)
* [C-sharp](https://github.com/tree-sitter/tree-sitter-c-sharp)
* [Julia](https://github.com/tree-sitter/tree-sitter-julia)
* [Scala](https://github.com/tree-sitter/tree-sitter-scala)
### Talks on Tree-sitter
* [FOSDEM 2018](https://www.youtube.com/watch?v=0CGzC_iss-8)
* [GitHub Universe 2017](https://www.youtube.com/watch?v=a1rC79DHpmY)
### Underlying Research
The design of Tree-sitter was greatly influenced by the following research papers:
- [Practical Algorithms for Incremental Software Development Environments](https://www2.eecs.berkeley.edu/Pubs/TechRpts/1997/CSD-97-946.pdf)
- [Context Aware Scanning for Parsing Extensible Languages](http://www.umsec.umn.edu/publications/Context-Aware-Scanning-Parsing-Extensible)
- [Efficient and Flexible Incremental Parsing](http://ftp.cs.berkeley.edu/sggs/toplas-parsing.ps)
- [Incremental Analysis of Real Programming Languages](https://pdfs.semanticscholar.org/ca69/018c29cc415820ed207d7e1d391e2da1656f.pdf)
- [Error Detection and Recovery in LR Parsers](http://what-when-how.com/compiler-writing/bottom-up-parsing-compiler-writing-part-13)
- [Error Recovery for LR Parsers](http://www.dtic.mil/dtic/tr/fulltext/u2/a043470.pdf)

View file

@ -0,0 +1,24 @@
---
title: Architecture
permalink: architecture
---
# Architecture
Tree-sitter consists of two separate libraries, both of which expose C APIs.
The first library, `libcompiler`, is
used to generate a parser for a language by supplying a [context-free grammar](https://en.wikipedia.org/wiki/Context-free_grammar) describing the
language. `libcompiler` is a build tool; it is no longer needed once a parser has been generated. Its public interface is specified in the header file [`compiler.h`](https://github.com/tree-sitter/tree-sitter/blob/master/include/tree_sitter/compiler.h).
The second library, `libruntime`, is used in combination with the parsers
generated by `libcompiler`, to produce syntax trees from source code and keep the
syntax trees up-to-date as the source code changes. `libruntime` is designed to be embedded in applications. Its interface is specified in the header file [`runtime.h`](https://github.com/tree-sitter/tree-sitter/blob/master/include/tree_sitter/runtime.h).
## The Compiler
WIP
## The Runtime
WIP

View file

@ -1,5 +1,6 @@
---
layout: table-of-contents
title: Creating Parsers
permalink: creating-parsers
---
# Creating parsers
@ -57,59 +58,63 @@ It's usually a good idea to find a formal specification for the language you're
Although languages have very different constructs, their constructs can often be categorized into similar groups like *Declarations*, *Definitions*, *Statements*, *Expressions*, *Types*, and *Patterns*. In writing your grammar, a good first step is to create just enough structure to include all of these basic *groups* of symbols. For an imaginary C-like language, this might look something like this:
```js
rules: $ => {
source_file: $ => repeat($._definition),
{
// ...
_definition: $ => choice(
$.function_definition
// TODO: other kinds of definitions
),
rules: $ => {
source_file: $ => repeat($._definition),
function_definition: $ => seq(
'func',
$.identifier,
$.parameter_list,
$._type,
$.block
),
_definition: $ => choice(
$.function_definition
// TODO: other kinds of definitions
),
parameter_list: $ => seq(
'(',
// TODO: parameters
')'
),
function_definition: $ => seq(
'func',
$.identifier,
$.parameter_list,
$._type,
$.block
),
_type: $ => choice(
'bool'
// TODO: other kinds of types
),
parameter_list: $ => seq(
'(',
// TODO: parameters
')'
),
block: $ => seq(
'{',
repeat($._statement),
'}'
),
_type: $ => choice(
'bool'
// TODO: other kinds of types
),
_statement: $ => choice(
$.return_statement
// TODO: other kinds of statements
),
block: $ => seq(
'{',
repeat($._statement),
'}'
),
return_statement: $ => seq(
'return',
$._expression,
';'
),
_statement: $ => choice(
$.return_statement
// TODO: other kinds of statements
),
_expression: $ => choice(
$.identifier,
$.number
// TODO: other kinds of expressions
),
return_statement: $ => seq(
'return',
$._expression,
';'
),
identifier: $ => /[a-z]+/,
_expression: $ => choice(
$.identifier,
$.number
// TODO: other kinds of expressions
),
number: $ => /\d+/
identifier: $ => /[a-z]+/,
number: $ => /\d+/
}
}
```
@ -118,27 +123,31 @@ Some of the details of this grammar will be explained in more depth later on, bu
With this structure in place, you can now freely decide what part of the grammar to flesh out next. For example, you might decide to start with *types*. One-by-one, you could define the rules for writing basic types and composing them into more complex types:
```js
_type: $ => choice(
$.primitive_type,
$.array_type,
$.pointer_type
),
{
// ...
primitive_type: $ => choice(
'bool',
'int'
),
_type: $ => choice(
$.primitive_type,
$.array_type,
$.pointer_type
),
array_type: $ => seq(
'[',
']',
$._type
),
primitive_type: $ => choice(
'bool',
'int'
),
pointer_type: $ => seq(
'*',
$._type
),
array_type: $ => seq(
'[',
']',
$._type
),
pointer_type: $ => seq(
'*',
$._type
)
}
```
After developing the *type* sublanguage a bit further, you might decide to switch to working on *statements* or *expressions* instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`.
@ -250,24 +259,28 @@ The language spec encodes the 20 precedence levels of JavaScript expressions usi
To produce a readable syntax tree, we'd like to model JavaScript expressions using a much flatter structure like this:
```js
_expression: $ => choice(
$.identifier,
$.unary_expression,
$.binary_expression,
{
// ...
),
unary_expression: $ => choice(
seq('-', $._expression),
seq('!', $._expression),
// ...
),
_expression: $ => choice(
$.identifier,
$.unary_expression,
$.binary_expression,
// ...
),
binary_expression: $ => choice(
seq($._expression, '*', $._expression),
seq($._expression, '+', $._expression),
// ...
),
unary_expression: $ => choice(
seq('-', $._expression),
seq('!', $._expression),
// ...
),
binary_expression: $ => choice(
seq($._expression, '*', $._expression),
seq($._expression, '+', $._expression),
// ...
),
}
```
Of course, this flat structure is highly ambiguous. If we try to generate a parser, Tree-sitter gives us an error message:
@ -293,11 +306,15 @@ Possible resolutions:
For an expression like `-a * b`, it's not clear whether the `-` operator applies to the `a * b` or just to the `a`. This is where the `prec` function described above comes into play. By wrapping a rule with `prec`, we can indicate that certain sequences of symbols should *bind to each other more tightly* than others. For example, the `'-', $._expression` sequence in `unary_expression` should bind more tightly than the `$._expression, '+', $._expression` sequence in `binary_expression`:
```js
unary_expression: $ => prec(2, choice(
seq('-', $._expression),
seq('!', $._expression),
{
// ...
))
unary_expression: $ => prec(2, choice(
seq('-', $._expression),
seq('!', $._expression),
// ...
))
}
```
### Using associativity
@ -323,11 +340,15 @@ Possible resolutions:
For an expression like `a * b * c`, it's not clear whether we mean `a * (b * c)` or `(a * b) * c`. This is where `prec.left` and `prec.right` come into use. We want to select the second interpretation, so we use `prec.left`.
```js
binary_expression: $ => choice(
prec.left(2, seq($._expression, '*', $._expression)),
prec.left(1, seq($._expression, '+', $._expression)),
{
// ...
),
binary_expression: $ => choice(
prec.left(2, seq($._expression, '*', $._expression)),
prec.left(1, seq($._expression, '+', $._expression)),
// ...
),
}
```
### Hiding rules
@ -336,6 +357,8 @@ You may have noticed in the above examples that some of the grammar rule name li
## Dealing with LR conflicts
TODO
[cst]: https://en.wikipedia.org/wiki/Parse_tree
[non-terminal]: https://en.wikipedia.org/wiki/Terminal_and_nonterminal_symbols
[language-spec]: https://en.wikipedia.org/wiki/Programming_language_specification

View file

@ -0,0 +1,81 @@
---
title: Using Parsers
permalink: using-parsers
---
# Using Parsers
A Tree-sitter parser consists of a single C source file which exports one function with the naming scheme `tree_sitter_${LANGUAGE_NAME}`. This function returns a pointer to a `TSLanguage` struct, which can be used in conjunction with a `TSParser` to produce syntax trees.
## The Raw C API
Here's an example of a simple C program that uses the Tree-sitter [JSON parser](https://github.com/tree-sitter/tree-sitter-json).
```c
// Filename - test-json-parser.c

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>  // free()
#include <string.h>
#include "tree_sitter/runtime.h"

// Declare the `tree_sitter_json` function, which is
// implemented by the `tree-sitter-json` library.
TSLanguage *tree_sitter_json();

int main() {
  // Create a parser with the JSON language.
  TSParser *parser = ts_parser_new();
  ts_parser_set_language(parser, tree_sitter_json());

  // Parse some source code.
  const char *source_code = "[1, null]";
  TSTree *tree = ts_parser_parse_string(parser, NULL, source_code, strlen(source_code));

  // Find some syntax tree nodes.
  TSNode root_node = ts_tree_root_node(tree);
  TSNode array_node = ts_node_named_child(root_node, 0);
  TSNode number_node = ts_node_named_child(array_node, 0);

  // Check that the nodes have the expected types.
  assert(!strcmp(ts_node_type(root_node), "value"));
  assert(!strcmp(ts_node_type(array_node), "array"));
  assert(!strcmp(ts_node_type(number_node), "number"));

  // Check that the nodes have the expected child counts.
  // The array has five children in total -- `[`, `1`, `,`, `null`, `]` --
  // of which only the two values are named nodes.
  assert(ts_node_child_count(root_node) == 1);
  assert(ts_node_child_count(array_node) == 5);
  assert(ts_node_named_child_count(array_node) == 2);
  assert(ts_node_child_count(number_node) == 0);

  // Print the syntax tree as an S-expression.
  char *string = ts_node_string(root_node);
  printf("Syntax tree: %s\n", string);

  // Free all of the heap allocations.
  free(string);
  ts_tree_delete(tree);
  ts_parser_delete(parser);
  return 0;
}
```
This program uses the Tree-sitter C API, which is declared in the header file `tree_sitter/runtime.h`, so we need to add the `tree-sitter/include` directory to the include path. We also need to link `libruntime.a` into the binary.
```sh
clang \
-I tree-sitter/include \
test-json-parser.c \
tree-sitter-json/src/parser.c \
tree-sitter/out/Release/libruntime.a \
-o test-json-parser
./test-json-parser
```
### Providing the text to parse
Text input is provided to a tree-sitter parser via a `TSInput` struct, which contains function pointers for seeking to positions in the text, and for reading chunks of text. The text can be encoded in either UTF-8 or UTF-16. This interface allows you to efficiently parse text that is stored in your own data structure.
### Querying the syntax tree
Tree-sitter provides a DOM-style interface for inspecting syntax trees. Functions like `ts_node_child(node, index)` and `ts_node_next_sibling(node)` expose every node in the concrete syntax tree. This is useful for operations like syntax-highlighting, which operate on a token-by-token basis. You can also traverse the tree in a more abstract way by using functions like
`ts_node_named_child(node, index)` and `ts_node_next_named_sibling(node)`. These functions don't expose nodes that were specified in the grammar as anonymous tokens, like `:` and `{`. This is useful when analyzing the meaning of a document.