Merge branch 'master' of https://github.com/tree-sitter/tree-sitter into feat/snapshot-testing

This commit is contained in:
Ika 2019-09-06 10:44:32 +08:00
commit 9770a0c9f6
17 changed files with 643 additions and 373 deletions

View file

@ -293,11 +293,11 @@ pub fn ansi(
{
let event = event.map_err(|e| e.to_string())?;
match event {
HighlightEvent::Source(s) => {
HighlightEvent::Source { start, end } => {
if let Some(style) = highlight_stack.last().and_then(|s| theme.ansi_style(*s)) {
write!(&mut stdout, "{}", style.paint(s))?;
style.paint(&source[start..end]).write_to(&mut stdout)?;
} else {
write!(&mut stdout, "{}", s)?;
stdout.write_all(&source[start..end])?;
}
}
HighlightEvent::HighlightStart(h) => {

View file

@ -17,7 +17,7 @@ lazy_static! {
.multi_line(true)
.build()
.unwrap();
static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"\r?\n---+\r?\n")
static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n")
.multi_line(true)
.build()
.unwrap();
@ -183,11 +183,7 @@ pub fn parse_tests(path: &Path) -> io::Result<TestEntry> {
let mut children = Vec::new();
for entry in fs::read_dir(path)? {
let entry = entry?;
let hidden = entry
.file_name()
.to_str()
.unwrap_or("")
.starts_with(".");
let hidden = entry.file_name().to_str().unwrap_or("").starts_with(".");
if !hidden {
children.push(parse_tests(&entry.path())?);
}
@ -206,29 +202,42 @@ pub fn strip_sexp_fields(sexp: String) -> String {
fn parse_test_content(name: String, content: String) -> TestEntry {
let mut children = Vec::new();
let bytes = content.as_bytes();
let mut previous_name = String::new();
let mut previous_header_end = 0;
for header_match in HEADER_REGEX
let mut prev_name = String::new();
let mut prev_header_end = 0;
// Identify all of the test descriptions using the `======` headers.
for (header_start, header_end) in HEADER_REGEX
.find_iter(&bytes)
.map(|m| (m.start(), m.end()))
.chain(Some((bytes.len(), bytes.len())))
{
let (header_start, header_end) = header_match;
if previous_header_end > 0 {
if let Some(divider_match) =
DIVIDER_REGEX.find(&bytes[previous_header_end..header_start])
{
let (divider_start, divider_end) = (
previous_header_end + divider_match.start(),
previous_header_end + divider_match.end(),
);
// Find the longest line of dashes following each test description.
// That is the divider between input and expected output.
if prev_header_end > 0 {
let divider_match = DIVIDER_REGEX
.find_iter(&bytes[prev_header_end..header_start])
.map(|m| (prev_header_end + m.start(), prev_header_end + m.end()))
.max_by_key(|(start, end)| end - start);
if let Some((divider_start, divider_end)) = divider_match {
if let Ok(output) = str::from_utf8(&bytes[divider_end..header_start]) {
let input = bytes[previous_header_end..divider_start].to_vec();
let mut input = bytes[prev_header_end..divider_start].to_vec();
// Remove trailing newline from the input.
input.pop();
if input.last() == Some(&b'\r') {
input.pop();
}
// Normalize the whitespace in the expected output.
let output = WHITESPACE_REGEX.replace_all(output.trim(), " ").to_string();
let output = output.replace(" )", ")");
// Identify if the expected output has fields indicated. If not, then
// fields will not be checked.
let has_fields = SEXP_FIELD_REGEX.is_match(&output);
children.push(TestEntry::Example {
name: previous_name,
name: prev_name,
input,
output,
has_fields,
@ -236,10 +245,10 @@ fn parse_test_content(name: String, content: String) -> TestEntry {
}
}
}
previous_name = String::from_utf8_lossy(&bytes[header_start..header_end])
prev_name = String::from_utf8_lossy(&bytes[header_start..header_end])
.trim_matches(|c| char::is_whitespace(c) || c == '=')
.to_string();
previous_header_end = header_end;
prev_header_end = header_end;
}
TestEntry::Group { name, children }
}
@ -296,4 +305,56 @@ d
}
);
}
// Verifies that runs of dashes appearing *inside* a test's input source code
// are not mistaken for the input/output divider. The parser selects the
// longest dash line following each `====` header as the divider, so shorter
// dash lines remain part of the input.
#[test]
fn test_parse_test_content_with_dashes_in_source_code() {
let entry = parse_test_content(
"the-filename".to_string(),
// Two entries; each input deliberately contains `---` lines of varying
// lengths. Only the longest run in each section is the real divider.
r#"
==================
Code with dashes
==================
abc
---
defg
----
hijkl
-------
(a (b))
=========================
Code ending with dashes
=========================
abc
-----------
-------------------
(c (d))
"#
.trim()
.to_string(),
);
assert_eq!(
entry,
TestEntry::Group {
name: "the-filename".to_string(),
children: vec![
TestEntry::Example {
name: "Code with dashes".to_string(),
// `---` and `----` stay in the input; `-------` (longest) divides.
input: "abc\n---\ndefg\n----\nhijkl".as_bytes().to_vec(),
output: "(a (b))".to_string(),
has_fields: false,
},
TestEntry::Example {
name: "Code ending with dashes".to_string(),
// The trailing `-------------------` (longest) is the divider, so the
// shorter `-----------` line is kept as the last line of the input.
input: "abc\n-----------".as_bytes().to_vec(),
output: "(c (d))".to_string(),
has_fields: false,
},
]
}
);
}
}

View file

@ -420,10 +420,10 @@ fn test_highlighting_via_c_api() {
assert_eq!(
lines,
vec![
"&lt;<span class=tag>script</span>&gt;",
"<span class=keyword>const</span> <span>a</span> <span>=</span> <span class=function>b</span><span>(</span><span class=string>&#39;c&#39;</span><span>)</span><span>;</span>",
"<span>c</span><span>.</span><span class=function>d</span><span>(</span><span>)</span><span>;</span>",
"&lt;/<span class=tag>script</span>&gt;",
"&lt;<span class=tag>script</span>&gt;\n",
"<span class=keyword>const</span> <span>a</span> <span>=</span> <span class=function>b</span><span>(</span><span class=string>&#39;c&#39;</span><span>)</span><span>;</span>\n",
"<span>c</span><span>.</span><span class=function>d</span><span>(</span><span>)</span><span>;</span>\n",
"&lt;/<span class=tag>script</span>&gt;\n",
]
);
@ -431,6 +431,23 @@ fn test_highlighting_via_c_api() {
c::ts_highlight_buffer_delete(buffer);
}
// Exercises the `LossyUtf8` iterator: it yields valid UTF-8 chunks of the
// input as `&str`, and, as these cases show, one U+FFFD replacement string
// per invalid byte, whether the invalid bytes appear in the middle, at the
// start, or at the end of the input.
#[test]
fn test_decode_utf8_lossy() {
use tree_sitter_highlight::util::LossyUtf8;
// Fully valid input: a single chunk.
let parts = LossyUtf8::new(b"hi").collect::<Vec<_>>();
assert_eq!(parts, vec!["hi"]);
// Invalid bytes (0xC0, 0xC1 are never valid UTF-8) in the middle.
let parts = LossyUtf8::new(b"hi\xc0\xc1bye").collect::<Vec<_>>();
assert_eq!(parts, vec!["hi", "\u{fffd}", "\u{fffd}", "bye"]);
// Invalid bytes at the start.
let parts = LossyUtf8::new(b"\xc0\xc1bye").collect::<Vec<_>>();
assert_eq!(parts, vec!["\u{fffd}", "\u{fffd}", "bye"]);
// Invalid bytes at the end.
let parts = LossyUtf8::new(b"hello\xc0\xc1").collect::<Vec<_>>();
assert_eq!(parts, vec!["hello", "\u{fffd}", "\u{fffd}"]);
}
fn c_string(s: &str) -> CString {
CString::new(s.as_bytes().to_vec()).unwrap()
}
@ -466,11 +483,12 @@ fn to_token_vector<'a>(
language: Language,
property_sheet: &'a PropertySheet<Properties>,
) -> Result<Vec<Vec<(&'a str, Vec<Highlight>)>>, Error> {
let src = src.as_bytes();
let mut lines = Vec::new();
let mut highlights = Vec::new();
let mut line = Vec::new();
for event in highlight(
src.as_bytes(),
src,
language,
property_sheet,
None,
@ -481,7 +499,8 @@ fn to_token_vector<'a>(
HighlightEvent::HighlightEnd => {
highlights.pop();
}
HighlightEvent::Source(s) => {
HighlightEvent::Source { start, end } => {
let s = str::from_utf8(&src[start..end]).unwrap();
for (i, l) in s.split("\n").enumerate() {
let l = l.trim_end_matches('\r');
if i > 0 {

View file

@ -190,7 +190,7 @@ fn test_tree_edit() {
}
#[test]
fn test_tree_walk() {
fn test_tree_cursor() {
let mut parser = Parser::new();
parser.set_language(get_language("rust")).unwrap();
@ -225,6 +225,43 @@ fn test_tree_walk() {
assert_eq!(cursor.node().is_named(), true);
}
// Checks `TreeCursor::field_name()` while walking a function declaration:
// only the `identifier` and `formal_parameters` children carry field names
// ("name" and "parameters"); interleaved comment nodes and the `function`
// keyword node report no field name.
#[test]
fn test_tree_cursor_fields() {
let mut parser = Parser::new();
parser.set_language(get_language("javascript")).unwrap();
// Comments are placed between the keyword, name, and parameters so the
// cursor must skip over unfielded extra nodes.
let tree = parser
.parse("function /*1*/ bar /*2*/ () {}", None)
.unwrap();
let mut cursor = tree.walk();
assert_eq!(cursor.node().kind(), "program");
cursor.goto_first_child();
assert_eq!(cursor.node().kind(), "function_declaration");
assert_eq!(cursor.field_name(), None);
cursor.goto_first_child();
assert_eq!(cursor.node().kind(), "function");
assert_eq!(cursor.field_name(), None);
cursor.goto_next_sibling();
assert_eq!(cursor.node().kind(), "comment");
assert_eq!(cursor.field_name(), None);
cursor.goto_next_sibling();
assert_eq!(cursor.node().kind(), "identifier");
assert_eq!(cursor.field_name(), Some("name"));
cursor.goto_next_sibling();
assert_eq!(cursor.node().kind(), "comment");
assert_eq!(cursor.field_name(), None);
cursor.goto_next_sibling();
assert_eq!(cursor.node().kind(), "formal_parameters");
assert_eq!(cursor.field_name(), Some("parameters"));
}
#[test]
fn test_tree_node_equality() {
let mut parser = Parser::new();

View file

@ -9,6 +9,8 @@ All of Tree-sitter's parsing functionality is exposed through C APIs. Applicatio
This document describes the general concepts of how to use Tree-sitter, which should be relevant regardless of what language you're using. It also goes into some C-specific details that are useful if you're using the C API directly or are building a new binding to a different language.
All of the API functions shown here are declared and documented in the `tree_sitter/api.h` header file.
## Building the Library
Building the library requires one git submodule: [`utf8proc`](https://github.com/JuliaStrings/utf8proc). Make sure that `utf8proc` is downloaded by running this command from the Tree-sitter directory:
@ -36,7 +38,7 @@ Alternatively, you can use the library in a larger project by adding one source
## The Objects
There are four main types of objects involved when using Tree-sitter: languages, parsers, syntax trees, and syntax nodes. In C, these are called `TSLanguage`, `TSParser`, `TSTree`, and `TSNode`.
* An `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. Many languages are already available in separate git repositories within the the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next section](./creating-parsers) for how to create new languages.
* A `TSLanguage` is an opaque object that defines how to parse a particular programming language. The code for each `TSLanguage` is generated by Tree-sitter. Many languages are already available in separate git repositories within the [Tree-sitter GitHub organization](https://github.com/tree-sitter). See [the next page](./creating-parsers) for how to create new languages.
* A `TSParser` is a stateful object that can be assigned a `TSLanguage` and used to produce a `TSTree` based on some source code.
* A `TSTree` represents the syntax tree of an entire source code file. It contains `TSNode` instances that indicate the structure of the source code. It can also be edited and used to produce a new `TSTree` in the event that the source code changes.
* A `TSNode` represents a single node in the syntax tree. It tracks its start and end positions in the source code, as well as its relation to other nodes like its parent, siblings and children.
@ -241,6 +243,32 @@ TSNode ts_node_prev_named_sibling(TSNode);
If you use this group of methods, the syntax tree functions much like an abstract syntax tree.
## Node Field Names
To make syntax nodes easier to analyze, many grammars assign unique *field names* to particular child nodes. The next page [explains](./creating-parsers#using-fields) how to do this on your own grammars. If a syntax node has fields, you can access its children using their field name:
```c
TSNode ts_node_child_by_field_name(
TSNode self,
const char *field_name,
uint32_t field_name_length
);
```
Fields also have numeric ids that you can use, if you want to avoid repeated string comparisons. You can convert between strings and ids using the `TSLanguage`:
```c
uint32_t ts_language_field_count(const TSLanguage *);
const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId);
TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t);
```
The field ids can be used in place of the name:
```c
TSNode ts_node_child_by_field_id(TSNode, TSFieldId);
```
## Editing
In applications like text editors, you often need to re-parse a file after its source code has changed. Tree-sitter is designed to support this use case efficiently. There are two steps required. First, you must *edit* the syntax tree, which adjusts the ranges of its nodes so that they stay in sync with the code.

View file

@ -5,26 +5,16 @@ permalink: creating-parsers
# Creating parsers
Developing Tree-sitter parsers can have a difficult learning curve, but once you get the hang of it, it can be fun and even zen-like. This document should help you to build an effective mental model for parser development.
Developing Tree-sitter grammars can have a difficult learning curve, but once you get the hang of it, it can be fun and even zen-like. This document will help you get started and develop a useful mental model.
## Getting Started
### Understanding the problem
Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties:
1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers.
2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications.
It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth.
### Dependencies
In order to develop a Tree-sitter parser, there are two dependencies that you need to install:
* **Node.js** - Tree-sitter grammars are written in JavaScript, and Tree-sitter uses [Node.js][node.js] to interpret JavaScript files. It requires the `node` command to be in one of the directories in your [`PATH`][path-env]. It shouldn't matter what version of Node you have.
* **C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform.
* **A C Compiler** - Tree-sitter creates parsers that are written in C. In order to run and test these parsers with the `tree-sitter parse` or `tree-sitter test` commands, you must have a C/C++ compiler installed. Tree-sitter will try to look for these compilers in the standard places for each platform.
### Installation
@ -34,7 +24,7 @@ To create a Tree-sitter parser, you need to use the [the `tree-sitter` CLI][tree
* Download a binary for your platform from [the latest GitHub release][releases], and put it into a directory on your `PATH`.
* Build the `tree-sitter-cli` [Rust crate][crate] from source using [`cargo`][cargo], the Rust package manager.
### Setting up a Project
### Project Setup
The preferred convention is to name the parser repository "tree-sitter-" followed by the name of the language.
@ -43,7 +33,7 @@ mkdir tree-sitter-${YOUR_LANGUAGE_NAME}
cd tree-sitter-${YOUR_LANGUAGE_NAME}
```
You should create a `package.json` file that describes your project, and allows your parser to be used from Node.js.
You can use the `npm` command line tool to create a `package.json` file that describes your project, and allows your parser to be used from Node.js.
```sh
# This will prompt you for input
@ -56,16 +46,21 @@ npm install --save nan
npm install --save-dev tree-sitter-cli
```
The last command will install the CLI into the `node_modules` folder in your project. An executable program called `tree-sitter` will be created at the path `./node_modules/.bin/tree-sitter`. You may want to follow the Node.js convention of adding `./node_modules/.bin` to your `PATH` so that you can easily run this program when working in this directory.
The last command will install the CLI into the `node_modules` folder in your working directory. An executable program called `tree-sitter` will be created inside of `node_modules/.bin/`. You may want to follow the Node.js convention of adding that folder to your `PATH` so that you can easily run this program when working in this directory.
Once you have the CLI installed, create a file called `grammar.js` with the following skeleton:
```sh
# In your shell profile script
export PATH=$PATH:./node_modules/.bin
```
Once you have the CLI installed, create a file called `grammar.js` with the following contents:
```js
module.exports = grammar({
name: 'the_language_name',
name: 'YOUR_LANGUAGE_NAME',
rules: {
// The production rules of the context-free grammar
// TODO: add the actual grammar rules
source_file: $ => 'hello'
}
});
@ -77,10 +72,13 @@ Then run the the following command:
tree-sitter generate
```
This will generate the C code required to parse this trivial language, as well as all of the files needed to compile and load this native parser as a Node.js module. You can test this parser by creating a source file with the contents `hello` and parsing it:
This will generate the C code required to parse this trivial language, as well as a few files that are needed to compile and load this native parser as a Node.js module.
You can test this parser by creating a source file with the contents "hello" and parsing it:
```sh
tree-sitter parse ./the-file
echo 'hello' > example-file
tree-sitter parse example-file
```
This should print the following:
@ -89,13 +87,148 @@ This should print the following:
(source_file [1, 0] - [1, 5])
```
You might notice that the first time you run `tree-sitter parse`, it takes a few seconds. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. Whenever you make changes to your grammar, you can update the parser simply by re-running `tree-sitter generate`. When the parser changes, Tree-sitter will recompile it as needed.
You now have a working parser.
## Tool Overview
Let's go over all of the functionality of the `tree-sitter` command line tool.
### Command: `generate`
The most important command you'll use is `tree-sitter generate`. This command reads the `grammar.js` file in your current working directory and creates a file called `src/parser.c`, which implements the parser. After making changes to your grammar, just run `tree-sitter generate` again.
The first time you run `tree-sitter generate`, it will also generate a few other files:
* `binding.gyp` - This file tells Node.js how to compile your language.
* `index.js` - This is the file that Node.js initially loads when using your language.
* `src/binding.cc` - This file wraps your language in a JavaScript object when used in Node.js
* `src/tree_sitter/parser.h` - This file provides some basic C definitions that are used in your generated `parser.c` file.
If there is an ambiguity or *local ambiguity* in your grammar, Tree-sitter will detect it during parser generation, and it will exit with a `Unresolved conflict` error message. See below for more information on these errors.
### Command: `test`
The `tree-sitter test` command allows you to easily test that your parser is working correctly.
For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look when parsing that rule. These tests are written using specially-formatted text files in a `corpus` directory in your parser's root folder.
For example, you might have a file called `corpus/statements.txt` that contains a series of entries like this:
```
==================
Return statements
==================
func x() int {
return 1;
}
---
(source_file
(function_definition
(identifier)
(parameter_list)
(primitive_type)
(block
(return_statement (number)))))
```
* The **name** of each test is written between two lines containing only `=` (equal sign) characters.
* Then the **input source code** is written, followed by a line containing three or more `-` (dash) characters.
* Then, the **expected output syntax tree** is written as an [S-expression][s-exp]. The exact placement of whitespace in the S-expression doesn't matter, but ideally the syntax tree should be legible. Note that the S-expression does not show syntax nodes like `func`, `(` and `;`, which are expressed as strings and regexes in the grammar. It only shows the *named* nodes, as described in [this section][named-vs-anonymous-nodes-section] of the page on parser usage.
The expected output section can also *optionally* show the [*field names*][field-names-section] associated with each child node. To include field names in your tests, you write a node's field name followed by a colon, before the node itself in the S-expression:
```
(source_file
(function_definition
name: (identifier)
parameters: (parameter_list)
result: (primitive_type)
body: (block
(return_statement (number)))))
```
These tests are important. They serve as the parser's API documentation, and they can be run every time you change the grammar to verify that everything still parses correctly.
By default, the `tree-sitter test` command runs all of the tests in your `corpus` folder. To run a particular test, you can use the `-f` flag:
```sh
tree-sitter test -f 'Return statements'
```
The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `corpus` directory. It's typically a good idea to test all of the permutations of each language construct. This increases test coverage, but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language.
#### Automatic Compilation
You might notice that the first time you run `tree-sitter test` after regenerating your parser, it takes some extra time. This is because Tree-sitter automatically compiles your C code into a dynamically-loadable library. It recompiles your parser as-needed whenever you update it by re-running `tree-sitter generate`.
### Command: `parse`
You can run your parser on an arbitrary file using `tree-sitter parse`. This will print the resulting syntax tree, including nodes' ranges and field names, like this:
```
(source_file [0, 0] - [3, 0]
(function_declaration [0, 0] - [2, 1]
name: (identifier [0, 5] - [0, 9])
parameters: (parameter_list [0, 9] - [0, 11])
result: (type_identifier [0, 12] - [0, 15])
body: (block [0, 16] - [2, 1]
(return_statement [1, 2] - [1, 10]
(expression_list [1, 9] - [1, 10]
(int_literal [1, 9] - [1, 10]))))))
```
You can pass as many files to `tree-sitter parse` as your OS will allow. The command will exit with a non-zero status code if any parse errors occurred. You can also prevent the syntax trees from being printed using the `--quiet` flag. This makes `tree-sitter parse` usable as a secondary testing strategy: you can check that a large number of files parse without error:
```sh
find ./examples -name '*.go' | xargs -n 1000 tree-sitter parse --quiet
```
### The Grammar DSL
The following is a complete list of built-in functions you can use in your `grammar.js` to define rules. Use-cases for some of these functions will be explained in more detail in later sections.
* **Symbols (the `$` object)** - Every grammar rule is written as a JavaScript function that takes a parameter conventionally called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule.
* **String and Regex literals** - The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing regular expressions in your grammar.
* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf].
* **Alternatives : `choice(rule1, rule2, ...)`** - This function creates a rule that matches *one* of a set of possible rules. The order of the arguments does not matter. This is analogous to the `|` (pipe) operator in EBNF notation.
* **Repetitions : `repeat(rule)`** - This function creates a rule that matches *zero-or-more* occurrences of a given rule. It is analogous to the `{x}` (curly brace) syntax in EBNF notation.
* **Repetitions : `repeat1(rule)`** - This function creates a rule that matches *one-or-more* occurrences of a given rule. The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used.
* **Options : `optional(rule)`** - This function creates a rule that matches *zero or one* occurrence of a given rule. It is analogous to the `[x]` (square bracket) syntax in EBNF notation.
* **Precedence : `prec(number, rule)`** - This function marks the given rule with a numerical precedence which will be used to resolve [*LR(1) Conflicts*][lr-conflict] at parser-generation time. When two rules overlap in a way that represents either a true ambiguity or a *local* ambiguity given one token of lookahead, Tree-sitter will try to resolve the conflict by matching the rule with the higher precedence. The default precedence of all rules is zero. This works similarly to the [precedence directives][yacc-prec] in Yacc grammars.
* **Left Associativity : `prec.left([number], rule)`** - This function marks the given rule as left-associative (and optionally applies a numerical precedence). When an LR(1) conflict arises in which all of the rules have the same numerical precedence, Tree-sitter will consult the rules' associativity. If there is a left-associative rule, Tree-sitter will prefer matching a rule that ends *earlier*. This works similarly to [associativity directives][yacc-prec] in Yacc grammars.
* **Right Associativity : `prec.right([number], rule)`** - This function is like `prec.left`, but it instructs Tree-sitter to prefer matching a rule that ends *later*.
* **Dynamic Precedence : `prec.dynamic(number, rule)`** - This function is similar to `prec`, but the given numerical precedence is applied at *runtime* instead of at parser generation time. This is only necessary when handling a conflict dynamically using the `conflicts` field in the grammar, and when there is a genuine *ambiguity*: multiple rules correctly match a given piece of code. In that event, Tree-sitter compares the total dynamic precedence associated with each rule, and selects the one with the highest total. This is similar to [dynamic precedence directives][bison-dprec] in Bison grammars.
* **Tokens : `token(rule)`** - This function marks the given rule as producing only a single token. Tree-sitter's default is to treat each String or RegExp literal in the grammar as a separate token. Each token is matched separately by the lexer and returned as its own leaf node in the tree. The `token` function allows you to express a complex rule using the functions described above (rather than as a single regular expression) but still have Tree-sitter treat it as a single token.
* **Aliases : `alias(rule, name)`** - This function causes the given rule to *appear* with an alternative name in the syntax tree. If `name` is a *symbol*, as in `alias($.foo, $.bar)`, then the aliased rule will *appear* as a [named node][named-vs-anonymous-nodes-section] called `bar`. And if `name` is a *string literal*, as in `alias($.foo, 'bar')`, then the aliased rule will appear as an [anonymous node][named-vs-anonymous-nodes-section], as if the rule had been written as the simple string.
* **Field Names : `field(name, rule)`** - This function assigns a *field name* to the child node(s) matched by the given rule. In the resulting syntax tree, you can then use that field name to access specific children.
In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior of the parser.
* **`extras`** - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar.
* **`inline`** - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime.
* **`conflicts`** - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*.
* **`externals`** - an array of token names which can be returned by an [*external scanner*](#external-scanners). External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions.
* **`word`** - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization.
## Writing the Grammar
Writing a grammar requires creativity. There are an infinite number of CFGs (context-free grammars) that can be used to describe any given language. In order to produce a good Tree-sitter parser, you need to create a grammar with two important properties:
1. **An intuitive structure** - Tree-sitter's output is a [concrete syntax tree][cst]; each node in the tree corresponds directly to a [terminal or non-terminal symbol][non-terminal] in the grammar. So in order to produce an easy-to-analyze tree, there should be a direct correspondence between the symbols in your grammar and the recognizable constructs in the language. This might seem obvious, but it is very different from the way that context-free grammars are often written in contexts like [language specifications][language-spec] or [Yacc][yacc]/[Bison][bison] parsers.
2. **A close adherence to LR(1)** - Tree-sitter is based on the [GLR parsing][glr-parsing] algorithm. This means that while it can handle any context-free grammar, it works most efficiently with a class of context-free grammars called [LR(1) Grammars][lr-grammars]. In this respect, Tree-sitter's grammars are similar to (but less restrictive than) [Yacc][yacc] and [Bison][bison] grammars, but *different* from [ANTLR grammars][antlr], [Parsing Expression Grammars][peg], or the [ambiguous grammars][ambiguous-grammar] commonly used in language specifications.
It's unlikely that you'll be able to satisfy these two properties just by translating an existing context-free grammar directly into Tree-sitter's grammar format. There are a few kinds of adjustments that are often required. The following sections will explain these adjustments in more depth.
### The First Few Rules
It's usually a good idea to find a formal specification for the language you're trying to parse. This specification will most likely contain a context-free grammar. As you read through the rules of this CFG, you will probably discover a complex and cyclic graph of relationships. It might be unclear how you should navigate this graph as you define your grammar.
Although languages have very different constructs, their constructs can often be categorized in to similar groups like *Declarations*, *Definitions*, *Statements*, *Expressions*, *Types*, and *Patterns*. In writing your grammar, a good first step is to create just enough structure to include all of these basic *groups* of symbols. For an imaginary C-like language, this might look something like this:
Although languages have very different constructs, their constructs can often be categorized into similar groups like *Declarations*, *Definitions*, *Statements*, *Expressions*, *Types*, and *Patterns*. In writing your grammar, a good first step is to create just enough structure to include all of these basic *groups* of symbols. For a language like Go, you might start with something like this:
```js
{
@ -192,76 +325,11 @@ With this structure in place, you can now freely decide what part of the grammar
After developing the *type* sublanguage a bit further, you might decide to switch to working on *statements* or *expressions* instead. It's often useful to check your progress by trying to parse some real code using `tree-sitter parse`.
### Writing unit tests
**And remember to add tests for each rule in your `corpus` folder!**
For each rule that you add to the grammar, you should first create a *test* that describes how the syntax trees should look when parsing that rule. These tests are written using specially-formatted text files in a `corpus` directory in your parser's root folder. Here is an example of how these tests should look:
### Structuring Rules Well
```
==================
Return statements
==================
func x() int {
return 1;
}
---
(source_file
(function_definition
(identifier)
(parameter_list)
(primitive_type)
(block
(return_statement (number)))))
```
The name of the test is written between two lines containing only `=` characters. Then the source code is written, followed by a line containing three or more `-` characters. Then, the expected syntax tree is written as an [S-expression][s-exp]. The exact placement of whitespace in the S-expression doesn't matter, but ideally the syntax tree should be legible. Note that the S-expression does not show syntax nodes like `func`, `(` and `;`, which are expressed as strings and regexes in the grammar. It only shows the *named* nodes, as described in [the previous page][named-vs-anonymous-nodes-section].
These tests are important. They serve as the parser's API documentation, and they can be run every time you change the grammar to verify that everything still parses correctly. You can run these tests using this command:
```sh
tree-sitter test
```
To run a particular test, you can use the `-f` flag:
```sh
tree-sitter test -f 'Return statements'
```
The recommendation is to be comprehensive in adding tests. If it's a visible node, add it to a test file in your `corpus` directory. It's typically a good idea to test all of the permutations of each language construct. This increases test coverage, but doubly acquaints readers with a way to examine expected outputs and understand the "edges" of a language.
### The Grammar DSL
The following is a complete list of built-in functions you can use to define Tree-sitter grammars. Use-cases for some of these functions will be explained in more detail in later sections.
* **Symbols (the `$` object)** - Every grammar rule is written as a JavaScript function that takes a parameter conventionally called `$`. The syntax `$.identifier` is how you refer to another grammar symbol within a rule.
* **String and Regex literals** - The terminal symbols in a grammar are described using JavaScript strings and regular expressions. Of course during parsing, Tree-sitter does not actually use JavaScript's regex engine to evaluate these regexes; it generates its own regex-matching logic as part of each parser. Regex literals are just used as a convenient way of writing regular expressions in your grammar.
* **Sequences : `seq(rule1, rule2, ...)`** - This function creates a rule that matches any number of other rules, one after another. It is analogous to simply writing multiple symbols next to each other in [EBNF notation][ebnf].
* **Alternatives : `choice(rule1, rule2, ...)`** - This function creates a rule that matches *one* of a set of possible rules. The order of the arguments does not matter. This is analogous to the `|` (pipe) operator in EBNF notation.
* **Repetitions : `repeat(rule)`** - This function creates a rule that matches *zero-or-more* occurrences of a given rule. It is analogous to the `{x}` (curly brace) syntax in EBNF notation.
* **Repetitions : `repeat1(rule)`** - This function creates a rule that matches *one-or-more* occurrences of a given rule. The previous `repeat` rule is implemented in terms of `repeat1` but is included because it is very commonly used.
* **Options : `optional(rule)`** - This function creates a rule that matches *zero or one* occurrence of a given rule. It is analogous to the `[x]` (square bracket) syntax in EBNF notation.
* **Precedence : `prec(number, rule)`** - This function marks the given rule with a numerical precedence which will be used to resolve [*LR(1) Conflicts*][lr-conflict] at parser-generation time. When two rules overlap in a way that represents either a true ambiguity or a *local* ambiguity given one token of lookahead, Tree-sitter will try to resolve the conflict by matching the rule with the higher precedence. The default precedence of all rules is zero. This works similarly to the [precedence directives][yacc-prec] in Yacc grammars.
* **Left Associativity : `prec.left([number], rule)`** - This function marks the given rule as left-associative (and optionally applies a numerical precedence). When an LR(1) conflict arises in which all of the rules have the same numerical precedence, Tree-sitter will consult the rules' associativity. If there is a left-associative rule, Tree-sitter will prefer matching a rule that ends *earlier*. This works similarly to [associativity directives][yacc-prec] in Yacc grammars.
* **Right Associativity : `prec.right([number], rule)`** - This function is like `prec.left`, but it instructs Tree-sitter to prefer matching a rule that ends *later*.
* **Dynamic Precedence : `prec.dynamic(number, rule)`** - This function is similar to `prec`, but the given numerical precedence is applied at *runtime* instead of at parser generation time. This is only necessary when handling a conflict dynamically using the `conflicts` field in the grammar, and when there is a genuine *ambiguity*: multiple rules correctly match a given piece of code. In that event, Tree-sitter compares the total dynamic precedence associated with each rule, and selects the one with the highest total. This is similar to [dynamic precedence directives][bison-dprec] in Bison grammars.
* **Tokens : `token(rule)`** - This function marks the given rule as producing only a single token. Tree-sitter's default is to treat each String or RegExp literal in the grammar as a separate token. Each token is matched separately by the lexer and returned as its own leaf node in the tree. The `token` function allows you to express a complex rule using the functions described above (rather than as a single regular expression) but still have Tree-sitter treat it as a single token.
* **Aliases : `alias(rule, name)`** - Depending on the *type* of the `name` argument this does one of two closely related things: Given `alias($.foo, $.bar)` it uses the rule `foo` but makes it *appear* in the syntax tree as a [named node][named-vs-anonymous-nodes-section] with the name `bar` instead. However when the second argument given is a string - for instance `alias($.foo, 'bar')` - it uses the rule `foo` but turns it into an [anonymous node][named-vs-anonymous-nodes-section] with the name `bar` instead. It is useful in cases where a language construct needs to be parsed differently in different contexts (and thus needs to be defined using multiple symbols), but should always *appear* as the same type of node.
In addition to the `name` and `rules` fields, grammars have a few other optional public fields that influence the behavior of the parser.
* **`extras`** - an array of tokens that may appear *anywhere* in the language. This is often used for whitespace and comments. The default value of `extras` is to accept whitespace. To control whitespace explicitly, specify `extras: $ => []` in your grammar.
* **`inline`** - an array of rule names that should be automatically *removed* from the grammar by replacing all of their usages with a copy of their definition. This is useful for rules that are used in multiple places but for which you *don't* want to create syntax tree nodes at runtime.
* **`conflicts`** - an array of arrays of rule names. Each inner array represents a set of rules that's involved in an *LR(1) conflict* that is *intended to exist* in the grammar. When these conflicts occur at runtime, Tree-sitter will use the GLR algorithm to explore all of the possible interpretations. If *multiple* parses end up succeeding, Tree-sitter will pick the subtree whose corresponding rule has the highest total *dynamic precedence*.
* **`externals`** - an array of token names which can be returned by an [*external scanner*](#external-scanners). External scanners allow you to write custom C code which runs during the lexing process in order to handle lexical rules (e.g. Python's indentation tokens) that cannot be described by regular expressions.
* **`word`** - the name of a token that will match keywords for the purpose of the [keyword extraction](#keyword-extraction) optimization.
### Adjusting existing grammars
Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. You might try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. To illustrate the problem with this approach, consider the following line of code:
Imagine that you were just starting work on the [Tree-sitter JavaScript parser][tree-sitter-javascript]. Naively, you might try to directly mirror the structure of the [ECMAScript Language Spec][ecmascript-spec]. To illustrate the problem with this approach, consider the following line of code:
```js
return x + y;
@ -293,9 +361,9 @@ MemberExpression -> PrimaryExpression
PrimaryExpression -> IdentifierReference
```
The language spec encodes the 20 precedence levels of JavaScript expressions using 20 different non-terminal symbols. If we were to create a concrete syntax tree representing this statement according to the language spec, it would have twenty levels of nesting and it would contain nodes with names like `BitwiseXORExpression`, which are unrelated to the actual code.
The language spec encodes the twenty different precedence levels of JavaScript expressions using twenty levels of indirection between `IdentifierReference` and `Expression`. If we were to create a concrete syntax tree representing this statement according to the language spec, it would have twenty levels of nesting, and it would contain nodes with names like `BitwiseXORExpression`, which are unrelated to the actual code.
### Using precedence
### Using Precedence
To produce a readable syntax tree, we'd like to model JavaScript expressions using a much flatter structure like this:
@ -358,7 +426,7 @@ For an expression like `-a * b`, it's not clear whether the `-` operator applies
}
```
### Using associativity
### Using Associativity
Applying a higher precedence in `unary_expression` fixes that conflict, but there is still another conflict:
@ -392,13 +460,25 @@ For an expression like `a * b * c`, it's not clear whether we mean `a * (b * c)`
}
```
### Hiding rules
### Hiding Rules
You may have noticed in the above examples that some of the grammar rule names like `_expression` and `_type` began with an underscore. Starting a rule's name with an underscore causes the rule to be *hidden* in the syntax tree. This is useful for rules like `_expression` in the grammars above, which always just wrap a single child node. If these nodes were not hidden, they would add substantial depth and noise to the syntax tree without making it any easier to understand.
## LR conflicts
### Using Fields
...
Often, it's easier to analyze a syntax nodes if you can refer to its children by *name* instead of by their position in an ordered list. Tree-sitter grammars support this using the `field` function. This function allows you to assign unique names to some or all of a node's children:
```js
function_definition: $ => seq(
'func',
field('name', $.identifier),
field('parameters', $.parameter_list),
field('return_type', $._type),
field('body', $.block)
)
```
Adding fields like this allows you to retrieve nodes using the [field APIs][field-names-section].
## Lexical Analysis
@ -410,7 +490,7 @@ Grammars often contain multiple tokens that can match the same characters. For e
1. **Context-aware Lexing** - Tree-sitter performs lexing on-demand, during the parsing process. At any given position in a source document, the lexer only tries to recognize tokens that are *valid* at that position in the document.
2. **Lexical Precedence** - When the precedence functions described [above](#using-the-grammar-dsl) are used within the `token` function, the given precedence values serve as instructions to the lexer. If there are two valid tokens that match the characters at a given position in the document, Tree-sitter will select the one with the higher precedence.
2. **Lexical Precedence** - When the precedence functions described [above](#the-grammar-dsl) are used within the `token` function, the given precedence values serve as instructions to the lexer. If there are two valid tokens that match the characters at a given position in the document, Tree-sitter will select the one with the higher precedence.
3. **Match Length** - If multiple valid tokens with the same precedence match the characters at a given position in a document, Tree-sitter will select the token that matches the [longest sequence of characters][longest-match].
@ -623,6 +703,7 @@ if (valid_symbols[INDENT] || valid_symbol[DEDENT]) {
[lr-grammars]: https://en.wikipedia.org/wiki/LR_parser
[multi-language-section]: ./using-parsers#multi-language-documents
[named-vs-anonymous-nodes-section]: ./using-parsers#named-vs-anonymous-nodes
[field-names-section]: ./using-parsers#node-field-names
[nan]: https://github.com/nodejs/nan
[node-module]: https://www.npmjs.com/package/tree-sitter-cli
[node.js]: https://nodejs.org

View file

@ -1,8 +1,7 @@
use super::{escape, load_property_sheet, Error, Highlight, HighlightEvent, Highlighter, Properties};
use super::{load_property_sheet, Error, Highlight, Highlighter, HtmlRenderer, Properties};
use regex::Regex;
use std::collections::HashMap;
use std::ffi::CStr;
use std::io::Write;
use std::os::raw::c_char;
use std::process::abort;
use std::sync::atomic::AtomicUsize;
@ -20,10 +19,7 @@ pub struct TSHighlighter {
attribute_strings: Vec<&'static [u8]>,
}
pub struct TSHighlightBuffer {
html: Vec<u8>,
line_offsets: Vec<u32>,
}
pub struct TSHighlightBuffer(HtmlRenderer);
#[repr(C)]
pub enum ErrorCode {
@ -57,10 +53,7 @@ pub extern "C" fn ts_highlighter_new(
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_new() -> *mut TSHighlightBuffer {
Box::into_raw(Box::new(TSHighlightBuffer {
html: Vec::new(),
line_offsets: Vec::new(),
}))
Box::into_raw(Box::new(TSHighlightBuffer(HtmlRenderer::new())))
}
#[no_mangle]
@ -76,25 +69,25 @@ pub extern "C" fn ts_highlight_buffer_delete(this: *mut TSHighlightBuffer) {
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_content(this: *const TSHighlightBuffer) -> *const u8 {
let this = unwrap_ptr(this);
this.html.as_slice().as_ptr()
this.0.html.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_offsets(this: *const TSHighlightBuffer) -> *const u32 {
let this = unwrap_ptr(this);
this.line_offsets.as_slice().as_ptr()
this.0.line_offsets.as_slice().as_ptr()
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_len(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
this.html.len() as u32
this.0.html.len() as u32
}
#[no_mangle]
pub extern "C" fn ts_highlight_buffer_line_count(this: *const TSHighlightBuffer) -> u32 {
let this = unwrap_ptr(this);
this.line_offsets.len() as u32
this.0.line_offsets.len() as u32
}
#[no_mangle]
@ -183,77 +176,28 @@ impl TSHighlighter {
);
if let Ok(highlighter) = highlighter {
output.html.clear();
output.line_offsets.clear();
output.line_offsets.push(0);
let mut highlights = Vec::new();
for event in highlighter {
match event {
Ok(HighlightEvent::HighlightStart(s)) => {
highlights.push(s);
output.start_highlight(s, &self.attribute_strings);
}
Ok(HighlightEvent::HighlightEnd) => {
highlights.pop();
output.end_highlight();
}
Ok(HighlightEvent::Source(src)) => {
output.add_text(src, &highlights, &self.attribute_strings);
},
Err(Error::Cancelled) => {
return ErrorCode::Timeout;
},
Err(Error::InvalidLanguage) => {
return ErrorCode::InvalidLanguage;
},
Err(Error::Unknown) => {
return ErrorCode::Timeout;
}
output.0.reset();
let result = output.0.render(highlighter, source_code, &|s| {
self.attribute_strings[s as usize]
});
match result {
Err(Error::Cancelled) => {
return ErrorCode::Timeout;
}
Err(Error::InvalidLanguage) => {
return ErrorCode::InvalidLanguage;
}
Err(Error::Unknown) => {
return ErrorCode::Timeout;
}
Ok(()) => ErrorCode::Ok,
}
ErrorCode::Ok
} else {
ErrorCode::Timeout
}
}
}
impl TSHighlightBuffer {
fn start_highlight(&mut self, h: Highlight, attribute_strings: &[&[u8]]) {
let attribute_string = attribute_strings[h as usize];
self.html.extend(b"<span");
if !attribute_string.is_empty() {
self.html.extend(b" ");
self.html.extend(attribute_string);
}
self.html.extend(b">");
}
fn end_highlight(&mut self) {
self.html.extend(b"</span>");
}
fn finish_line(&mut self) {
self.line_offsets.push(self.html.len() as u32);
}
fn add_text(&mut self, src: &str, highlights: &Vec<Highlight>, attribute_strings: &[&[u8]]) {
let mut multiline = false;
for line in src.split('\n') {
let line = line.trim_end_matches('\r');
if multiline {
highlights.iter().for_each(|_| self.end_highlight());
self.finish_line();
highlights
.iter()
.for_each(|scope| self.start_highlight(*scope, attribute_strings));
}
write!(&mut self.html, "{}", escape::Escape(line)).unwrap();
multiline = true;
}
}
}
fn unwrap_ptr<'a, T>(result: *const T) -> &'a T {
unsafe { result.as_ref() }.unwrap_or_else(|| {
eprintln!("{}:{} - pointer must not be null", file!(), line!());

View file

@ -1,53 +0,0 @@
// Copyright 2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! HTML Escaping
//!
//! This module contains one unit-struct which can be used to HTML-escape a
//! string of text (for use in a format string).
use std::fmt;
/// Wrapper struct which will emit the HTML-escaped version of the contained
/// string when passed to a format string.
pub struct Escape<'a>(pub &'a str);
impl<'a> fmt::Display for Escape<'a> {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
// Because the internet is always right, turns out there's not that many
// characters to escape: http://stackoverflow.com/questions/7381974
let Escape(s) = *self;
let pile_o_bits = s;
let mut last = 0;
for (i, ch) in s.bytes().enumerate() {
match ch as char {
'<' | '>' | '&' | '\'' | '"' => {
fmt.write_str(&pile_o_bits[last..i])?;
let s = match ch as char {
'>' => "&gt;",
'<' => "&lt;",
'&' => "&amp;",
'\'' => "&#39;",
'"' => "&quot;",
_ => unreachable!(),
};
fmt.write_str(s)?;
last = i + 1;
}
_ => {}
}
}
if last < s.len() {
fmt.write_str(&pile_o_bits[last..])?;
}
Ok(())
}
}

View file

@ -1,13 +1,12 @@
pub mod c_lib;
mod escape;
pub mod util;
pub use c_lib as c;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_derive::*;
use std::fmt::{self, Write};
use std::mem::transmute;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::{cmp, str, usize};
use std::{cmp, fmt, str, usize};
use tree_sitter::{Language, Node, Parser, Point, PropertySheet, Range, Tree, TreePropertyCursor};
const CANCELLATION_CHECK_INTERVAL: usize = 100;
@ -116,14 +115,13 @@ where
parser: Parser,
layers: Vec<Layer<'a>>,
max_opaque_layer_depth: usize,
utf8_error_len: Option<usize>,
operation_count: usize,
cancellation_flag: Option<&'a AtomicUsize>,
}
#[derive(Copy, Clone, Debug)]
pub enum HighlightEvent<'a> {
Source(&'a str),
pub enum HighlightEvent {
Source { start: usize, end: usize },
HighlightStart(Highlight),
HighlightEnd,
}
@ -471,7 +469,6 @@ where
injection_callback,
source_offset: 0,
operation_count: 0,
utf8_error_len: None,
max_opaque_layer_depth: 0,
layers: vec![Layer::new(
source,
@ -489,30 +486,13 @@ where
})
}
fn emit_source(&mut self, next_offset: usize) -> Option<Result<HighlightEvent<'a>, Error>> {
let input = &self.source[self.source_offset..next_offset];
match str::from_utf8(input) {
Ok(valid) => {
self.source_offset = next_offset;
Some(Ok(HighlightEvent::Source(valid)))
}
Err(error) => {
if let Some(error_len) = error.error_len() {
if error.valid_up_to() > 0 {
let prefix = &input[0..error.valid_up_to()];
self.utf8_error_len = Some(error_len);
Some(Ok(HighlightEvent::Source(unsafe {
str::from_utf8_unchecked(prefix)
})))
} else {
self.source_offset += error_len;
Some(Ok(HighlightEvent::Source("\u{FFFD}")))
}
} else {
None
}
}
}
fn emit_source(&mut self, next_offset: usize) -> HighlightEvent {
let result = HighlightEvent::Source {
start: self.source_offset,
end: next_offset,
};
self.source_offset = next_offset;
result
}
fn process_tree_step(&self, step: &TreeStep, nodes: &mut Vec<Node>) {
@ -727,7 +707,7 @@ impl<'a, T> Iterator for Highlighter<'a, T>
where
T: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)>,
{
type Item = Result<HighlightEvent<'a>, Error>;
type Item = Result<HighlightEvent, Error>;
fn next(&mut self) -> Option<Self::Item> {
if let Some(cancellation_flag) = self.cancellation_flag {
@ -740,11 +720,6 @@ where
}
}
if let Some(utf8_error_len) = self.utf8_error_len.take() {
self.source_offset += utf8_error_len;
return Some(Ok(HighlightEvent::Source("\u{FFFD}")));
}
while !self.layers.is_empty() {
let mut scope_event = None;
let first_layer = &self.layers[0];
@ -808,7 +783,7 @@ where
// Before returning any highlight boundaries, return any remaining slice of
// the source code the precedes that highlight boundary.
if self.source_offset < next_offset {
return self.emit_source(next_offset);
return Some(Ok(self.emit_source(next_offset)));
}
scope_event = if first_layer.at_node_end {
@ -841,7 +816,7 @@ where
}
if self.source_offset < self.source.len() {
self.emit_source(self.source.len())
Some(Ok(self.emit_source(self.source.len())))
} else {
None
}
@ -1081,7 +1056,7 @@ pub fn highlight<'a, F>(
property_sheet: &'a PropertySheet<Properties>,
cancellation_flag: Option<&'a AtomicUsize>,
injection_callback: F,
) -> Result<impl Iterator<Item = Result<HighlightEvent<'a>, Error>> + 'a, Error>
) -> Result<impl Iterator<Item = Result<HighlightEvent, Error>> + 'a, Error>
where
F: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)> + 'a,
{
@ -1106,87 +1081,124 @@ where
F1: Fn(&str) -> Option<(Language, &'a PropertySheet<Properties>)>,
F2: Fn(Highlight) -> &'a str,
{
let highlighter = Highlighter::new(
let mut renderer = HtmlRenderer::new();
renderer.render(
Highlighter::new(
source,
language,
property_sheet,
injection_callback,
cancellation_flag,
)?,
source,
language,
property_sheet,
injection_callback,
cancellation_flag,
&|s| (attribute_callback)(s).as_bytes(),
)?;
let mut renderer = HtmlRenderer::new(attribute_callback);
let mut scopes = Vec::new();
for event in highlighter {
let event = event?;
match event {
HighlightEvent::HighlightStart(s) => {
scopes.push(s);
renderer.start_scope(s);
}
HighlightEvent::HighlightEnd => {
scopes.pop();
renderer.end_scope();
}
HighlightEvent::Source(src) => {
renderer.add_text(src, &scopes);
}
};
}
if !renderer.current_line.is_empty() {
renderer.finish_line();
}
Ok(renderer.result)
Ok(renderer
.line_offsets
.iter()
.enumerate()
.map(|(i, offset)| {
let offset = *offset as usize;
let next_offset = renderer
.line_offsets
.get(i + 1)
.map_or(renderer.html.len(), |i| *i as usize);
String::from_utf8(renderer.html[offset..next_offset].to_vec()).unwrap()
})
.collect())
}
struct HtmlRenderer<'a, F: Fn(Highlight) -> &'a str> {
result: Vec<String>,
current_line: String,
attribute_callback: F,
pub struct HtmlRenderer {
pub html: Vec<u8>,
pub line_offsets: Vec<u32>,
}
impl<'a, F> HtmlRenderer<'a, F>
where
F: Fn(Highlight) -> &'a str,
{
fn new(attribute_callback: F) -> Self {
impl HtmlRenderer {
fn new() -> Self {
HtmlRenderer {
result: Vec::new(),
current_line: String::new(),
attribute_callback,
html: Vec::new(),
line_offsets: vec![0],
}
}
fn start_scope(&mut self, s: Highlight) {
write!(
&mut self.current_line,
"<span {}>",
(self.attribute_callback)(s),
)
.unwrap();
/// Clear all rendered output so this renderer can be reused for another
/// document. Restores the state produced by `new`: empty HTML buffer and a
/// single sentinel line offset of 0 (the start of the first line).
pub fn reset(&mut self) {
self.html.clear();
self.line_offsets.clear();
self.line_offsets.push(0);
}
fn end_scope(&mut self) {
write!(&mut self.current_line, "</span>").unwrap();
}
fn finish_line(&mut self) {
self.current_line.push('\n');
self.result.push(self.current_line.clone());
self.current_line.clear();
}
fn add_text(&mut self, src: &str, scopes: &Vec<Highlight>) {
let mut multiline = false;
for line in src.split('\n') {
let line = line.trim_end_matches('\r');
if multiline {
scopes.iter().for_each(|_| self.end_scope());
self.finish_line();
scopes
.iter()
.for_each(|highlight| self.start_scope(*highlight));
/// Render highlighted HTML from a stream of highlight events.
///
/// `highlighter` yields the event stream for `source`; `attribute_callback`
/// maps each `Highlight` to the raw attribute bytes (e.g. `class="..."`)
/// placed inside the opening `<span>` tag. Output accumulates in
/// `self.html` and `self.line_offsets`, so call `reset` first when reusing
/// a renderer. Returns the first error produced by the event stream.
pub fn render<'a, F>(
&mut self,
highlighter: impl Iterator<Item = Result<HighlightEvent, Error>>,
source: &'a [u8],
attribute_callback: &F,
) -> Result<(), Error>
where
F: Fn(Highlight) -> &'a [u8],
{
// Stack of currently-open highlights; `add_text` consults it to close
// and reopen spans around newlines so each output line is well-formed.
let mut highlights = Vec::new();
for event in highlighter {
match event {
Ok(HighlightEvent::HighlightStart(s)) => {
highlights.push(s);
self.start_highlight(s, attribute_callback);
}
Ok(HighlightEvent::HighlightEnd) => {
highlights.pop();
self.end_highlight();
}
// Source events carry byte offsets into `source` rather than
// text, so the renderer slices the original input directly.
Ok(HighlightEvent::Source { start, end }) => {
self.add_text(&source[start..end], &highlights, attribute_callback);
}
Err(a) => return Err(a),
}
}
// Normalize the tail of the output: guarantee a trailing newline, and
// drop a final offset that points past the end (an empty last line).
if self.html.last() != Some(&b'\n') {
self.html.push(b'\n');
}
if self.line_offsets.last() == Some(&(self.html.len() as u32)) {
self.line_offsets.pop();
}
Ok(())
}
/// Emit the opening `<span>` tag for highlight `h`, including the
/// caller-provided attribute bytes (separated by a space) when non-empty.
fn start_highlight<'a, F>(&mut self, h: Highlight, attribute_callback: &F)
where
F: Fn(Highlight) -> &'a [u8],
{
let attribute_string = (attribute_callback)(h);
self.html.extend(b"<span");
if !attribute_string.is_empty() {
self.html.extend(b" ");
self.html.extend(attribute_string);
}
self.html.extend(b">");
}
/// Emit the closing tag for the most recently opened highlight span.
fn end_highlight(&mut self) {
self.html.extend(b"</span>");
}
fn add_text<'a, F>(&mut self, src: &[u8], highlights: &Vec<Highlight>, attribute_callback: &F)
where
F: Fn(Highlight) -> &'a [u8],
{
for c in util::LossyUtf8::new(src).flat_map(|p| p.bytes()) {
if c == b'\n' {
if self.html.ends_with(b"\r") {
self.html.pop();
}
highlights.iter().for_each(|_| self.end_highlight());
self.html.push(c);
self.line_offsets.push(self.html.len() as u32);
highlights
.iter()
.for_each(|scope| self.start_highlight(*scope, attribute_callback));
} else if let Some(escape) = util::html_escape(c) {
self.html.extend_from_slice(escape);
} else {
self.html.push(c);
}
write!(&mut self.current_line, "{}", escape::Escape(line)).unwrap();
multiline = true;
}
}
}

63
highlight/src/util.rs Normal file
View file

@ -0,0 +1,63 @@
use std::str;
/// Iterator that decodes a byte slice as UTF-8, yielding each maximal run
/// of valid bytes as a `&str` and a U+FFFD replacement string for every
/// invalid sequence found between runs.
///
/// NOTE(review): a *truncated* multi-byte sequence at the very end of the
/// input terminates iteration without yielding the valid bytes that precede
/// it — this matches the original behavior and is preserved here.
pub struct LossyUtf8<'a> {
    // Bytes not yet yielded.
    bytes: &'a [u8],
    // True when the next call must yield "\u{fffd}" for an invalid
    // sequence that was skipped while yielding the preceding valid chunk.
    in_replacement: bool,
}

impl<'a> LossyUtf8<'a> {
    /// Create an iterator over the lossily-decoded chunks of `bytes`.
    pub fn new(bytes: &'a [u8]) -> Self {
        LossyUtf8 {
            bytes,
            in_replacement: false,
        }
    }
}

impl<'a> Iterator for LossyUtf8<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<&'a str> {
        if self.bytes.is_empty() {
            return None;
        }

        // A replacement was scheduled by the previous call; emit it now.
        if self.in_replacement {
            self.in_replacement = false;
            return Some("\u{fffd}");
        }

        match str::from_utf8(self.bytes) {
            // Everything that remains is valid: yield it all and finish.
            Ok(valid) => {
                self.bytes = &[];
                Some(valid)
            }
            Err(error) => {
                let invalid_len = match error.error_len() {
                    Some(len) => len,
                    // Truncated sequence at end of input: stop iterating.
                    None => return None,
                };
                let valid_len = error.valid_up_to();
                if valid_len == 0 {
                    // Invalid bytes at the front: skip them and yield a
                    // replacement character immediately.
                    self.bytes = &self.bytes[invalid_len..];
                    Some("\u{fffd}")
                } else {
                    // SAFETY: `from_utf8` just verified that the first
                    // `valid_len` bytes of `self.bytes` are valid UTF-8.
                    let chunk =
                        unsafe { str::from_utf8_unchecked(&self.bytes[..valid_len]) };
                    self.bytes = &self.bytes[valid_len + invalid_len..];
                    self.in_replacement = true;
                    Some(chunk)
                }
            }
        }
    }
}
/// Return the HTML entity for `c` when it is one of the five characters
/// that must be escaped in HTML output, or `None` for every other byte.
pub fn html_escape(c: u8) -> Option<&'static [u8]> {
    let entity: &'static [u8] = match c {
        b'>' => b"&gt;",
        b'<' => b"&lt;",
        b'&' => b"&amp;",
        b'\'' => b"&#39;",
        b'"' => b"&quot;",
        _ => return None,
    };
    Some(entity)
}

View file

@ -331,6 +331,11 @@ void ts_node_named_child_wasm(const TSTree *tree, uint32_t index) {
marshal_node(TRANSFER_BUFFER, ts_node_named_child(node, index));
}
// WASM binding: look up the child of the node (unmarshalled from the shared
// transfer buffer) that carries the given field id, and marshal the result
// back into the transfer buffer for the JavaScript side to read.
void ts_node_child_by_field_id_wasm(const TSTree *tree, uint32_t field_id) {
TSNode node = unmarshal_node(tree);
marshal_node(TRANSFER_BUFFER, ts_node_child_by_field_id(node, field_id));
}
void ts_node_next_sibling_wasm(const TSTree *tree) {
TSNode node = unmarshal_node(tree);
marshal_node(TRANSFER_BUFFER, ts_node_next_sibling(node));

View file

@ -302,6 +302,17 @@ class Node {
return unmarshalNode(this.tree);
}
// Return this node's child for the given numeric field id. The node is
// marshalled into wasm memory, the C binding performs the lookup, and the
// result is unmarshalled back (presumably null/undefined when no child has
// that field — behavior comes from unmarshalNode; confirm against it).
childForFieldId(fieldId) {
marshalNode(this);
C._ts_node_child_by_field_id_wasm(this.tree[0], fieldId);
return unmarshalNode(this.tree);
}
// Resolve `fieldName` to its numeric id via the language's field table and
// delegate to childForFieldId. When the name is unknown (indexOf yields -1)
// the method falls through and returns undefined.
childForFieldName(fieldName) {
const fieldId = this.tree.language.fields.indexOf(fieldName);
if (fieldId !== -1) return this.childForFieldId(fieldId);
}
get childCount() {
marshalNode(this);
return C._ts_node_child_count_wasm(this.tree[0]);
@ -586,10 +597,13 @@ class TreeCursor {
return unmarshalNode(this.tree);
}
currentFieldName() {
// Numeric field id associated with the cursor's current node, as reported
// by the C tree-cursor binding.
currentFieldId() {
marshalTreeCursor(this);
return C._ts_tree_cursor_current_field_id_wasm(this.tree[0]);
}
// Field name for the cursor's current node: looks currentFieldId up in the
// language's field table (undefined when the id has no entry there).
currentFieldName() {
return this.tree.language.fields[this.currentFieldId()];
}
gotoFirstChild() {
@ -641,6 +655,23 @@ class Language {
return C._ts_language_version(this[0]);
}
// Number of fields this language defines. One entry of `fields` is
// excluded from the count — presumably a placeholder at index 0 for
// "no field"; verify against the language-loading code.
get fieldCount() {
return this.fields.length - 1;
}
fieldIdForName(fieldName) {
const result = this.fields.indexOf(fieldName);
if (result !== -1) {
return result;
} else {
return null;
}
}
fieldNameForId(fieldId) {
return this.fields[fieldName] || null;
}
static load(url) {
let bytes;
if (

View file

@ -36,6 +36,7 @@
"_ts_language_symbol_name",
"_ts_language_symbol_type",
"_ts_language_version",
"_ts_node_child_by_field_id_wasm",
"_ts_node_child_count_wasm",
"_ts_node_child_wasm",
"_ts_node_children_wasm",

View file

@ -116,6 +116,35 @@ describe("Node", () => {
})
});
describe('.childForFieldName()', () => {
it('returns null when the node has no children', () => {
tree = parser.parse("class A { b() {} }");
const classNode = tree.rootNode.firstChild;
assert.equal(classNode.type, 'class_declaration');
const classNameNode = classNode.childForFieldName('name');
assert.equal(classNameNode.type, 'identifier');
assert.equal(classNameNode.text, 'A');
const bodyNode = classNode.childForFieldName('body');
assert.equal(bodyNode.type, 'class_body');
assert.equal(bodyNode.text, '{ b() {} }');
const methodNode = bodyNode.firstNamedChild;
assert.equal(methodNode.type, 'method_definition');
assert.equal(methodNode.text, 'b() {}');
const methodNameNode = methodNode.childForFieldName('name');
assert.equal(methodNameNode.type, 'property_identifier');
assert.equal(methodNameNode.text, 'b');
const paramsNode = methodNode.childForFieldName('parameters');
assert.equal(paramsNode.type, 'formal_parameters');
assert.equal(paramsNode.text, '()');
});
});
describe(".nextSibling and .previousSibling", () => {
it("returns the node's next and previous sibling", () => {
tree = parser.parse("x10 + 1000");

View file

@ -71,6 +71,8 @@ declare module 'web-tree-sitter' {
toString(): string;
child(index: number): SyntaxNode | null;
namedChild(index: number): SyntaxNode | null;
childForFieldId(fieldId: number): SyntaxNode | null;
childForFieldName(fieldName: string): SyntaxNode | null;
descendantForIndex(index: number): SyntaxNode;
descendantForIndex(startIndex: number, endIndex: number): SyntaxNode;
@ -97,6 +99,7 @@ declare module 'web-tree-sitter' {
reset(node: SyntaxNode): void;
delete(): void;
currentNode(): SyntaxNode;
currentFieldId(): number;
currentFieldName(): string;
gotoParent(): boolean;
gotoFirstChild(): boolean;
@ -115,8 +118,15 @@ declare module 'web-tree-sitter' {
getEditedRange(other: Tree): Range;
getLanguage(): any;
}
namespace Language {
function load(url: string): Promise<any>
class Language {
static load(): Promise<Language>;
readonly version: number;
readonly fieldCount: number;
fieldNameForId(fieldId: number): string | null
fieldIdForName(fieldName: string): number | null
}
}

View file

@ -273,6 +273,7 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
while (field_map < field_map_end) {
if (
!ts_subtree_extra(*entry->subtree) &&
!field_map->inherited &&
field_map->child_index == entry->structural_child_index
) return field_map->field_id;

View file

@ -91,6 +91,7 @@ $emcc \
$emscripten_flags \
-std=c99 \
-D 'fprintf(...)=' \
-D NDEBUG= \
-I lib/src \
-I lib/include \
-I lib/utf8proc \