diff --git a/docs/section-2-using-parsers.md b/docs/section-2-using-parsers.md
index ee151426..499ddce6 100644
--- a/docs/section-2-using-parsers.md
+++ b/docs/section-2-using-parsers.md
@@ -27,7 +27,7 @@ Here's an example of a simple C program that uses the Tree-sitter [JSON parser](
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
-#include "tree_sitter/runtime.h"
+#include <tree_sitter/api.h>
 
 // Declare the `tree_sitter_json` function, which is
 // implemented by the `tree-sitter-json` library.
@@ -236,6 +236,110 @@ void ts_node_edit(TSNode *, const TSInputEdit *);
 
 Then, you can call `ts_parser_parse` again, passing in the old tree. This will create a new tree that internally shares structure with the old tree.
 
+## Multi-language Documents
+
+Sometimes, different parts of a file may be written in different languages. For example, templating languages like [EJS](http://ejs.co) and [ERB](https://ruby-doc.org/stdlib-2.5.1/libdoc/erb/rdoc/ERB.html) allow you to generate HTML by writing a mixture of HTML and another language like JavaScript or Ruby.
+
+Tree-sitter handles these types of documents by allowing you to create a syntax tree based on the text in certain *ranges* of a file.
+
+```c
+typedef struct {
+  TSPoint start_point;
+  TSPoint end_point;
+  uint32_t start_byte;
+  uint32_t end_byte;
+} TSRange;
+
+void ts_parser_set_included_ranges(
+  TSParser *self,
+  const TSRange *ranges,
+  uint32_t range_count
+);
+```
+
+For example, consider this ERB document:
+
+```erb
+<ul>
+  <% people.each do |person| %>
+    <li><%= person.name %></li>
+  <% end %>
+</ul>
+``` + +Conceptually, it can be represented by three syntax trees with overlapping ranges: an ERB syntax tree, a Ruby syntax tree, and an HTML syntax tree. You could generate these syntax trees as follows: + +```c +#include +#include + +// These functions are each implemented in their own repo. +const TSLanguage *tree_sitter_embedded_template(); +const TSLanguage *tree_sitter_html(); +const TSLanguage *tree_sitter_ruby(); + +int main(int argc, const char **argv) { + const char *text = argv[1]; + unsigned len = strlen(src); + + // Parse the entire text as ERB. + TSParser *parser = ts_parser_new(); + ts_parser_set_language(parser, tree_sitter_embedded_template()); + TSTree *erb_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode erb_root_node = ts_tree_root_node(erb_tree); + + // Find the ranges of the `content` nodes, which represent + // the underlying HTML, and the `code` nodes, which represent + // the interpolated Ruby. + TSRange html_ranges[10]; + TSRange ruby_ranges[10]; + unsigned html_range_count = 0; + unsigned ruby_range_count = 0; + unsigned child_count = ts_node_child_count(erb_root_node); + + for (unsigned i = 0; i < child_count; i++) { + TSNode node = ts_node_child(erb_root_node, i); + if (strcmp(ts_node_type(node), "content") == 0) { + html_ranges[html_range_count++] = (TSRange) { + ts_node_start_point(node), + ts_node_end_point(node), + ts_node_start_byte(node), + ts_node_end_byte(node), + }; + } else { + TSNode code_node = ts_node_named_child(node, 0); + ruby_ranges[ruby_range_count++] = (TSRange) { + ts_node_start_point(code_node), + ts_node_end_point(code_node), + ts_node_start_byte(code_node), + ts_node_end_byte(code_node), + }; + } + } + + // Use the HTML ranges to parse the HTML. 
+ ts_parser_set_language(parser, tree_sitter_html()); + ts_parser_set_included_ranges(parser, html_ranges, html_range_count); + TSTree *html_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode html_root_node = ts_tree_root_node(html_tree); + + // Use the Ruby ranges to parse the Ruby. + ts_parser_set_language(parser, tree_sitter_ruby()); + ts_parser_set_included_ranges(parser, ruby_ranges, ruby_range_count); + TSTree *ruby_tree = ts_parser_parse_string(parser, NULL, text, len); + TSNode ruby_root_node = ts_tree_root_node(ruby_tree); + + // Print all three trees. + char *erb_sexp = ts_node_string(erb_root_node); + char *html_sexp = ts_node_string(html_root_node); + char *ruby_sexp = ts_node_string(ruby_root_node); + printf("ERB: %s\n", erb_sexp); + printf("HTML: %s\n", html_sexp); + printf("Ruby: %s\n", ruby_sexp); + return 0; +} +``` + ## Concurrency Tree-sitter supports multi-threaded use cases by making syntax trees very cheap to copy.