From 1e79ed794be6012b6492f262fd5b80b4de72042a Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Fri, 8 Aug 2014 23:58:59 -0700 Subject: [PATCH] Allow multiple top-level nodes Now, the root node of a document is always a document node. It will often have only one child node which corresponds to the grammar's start symbol, but not always. Currently, it may have more than one child if there are ubiquitous tokens such as comments at the beginning of the document. In the future, it will also be possible for the document to have multiple children if the document is partially parsed. --- examples/parsers/arithmetic.c | 5 ++- examples/parsers/golang.c | 5 ++- examples/parsers/javascript.c | 5 ++- examples/parsers/json.c | 6 ++- include/tree_sitter/runtime.h | 3 +- spec/runtime/document_spec.cc | 8 ++-- spec/runtime/languages/arithmetic/errors.txt | 2 +- .../runtime/languages/javascript/literals.txt | 4 +- spec/runtime/languages/json/errors.txt | 2 +- spec/runtime/languages/language_specs.cc | 2 +- spec/runtime/node_spec.cc | 39 ++++++++++++------- .../build_tables/build_parse_table.cc | 1 + src/compiler/generate_code/c_code.cc | 21 ++++++---- src/compiler/rules/built_in_symbols.cc | 1 + src/compiler/rules/built_in_symbols.h | 1 + src/runtime/parser.c | 10 ++++- src/runtime/stack.c | 2 + 17 files changed, 78 insertions(+), 39 deletions(-) diff --git a/examples/parsers/arithmetic.c b/examples/parsers/arithmetic.c index 7e79db8f..e644fd75 100644 --- a/examples/parsers/arithmetic.c +++ b/examples/parsers/arithmetic.c @@ -1,10 +1,10 @@ #include "tree_sitter/parser.h" #define STATE_COUNT 32 -#define SYMBOL_COUNT 18 +#define SYMBOL_COUNT 19 enum { - ts_sym_expression = ts_start_sym, + ts_sym_expression = ts_builtin_sym_start, ts_sym_sum, ts_sym_difference, ts_sym_product, @@ -23,6 +23,7 @@ enum { }; SYMBOL_NAMES = { + [ts_builtin_sym_document] = "DOCUMENT", [ts_sym_expression] = "expression", [ts_sym_sum] = "sum", [ts_sym_difference] = "difference", 
diff --git a/examples/parsers/golang.c b/examples/parsers/golang.c index cbc6a529..2b171aaf 100644 --- a/examples/parsers/golang.c +++ b/examples/parsers/golang.c @@ -1,10 +1,10 @@ #include "tree_sitter/parser.h" #define STATE_COUNT 372 -#define SYMBOL_COUNT 87 +#define SYMBOL_COUNT 88 enum { - ts_sym_program = ts_start_sym, + ts_sym_program = ts_builtin_sym_start, ts_sym_package_directive, ts_sym_imports_block, ts_sym_package_import, @@ -92,6 +92,7 @@ enum { }; SYMBOL_NAMES = { + [ts_builtin_sym_document] = "DOCUMENT", [ts_sym_program] = "program", [ts_sym_package_directive] = "package_directive", [ts_sym_imports_block] = "imports_block", diff --git a/examples/parsers/javascript.c b/examples/parsers/javascript.c index bc2a6a13..06fe2a49 100644 --- a/examples/parsers/javascript.c +++ b/examples/parsers/javascript.c @@ -1,10 +1,10 @@ #include "tree_sitter/parser.h" #define STATE_COUNT 2212 -#define SYMBOL_COUNT 109 +#define SYMBOL_COUNT 110 enum { - ts_sym_program = ts_start_sym, + ts_sym_program = ts_builtin_sym_start, ts_sym_statement, ts_sym_statement_block, ts_sym_for_statement, @@ -114,6 +114,7 @@ enum { }; SYMBOL_NAMES = { + [ts_builtin_sym_document] = "DOCUMENT", [ts_sym_program] = "program", [ts_sym_statement] = "statement", [ts_sym_statement_block] = "statement_block", diff --git a/examples/parsers/json.c b/examples/parsers/json.c index aabf1c69..af4c2a42 100644 --- a/examples/parsers/json.c +++ b/examples/parsers/json.c @@ -1,10 +1,10 @@ #include "tree_sitter/parser.h" #define STATE_COUNT 60 -#define SYMBOL_COUNT 18 +#define SYMBOL_COUNT 19 enum { - ts_sym_value = ts_start_sym, + ts_sym_value = ts_builtin_sym_start, ts_sym_object, ts_sym_array, ts_sym_string, @@ -23,6 +23,7 @@ enum { }; SYMBOL_NAMES = { + [ts_builtin_sym_document] = "DOCUMENT", [ts_sym_value] = "value", [ts_sym_object] = "object", [ts_sym_array] = "array", @@ -319,6 +320,7 @@ LEX_FN() { ADVANCE(27); LEX_ERROR(); case ts_lex_state_error: + START_TOKEN(); if (lookahead == 0) ADVANCE(25); if 
(('\t' <= lookahead && lookahead <= '\n') || diff --git a/include/tree_sitter/runtime.h b/include/tree_sitter/runtime.h index ece23a10..889e2054 100644 --- a/include/tree_sitter/runtime.h +++ b/include/tree_sitter/runtime.h @@ -50,7 +50,8 @@ TSNode *ts_document_root_node(const TSDocument *); #define ts_builtin_sym_error 0 #define ts_builtin_sym_end 1 -#define ts_start_sym 2 +#define ts_builtin_sym_document 2 +#define ts_builtin_sym_start 3 #ifdef __cplusplus } diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc index 795b8edf..b3a28178 100644 --- a/spec/runtime/document_spec.cc +++ b/spec/runtime/document_spec.cc @@ -41,7 +41,7 @@ describe("Document", [&]() { it("parses the document", [&]() { AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals( - "(object (string) (array (number) (number)))")); + "(DOCUMENT (object (string) (array (number) (number))))")); }); }); }); @@ -62,7 +62,7 @@ describe("Document", [&]() { it("parses the input", [&]() { AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals( - "(object (string) (array (number) (number)))")); + "(DOCUMENT (object (string) (array (number) (number))))")); }); it("reads the entire input", [&]() { @@ -82,7 +82,7 @@ describe("Document", [&]() { it("updates the parse tree", [&]() { AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals( - "(object (string) (array (number) (number)) (string) (number))")); + "(DOCUMENT (object (string) (array (number) (number)) (string) (number)))")); }); it("re-reads only the changed portion of the input", [&]() { @@ -102,7 +102,7 @@ describe("Document", [&]() { it("updates the parse tree", [&]() { AssertThat(string(ts_node_string(ts_document_root_node(doc))), Equals( - "(object (string) (number) (string) (array (number) (number)))")); + "(DOCUMENT (object (string) (number) (string) (array (number) (number))))")); }); it_skip("re-reads only the changed portion of the input", [&]() { diff --git 
a/spec/runtime/languages/arithmetic/errors.txt b/spec/runtime/languages/arithmetic/errors.txt index 56b0bca6..9e9319b1 100644 --- a/spec/runtime/languages/arithmetic/errors.txt +++ b/spec/runtime/languages/arithmetic/errors.txt @@ -3,7 +3,7 @@ recovers from errors at the top level ===================================================== x * * y --- -(ERROR '*') +(variable) (ERROR '*') ===================================================== recovers from errors inside parenthesized expressions diff --git a/spec/runtime/languages/javascript/literals.txt b/spec/runtime/languages/javascript/literals.txt index 41f7da00..1e9dfca2 100644 --- a/spec/runtime/languages/javascript/literals.txt +++ b/spec/runtime/languages/javascript/literals.txt @@ -39,7 +39,7 @@ var x = { (statement_block (var_declaration (identifier) (identifier))))))) ========================================== -parses comments. TODO - leading comments +parses comments ========================================== // this is the beginning of the script. // here we go. 
@@ -54,6 +54,8 @@ var thing = { } }; --- +(comment) +(comment) (program (var_declaration (identifier) (object (comment) diff --git a/spec/runtime/languages/json/errors.txt b/spec/runtime/languages/json/errors.txt index 567155f5..96a19a18 100644 --- a/spec/runtime/languages/json/errors.txt +++ b/spec/runtime/languages/json/errors.txt @@ -3,7 +3,7 @@ recovers from top-level errors ========================================== [} --- -(ERROR '}') +(ERROR ) (ERROR '}') ========================================== recovers from unexpected tokens diff --git a/spec/runtime/languages/language_specs.cc b/spec/runtime/languages/language_specs.cc index 1c147693..f2386283 100644 --- a/spec/runtime/languages/language_specs.cc +++ b/spec/runtime/languages/language_specs.cc @@ -29,7 +29,7 @@ describe("Languages", [&]() { it(entry.description.c_str(), [&]() { ts_document_set_input_string(doc, entry.input.c_str()); auto doc_string = ts_node_string(ts_document_root_node(doc)); - AssertThat(doc_string, Equals(entry.tree_string.c_str())); + AssertThat(doc_string, Equals(("(DOCUMENT " + entry.tree_string + ")").c_str())); free((void *)doc_string); }); } diff --git a/spec/runtime/node_spec.cc b/spec/runtime/node_spec.cc index 6cc2ce8e..99888006 100644 --- a/spec/runtime/node_spec.cc +++ b/spec/runtime/node_spec.cc @@ -14,7 +14,7 @@ describe("Node", []() { ts_document_set_input_string(document, " [12, 5, 345]"); root = ts_document_root_node(document); - AssertThat(ts_node_string(root), Equals("(array (number) (number) (number))")); + AssertThat(ts_node_string(root), Equals("(DOCUMENT (array (number) (number) (number)))")); }); after_each([&]() { @@ -23,23 +23,28 @@ describe("Node", []() { describe("child_count", [&]() { it("returns the number of visible child nodes", [&]() { - AssertThat(ts_node_child_count(root), Equals(3)); + TSNode *array = ts_node_child(root, 0); + + AssertThat(ts_node_child_count(array), Equals(3)); + + ts_node_release(array); }); }); describe("child(i)", [&]() { 
it("returns the child node at the given index", [&]() { - TSNode *number1 = ts_node_child(root, 0); - TSNode *number2 = ts_node_child(root, 1); - TSNode *number3 = ts_node_child(root, 2); + TSNode *array = ts_node_child(root, 0); + TSNode *number1 = ts_node_child(array, 0); + TSNode *number2 = ts_node_child(array, 1); + TSNode *number3 = ts_node_child(array, 2); - AssertThat(ts_node_name(root), Equals("array")); + AssertThat(ts_node_name(array), Equals("array")); AssertThat(ts_node_name(number1), Equals("number")); AssertThat(ts_node_name(number2), Equals("number")); AssertThat(ts_node_name(number3), Equals("number")); - AssertThat(ts_node_pos(root), Equals(2)); - AssertThat(ts_node_size(root), Equals(12)); + AssertThat(ts_node_pos(array), Equals(2)); + AssertThat(ts_node_size(array), Equals(12)); AssertThat(ts_node_pos(number1), Equals(3)); AssertThat(ts_node_size(number1), Equals(2)); @@ -50,6 +55,7 @@ describe("Node", []() { AssertThat(ts_node_pos(number3), Equals(10)); AssertThat(ts_node_size(number3), Equals(3)); + ts_node_release(array); ts_node_release(number1); ts_node_release(number2); ts_node_release(number3); @@ -58,21 +64,28 @@ describe("Node", []() { describe("parent", [&]() { it("returns the node's parent node", [&]() { - TSNode *number = ts_node_child(root, 1); - AssertThat(ts_node_parent(number), Equals(root)); + TSNode *array = ts_node_child(root, 0); + TSNode *number = ts_node_child(array, 1); + + AssertThat(ts_node_parent(number), Equals(array)); + AssertThat(ts_node_parent(array), Equals(root)); + + ts_node_release(array); ts_node_release(number); }); }); describe("next_sibling and prev_sibling", [&]() { it("returns the node's next and previous siblings", [&]() { - TSNode *number1 = ts_node_child(root, 0); - TSNode *number2 = ts_node_child(root, 1); - TSNode *number3 = ts_node_child(root, 2); + TSNode *array = ts_node_child(root, 0); + TSNode *number1 = ts_node_child(array, 0); + TSNode *number2 = ts_node_child(array, 1); + TSNode *number3 = 
ts_node_child(array, 2); AssertThat(ts_node_eq(ts_node_next_sibling(number2), number3), IsTrue()); AssertThat(ts_node_eq(ts_node_prev_sibling(number2), number1), IsTrue()); + ts_node_release(array); ts_node_release(number1); ts_node_release(number2); ts_node_release(number3); diff --git a/src/compiler/build_tables/build_parse_table.cc b/src/compiler/build_tables/build_parse_table.cc index d9118b5a..e9be8f95 100644 --- a/src/compiler/build_tables/build_parse_table.cc +++ b/src/compiler/build_tables/build_parse_table.cc @@ -143,6 +143,7 @@ class ParseTableBuilder { item_set_closure(start_item, { rules::END_OF_INPUT() }, grammar)); parse_table.symbols.insert(rules::ERROR()); + parse_table.symbols.insert(rules::DOCUMENT()); while (!item_sets_to_process.empty()) { auto pair = item_sets_to_process.back(); diff --git a/src/compiler/generate_code/c_code.cc b/src/compiler/generate_code/c_code.cc index d3538991..9573baa8 100644 --- a/src/compiler/generate_code/c_code.cc +++ b/src/compiler/generate_code/c_code.cc @@ -77,7 +77,7 @@ class CCodeGenerator { for (auto symbol : parse_table.symbols) if (!symbol.is_built_in()) { if (at_start) - line(symbol_id(symbol) + " = ts_start_sym,"); + line(symbol_id(symbol) + " = ts_builtin_sym_start,"); else line(symbol_id(symbol) + ","); at_start = false; @@ -88,10 +88,6 @@ class CCodeGenerator { } void symbol_names_list() { - set symbols(parse_table.symbols); - symbols.insert(rules::END_OF_INPUT()); - symbols.insert(rules::ERROR()); - line("SYMBOL_NAMES = {"); indent([&]() { for (auto symbol : parse_table.symbols) @@ -174,8 +170,12 @@ class CCodeGenerator { string symbol_id(const rules::Symbol &symbol) { if (symbol.is_built_in()) { - return (symbol == rules::ERROR()) ? 
"ts_builtin_sym_error" - : "ts_builtin_sym_end"; + if (symbol == rules::ERROR()) + return "ts_builtin_sym_error"; + else if (symbol == rules::END_OF_INPUT()) + return "ts_builtin_sym_end"; + else + return "ts_builtin_sym_document"; } else { string name = sanitize_name(rule_name(symbol)); if (symbol.is_auxiliary()) @@ -221,7 +221,12 @@ class CCodeGenerator { string symbol_name(const rules::Symbol &symbol) { if (symbol.is_built_in()) { - return (symbol == rules::ERROR()) ? "error" : "end"; + if (symbol == rules::ERROR()) + return "error"; + else if (symbol == rules::END_OF_INPUT()) + return "end"; + else + return "DOCUMENT"; } else if (symbol.is_token() && symbol.is_auxiliary()) { return rule_name(symbol); } else { diff --git a/src/compiler/rules/built_in_symbols.cc b/src/compiler/rules/built_in_symbols.cc index a6befc8e..7a648a3d 100644 --- a/src/compiler/rules/built_in_symbols.cc +++ b/src/compiler/rules/built_in_symbols.cc @@ -6,6 +6,7 @@ namespace rules { Symbol END_OF_INPUT() { return Symbol(-1, SymbolOptionToken); } Symbol ERROR() { return Symbol(-2, SymbolOptionToken); } Symbol START() { return Symbol(-3); } +Symbol DOCUMENT() { return Symbol(-4); } } // namespace rules } // namespace tree_sitter diff --git a/src/compiler/rules/built_in_symbols.h b/src/compiler/rules/built_in_symbols.h index 10f1f2da..63ad3df4 100644 --- a/src/compiler/rules/built_in_symbols.h +++ b/src/compiler/rules/built_in_symbols.h @@ -9,6 +9,7 @@ namespace rules { Symbol ERROR(); Symbol START(); Symbol END_OF_INPUT(); +Symbol DOCUMENT(); } // namespace rules } // namespace tree_sitter diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 3b0da1cf..3c732d49 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -177,7 +177,15 @@ int ts_parser_handle_error(TSParser *parser) { } TSTree *ts_parser_tree_root(TSParser *parser) { - return ts_stack_top_node(&parser->stack); + TSStack *stack = &parser->stack; + if (stack->size == 0) + return NULL; + + TSTree *tree = 
ts_stack_reduce(stack, ts_builtin_sym_document, + stack->size, parser->language->hidden_symbol_flags); + tree->options = 0; + ts_stack_push(stack, 0, tree); + return tree; } TSParseAction ts_parser_next_action(TSParser *parser) { diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 374b7243..43a20a9d 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -58,6 +58,8 @@ TSTree *ts_stack_reduce(TSStack *stack, TSSymbol symbol, size_t child_count, // The child node count is known ahead of time, but some children may be // ubiquitous tokens, which don't count. for (size_t i = 0; i < child_count; i++) { + if (child_count == stack->size) + break; TSTree *child = stack->entries[stack->size - 1 - i].node; if (ts_tree_is_extra(child)) child_count++;