tree-sitter/spec/runtime/parser_spec.cc
Max Brunsfeld 1e353381ff Don't create error node in lexer unless token is completely invalid
Before, any syntax error would cause the lexer to create an error
leaf node. This could happen even with a valid input, if the parse
stack had split and one particular version of the parse stack
failed to parse.

Now, an error leaf node is only created when the lexer cannot understand
part of the input stream at all. When a normal syntax error occurs,
the lexer just returns a token that is outside of the expected token
set, and the parser handles the unexpected token.
2016-05-26 14:15:10 -07:00

523 lines
17 KiB
C++

#include "spec_helper.h"
#include "runtime/alloc.h"
#include "helpers/record_alloc.h"
#include "helpers/spy_input.h"
#include "helpers/load_language.h"
#include "helpers/log_debugger.h"
#include "helpers/record_alloc.h"
START_TEST
describe("Parser", [&]() {
// Shared fixture state for every spec in this suite. All of it is captured by
// reference by the hooks and helper lambdas below, so each variable is given a
// well-defined value at declaration instead of being left indeterminate until
// before_each runs.
TSDocument *doc = nullptr;   // document under test; created in before_each
SpyInput *input = nullptr;   // input wrapper created by set_text; deleted in after_each
TSNode root = {};            // root node captured after the most recent parse
size_t chunk_size = 3;       // second SpyInput constructor argument; before_each resets it to 3
// Runs before every spec: begin allocation recording, reset the fixture
// variables, then create a fresh document. Recording is started before the
// document is created.
before_each([&]() {
  record_alloc::start();

  input = nullptr;
  chunk_size = 3;

  doc = ts_document_make();
});
// Runs after every spec: release the document and the spy input, stop
// allocation recording, and verify that no recorded allocation is still
// outstanding.
after_each([&]() {
  if (doc != nullptr) {
    ts_document_free(doc);
  }

  // Deleting a null pointer is a no-op, so no explicit guard is needed.
  delete input;

  record_alloc::stop();
  AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
});
// Helper: wraps `text` in a new SpyInput (using the current chunk_size), makes
// it the document's input, parses, and stores the resulting root node in `root`.
// Asserts that the parse call returns 0 and that the root ends at the last byte
// of `text`. Finally calls input->clear() so later assertions on the spy only
// see what happens after this initial parse.
auto set_text = [&](const char *text) {
  input = new SpyInput(text, chunk_size);
  ts_document_set_input(doc, input->input());

  const auto parse_status = ts_document_parse(doc);
  AssertThat(parse_status, Equals(0));

  root = ts_document_root_node(doc);
  const size_t expected_end = strlen(text);
  AssertThat(ts_node_end_byte(root), Equals(expected_end));

  input->clear();
};
// Helper: inserts `text` at `position` in the spy input, reports the edit to
// the document, re-parses, and refreshes `root`.
//
// Asserts that the re-parse returns 0 (the same check set_text performs) before
// touching the new root, and that the root's end byte grew by exactly
// text.size().
auto insert_text = [&](size_t position, string text) {
  size_t prev_size = ts_node_end_byte(root);
  ts_document_edit(doc, input->replace(position, 0, text));

  // Fail here with a clear message rather than inspecting the root of a
  // failed parse.
  AssertThat(ts_document_parse(doc), Equals(0));

  root = ts_document_root_node(doc);
  size_t new_size = ts_node_end_byte(root);
  AssertThat(new_size, Equals(prev_size + text.size()));
};
// Helper: removes `length` units starting at `position` from the spy input,
// reports the edit to the document, re-parses, and refreshes `root`.
//
// Asserts that the re-parse returns 0 (the same check set_text performs) before
// touching the new root, and that the root's end byte shrank by exactly
// `length`.
auto delete_text = [&](size_t position, size_t length) {
  size_t prev_size = ts_node_end_byte(root);
  ts_document_edit(doc, input->replace(position, length, ""));

  // Fail here with a clear message rather than inspecting the root of a
  // failed parse.
  AssertThat(ts_document_parse(doc), Equals(0));

  root = ts_document_root_node(doc);
  size_t new_size = ts_node_end_byte(root);
  AssertThat(new_size, Equals(prev_size - length));
};
// Helper: replaces `length` units starting at `position` with `new_text` in the
// spy input, reports the edit to the document, re-parses, and refreshes `root`.
//
// Asserts that the re-parse returns 0 (the same check set_text performs) before
// touching the new root, and that the root's end byte changed by
// new_text.size() - length.
auto replace_text = [&](size_t position, size_t length, string new_text) {
  size_t prev_size = ts_node_end_byte(root);
  ts_document_edit(doc, input->replace(position, length, new_text));

  // Fail here with a clear message rather than inspecting the root of a
  // failed parse.
  AssertThat(ts_document_parse(doc), Equals(0));

  root = ts_document_root_node(doc);
  size_t new_size = ts_node_end_byte(root);
  AssertThat(new_size, Equals(prev_size - length + new_text.size()));
};
// Helper: renders the document's current root node with ts_node_string and
// asserts that it equals `expected`. The rendered buffer is copied into a
// std::string and released with ts_free before the assertion runs.
auto assert_root_node = [&](const string &expected) {
  char *rendered = ts_node_string(ts_document_root_node(doc), doc);
  const string actual = rendered;
  ts_free(rendered);

  AssertThat(actual, Equals(expected));
};
describe("handling errors", [&]() {
// The error-handling specs all parse with the "json" test language.
before_each([&]() {
  const TSLanguage *json = get_test_language("json");
  ts_document_set_language(doc, json);
});
// Helper: returns the slice of the spy input's content covered by `node`,
// i.e. the range [start byte, end byte) of the node.
auto get_node_text = [&](TSNode node) {
  const size_t first_byte = ts_node_start_byte(node);
  const size_t byte_length = ts_node_end_byte(node) - first_byte;
  return input->content.substr(first_byte, byte_length);
};
// An unlexable run ("@@@@@") directly before a valid token: the ERROR node is
// expected to cover the garbage plus the following comma, and the node after
// it must still be positioned correctly.
describe("when there is an invalid substring right before a valid token", [&]() {
  it("computes the error node's size and position correctly", [&]() {
    set_text(" [123, @@@@@, true]");

    assert_root_node(
      "(array (number) (ERROR (UNEXPECTED '@')) (true))");

    // The error node as a whole.
    TSNode error = ts_node_named_child(root, 1);
    AssertThat(ts_node_name(error, doc), Equals("ERROR"));
    AssertThat(get_node_text(error), Equals("@@@@@,"));
    AssertThat(ts_node_child_count(error), Equals<size_t>(2));

    // Its two children: the invalid characters, then the comma.
    TSNode garbage = ts_node_child(error, 0);
    AssertThat(get_node_text(garbage), Equals("@@@@@"));

    TSNode comma = ts_node_child(error, 1);
    AssertThat(get_node_text(comma), Equals(","));

    // The valid token that follows the error.
    TSNode node_after_error = ts_node_named_child(root, 2);
    AssertThat(ts_node_name(node_after_error, doc), Equals("true"));
    AssertThat(get_node_text(node_after_error), Equals("true"));
  });
});
// A token that starts out valid ("f...") but contains unexpected characters:
// checks the ERROR node, its two children, and the position of the node after it.
describe("when there is an unexpected string in the middle of a token", [&]() {
  it("computes the error node's size and position correctly", [&]() {
    set_text(" [123, faaaaalse, true]");

    const string expected_tree =
      "(array (number) (ERROR (UNEXPECTED 'a')) (true))";
    assert_root_node(expected_tree);

    // The error node as a whole.
    TSNode error_node = ts_node_named_child(root, 1);
    AssertThat(ts_node_symbol(error_node), Equals(ts_builtin_sym_error));
    AssertThat(ts_node_name(error_node, doc), Equals("ERROR"));
    AssertThat(get_node_text(error_node), Equals("faaaaalse,"));
    AssertThat(ts_node_child_count(error_node), Equals<size_t>(2));

    // First child: the malformed word, itself named ERROR.
    TSNode invalid_word = ts_node_child(error_node, 0);
    AssertThat(ts_node_name(invalid_word, doc), Equals("ERROR"));
    AssertThat(get_node_text(invalid_word), Equals("faaaaalse"));

    // Second child: the comma.
    TSNode separator = ts_node_child(error_node, 1);
    AssertThat(ts_node_name(separator, doc), Equals(","));
    AssertThat(get_node_text(separator), Equals(","));

    // The valid token after the error starts right after the prefix below.
    TSNode trailing = ts_node_named_child(root, 2);
    AssertThat(ts_node_name(trailing, doc), Equals("true"));
    AssertThat(ts_node_start_byte(trailing), Equals(strlen(" [123, faaaaalse, ")));
  });
});
// A well-formed but misplaced token ("true" before "false"): the misplaced
// token is expected to be wrapped in an ERROR node with a single child.
describe("when there is one unexpected token between two valid tokens", [&]() {
  it("computes the error node's size and position correctly", [&]() {
    set_text(" [123, true false, true]");

    const string expected_tree =
      "(array (number) (ERROR (true)) (false) (true))";
    assert_root_node(expected_tree);

    // The error node covers exactly the misplaced token.
    TSNode error_node = ts_node_named_child(root, 1);
    AssertThat(ts_node_name(error_node, doc), Equals("ERROR"));
    AssertThat(get_node_text(error_node), Equals("true"));
    AssertThat(ts_node_child_count(error_node), Equals<size_t>(1));

    // The next named child is the token that was actually accepted.
    TSNode following = ts_node_named_child(root, 2);
    AssertThat(ts_node_name(following, doc), Equals("false"));
    AssertThat(get_node_text(following), Equals("false"));
  });
});
});
describe("handling extra tokens", [&]() {
  // In the javascript example grammar, ASI works by using newlines as
  // terminators in statements, but also as extra tokens.
  before_each([&]() {
    const TSLanguage *javascript = get_test_language("javascript");
    ts_document_set_language(doc, javascript);
  });

  // Newline acting as the statement terminator.
  describe("when the token appears as part of a grammar rule", [&]() {
    it("is incorporated into the tree", [&]() {
      set_text("fn()\n");

      const string expected_tree =
        "(program (expression_statement (function_call (identifier))))";
      assert_root_node(expected_tree);
    });
  });

  // Newline in the middle of a member-access chain.
  describe("when the token appears somewhere else", [&]() {
    it("is incorporated into the tree", [&]() {
      const char *source =
        "fn()\n"
        " .otherFn();";
      set_text(source);

      const string expected_tree =
        "(program (expression_statement (function_call "
          "(member_access (function_call (identifier)) (identifier)))))";
      assert_root_node(expected_tree);
    });
  });

  // Newlines and a comment back to back inside one expression.
  describe("when several extra tokens appear in a row", [&]() {
    it("is incorporated into the tree", [&]() {
      const char *source =
        "fn()\n\n"
        "// This is a comment"
        "\n\n"
        ".otherFn();";
      set_text(source);

      const string expected_tree =
        "(program (expression_statement (function_call "
          "(member_access (function_call (identifier)) "
            "(comment) "
            "(identifier)))))";
      assert_root_node(expected_tree);
    });
  });
});
describe("editing", [&]() {
// The editing specs all parse with the "javascript" test language.
before_each([&]() {
  const TSLanguage *javascript = get_test_language("javascript");
  ts_document_set_language(doc, javascript);
});
describe("inserting text", [&]() {
// Inserting ".d" after the last identifier: the tree must gain a
// member_access node, and the spy input must show that only the tail of the
// text was read again.
describe("creating new tokens near the end of the input", [&]() {
  it("updates the parse tree and re-reads only the changed portion of the text", [&]() {
    set_text("x * (100 + abc);");

    assert_root_node(
      "(program (expression_statement (math_op "
        "(identifier) "
        "(math_op (number) (identifier)))))");

    // The offset is the length of the text preceding the insertion point;
    // the literal mirrors the text passed to set_text above.
    insert_text(strlen("x * (100 + abc"), ".d");

    assert_root_node(
      "(program (expression_statement (math_op "
        "(identifier) "
        "(math_op (number) (member_access (identifier) (identifier))))))");

    AssertThat(input->strings_read, Equals(vector<string>({ " abc.d);", "" })));
  });
});
// Inserting " || 5" after the first number: the expression becomes a bool_op
// whose right side is the previous math_op tree, and the spy input must show
// that only the start of the text was read again.
describe("creating new tokens near the beginning of the input", [&]() {
  it("updates the parse tree and re-reads only the changed portion of the input", [&]() {
    // Must be set before set_text, which passes it to the SpyInput it creates.
    chunk_size = 2;

    set_text("123 + 456 * (10 + x);");

    const string tree_before =
      "(program (expression_statement (math_op "
        "(number) "
        "(math_op (number) (math_op (number) (identifier))))))";
    assert_root_node(tree_before);

    const size_t insertion_point = strlen("123");
    insert_text(insertion_point, " || 5");

    const string tree_after =
      "(program (expression_statement (bool_op "
        "(number) "
        "(math_op "
          "(number) "
          "(math_op (number) (math_op (number) (identifier)))))))";
    assert_root_node(tree_after);

    const vector<string> expected_reads({ "123 || 5 +", "" });
    AssertThat(input->strings_read, Equals(expected_reads));
  });
});
// Two successive insertions: the first leaves a dangling operator (producing
// an ERROR node), the second completes the expression again.
describe("introducing an error", [&]() {
  it("gives the error the right size", [&]() {
    ts_document_set_language(doc, get_test_language("javascript"));

    set_text("var x = y;");

    const string valid_tree =
      "(program (var_declaration (var_assignment "
        "(identifier) (identifier))))";
    assert_root_node(valid_tree);

    // "var x = y *;" -- operator without a right-hand operand.
    insert_text(strlen("var x = y"), " *");

    const string broken_tree =
      "(program (var_declaration (identifier) (ERROR (identifier))))";
    assert_root_node(broken_tree);

    // "var x = y * z;" -- valid again.
    insert_text(strlen("var x = y *"), " z");

    const string repaired_tree =
      "(program (var_declaration (var_assignment "
        "(identifier) (math_op (identifier) (identifier)))))";
    assert_root_node(repaired_tree);
  });
});
// Inserting characters inside an identifier: the tree shape stays the same,
// but the identifier node must grow to cover the new characters.
describe("into the middle of an existing token", [&]() {
  it("updates the parse tree", [&]() {
    const string unchanged_tree =
      "(program (expression_statement (math_op (identifier) (number))))";

    set_text("abc * 123;");
    assert_root_node(unchanged_tree);

    insert_text(strlen("ab"), "XYZ");
    assert_root_node(unchanged_tree);

    // Look up the named node covering position 1, which lies inside the identifier.
    TSNode identifier = ts_node_named_descendant_for_range(root, 1, 1);
    AssertThat(ts_node_name(identifier, doc), Equals("identifier"));
    AssertThat(ts_node_end_byte(identifier), Equals(strlen("abXYZc")));
  });
});
// Appending characters directly after an identifier: the tree shape stays the
// same, but the identifier node must extend over the appended characters.
describe("at the end of an existing token", [&]() {
  it("updates the parse tree", [&]() {
    const string unchanged_tree =
      "(program (expression_statement (math_op (identifier) (number))))";

    set_text("abc * 123;");
    assert_root_node(unchanged_tree);

    insert_text(strlen("abc"), "XYZ");
    assert_root_node(unchanged_tree);

    // Look up the named node covering position 1, which lies inside the identifier.
    TSNode identifier = ts_node_named_descendant_for_range(root, 1, 1);
    AssertThat(ts_node_name(identifier, doc), Equals("identifier"));
    AssertThat(ts_node_end_byte(identifier), Equals(strlen("abcXYZ")));
  });
});
// Insertion into text that contains multi-byte UTF8 characters.
describe("with non-ascii characters", [&]() {
  it("inserts the text according to the UTF8 character index", [&]() {
    const string two_strings_tree =
      "(program (expression_statement (math_op (string) (string))))";

    // 'αβδ' + '1'
    set_text("'\u03b1\u03b2\u03b4' + '1';");
    assert_root_node(two_strings_tree);

    // 'αβδ' + 'ψ1'
    // The offset literal substitutes one ASCII letter per Greek letter, so
    // its strlen is a count of characters rather than of UTF8 bytes.
    const size_t character_index = strlen("'abd' + '");
    insert_text(character_index, "\u03c8");
    assert_root_node(two_strings_tree);
  });
});
// Insertion at the end of an expression that has a comment in its middle: the
// comment must stay in the tree after the edit.
describe("into a node containing a extra token", [&]() {
  it("updates the parse tree", [&]() {
    const string tree_with_comment =
      "(program (expression_statement (math_op "
        "(number) "
        "(comment) "
        "(identifier))))";

    set_text("123 *\n"
      "// a-comment\n"
      "abc;");
    assert_root_node(tree_with_comment);

    // Append "XYZ" to the trailing identifier "abc".
    const size_t end_of_identifier =
      strlen("123 *\n"
        "// a-comment\n"
        "abc");
    insert_text(end_of_identifier, "XYZ");
    assert_root_node(tree_with_comment);
  });
});
});
describe("deleting text", [&]() {
  // Removing the operator of the first statement: only that statement should
  // turn into an error; the second statement keeps its math_op.
  describe("when a critical token is removed", [&]() {
    it("updates the parse tree, creating an error", [&]() {
      set_text("123 * 456; 789 * 123;");

      const string tree_before =
        "(program "
          "(expression_statement (math_op (number) (number))) "
          "(expression_statement (math_op (number) (number))))";
      assert_root_node(tree_before);

      // Delete two units starting right after "123 ".
      const size_t deletion_start = strlen("123 ");
      delete_text(deletion_start, 2);

      const string tree_after =
        "(program "
          "(expression_statement (number) (ERROR (number))) "
          "(expression_statement (math_op (number) (number))))";
      assert_root_node(tree_after);
    });
  });
});
// Replacing a parenthesized expression with its bare form: the rendered tree
// is the same before and after, so the assertion passes only if the edited
// region is parsed again correctly.
describe("replacing text", [&]() {
  it("does not try to re-use nodes that are within the edited region", [&]() {
    ts_document_set_language(doc, get_test_language("javascript"));

    const string object_tree =
      "(program (expression_statement (object (pair "
        "(identifier) (member_access (identifier) (identifier))))))";

    set_text("{ x: (b.c) };");
    assert_root_node(object_tree);

    // "(b.c)" -> "b.c"
    const size_t replaced_start = strlen("{ x: ");
    const size_t replaced_length = strlen("(b.c)");
    replace_text(replaced_start, replaced_length, "b.c");
    assert_root_node(object_tree);
  });
});
// The parse count starts at zero and goes up by one for the initial parse and
// by one more for the re-parse after an edit.
it("updates the document's parse count", [&]() {
  ts_document_set_language(doc, get_test_language("javascript"));

  // Nothing parsed yet.
  AssertThat(ts_document_parse_count(doc), Equals<size_t>(0));

  // Initial parse.
  set_text("{ x: (b.c) };");
  AssertThat(ts_document_parse_count(doc), Equals<size_t>(1));

  // Re-parse triggered by the edit helper.
  insert_text(strlen("{ x"), "yz");
  AssertThat(ts_document_parse_count(doc), Equals<size_t>(2));
});
});
describe("lexing", [&]() {
  before_each([&]() {
    const TSLanguage *javascript = get_test_language("javascript");
    ts_document_set_language(doc, javascript);
  });

  // A line comment with no trailing newline must end where the input ends.
  describe("handling tokens containing wildcard patterns (e.g. comments)", [&]() {
    it("terminates them at the end of the document", [&]() {
      set_text("x; // this is a comment");

      const string expected_tree =
        "(program (expression_statement (identifier)) (comment))";
      assert_root_node(expected_tree);

      TSNode comment_node = ts_node_named_child(root, 1);
      AssertThat(ts_node_start_byte(comment_node), Equals(strlen("x; ")));
      AssertThat(ts_node_end_byte(comment_node), Equals(strlen("x; // this is a comment")));
    });
  });

  it("recognizes UTF8 characters as single characters", [&]() {
    // 'ΩΩΩ — ΔΔ';
    set_text("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';");

    const string expected_tree =
      "(program (expression_statement (string)))";
    assert_root_node(expected_tree);

    // Character position: the ASCII literal has one byte per character of the
    // real text, so its strlen is the expected character count.
    AssertThat(ts_node_end_char(root), Equals(strlen("'OOO - DD';")));

    // Byte position: strlen of the real text, multi-byte characters included.
    AssertThat(ts_node_end_byte(root), Equals(strlen("'\u03A9\u03A9\u03A9 \u2014 \u0394\u0394';")));
  });
});
describe("handling allocation failures", [&]() {
// Counts the allocations made while creating a document, then forces each of
// them to fail in turn. Every forced failure must make ts_document_make()
// return nullptr and leave no recorded allocation outstanding.
it("handles failures when allocating documents", [&]() {
  // Dry run to learn how many allocations document creation performs.
  record_alloc::start();
  TSDocument *probe = ts_document_make();
  ts_document_free(probe);
  AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());

  const size_t allocation_total = record_alloc::allocation_count();
  AssertThat(allocation_total, IsGreaterThan<size_t>(1));

  for (size_t failing_index = 0; failing_index < allocation_total; ++failing_index) {
    // Restart recording for each attempt and fail the chosen allocation.
    record_alloc::start();
    record_alloc::fail_at_allocation_index(failing_index);

    AssertThat(ts_document_make(), Equals<TSDocument *>(nullptr));
    AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
  }

  record_alloc::stop();
});
// Counts the allocations made by a successful parse, then re-runs the parse
// once per allocation with that allocation forced to fail. Each failing run
// must return -1, leave the document without a root node, and leave no
// recorded allocation outstanding. A last run with the failure index placed
// beyond the counted allocations must succeed again.
it("handles allocation failures during parsing", [&]() {
  const TSLanguage *cpp_language = get_test_language("cpp");
  const char *source = "int main() { return vector<int *>().size(); }";
  string expected_tree =
    "(translation_unit (function_definition "
      "(identifier) "
      "(function_declarator (identifier)) "
      "(compound_statement "
        "(return_statement (call_expression (field_expression "
          "(call_expression (template_call "
            "(identifier) "
            "(type_name (identifier) (abstract_pointer_declarator)))) "
          "(identifier)))))))";

  // Baseline: a successful parse on the fixture document, recorded from here.
  record_alloc::start();
  ts_document_set_language(doc, cpp_language);
  ts_document_set_input_string(doc, source);
  AssertThat(ts_document_parse(doc), Equals(0));

  const size_t allocation_total = record_alloc::allocation_count();
  AssertThat(allocation_total, IsGreaterThan<size_t>(1));

  assert_root_node(expected_tree);
  ts_document_free(doc);
  doc = nullptr;
  AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());

  for (size_t failing_index = 0; failing_index < allocation_total; ++failing_index) {
    // The replacement document is created while recording is stopped; only
    // the calls after start() run with the failure armed.
    record_alloc::stop();
    doc = ts_document_make();

    record_alloc::start();
    record_alloc::fail_at_allocation_index(failing_index);
    ts_document_set_language(doc, cpp_language);
    ts_document_set_input_string(doc, source);
    AssertThat(ts_document_parse(doc), Equals(-1));
    AssertThat(ts_document_root_node(doc).data, Equals<void *>(nullptr));

    ts_document_free(doc);
    doc = nullptr;
    AssertThat(record_alloc::outstanding_allocation_indices(), IsEmpty());
  }

  // Final run: failure index past the counted allocations; the parse must succeed.
  // The document created here is left in `doc` for after_each to free.
  record_alloc::stop();
  doc = ts_document_make();

  record_alloc::start();
  record_alloc::fail_at_allocation_index(allocation_total + 1);
  ts_document_set_language(doc, cpp_language);
  ts_document_set_input_string(doc, source);
  AssertThat(ts_document_parse(doc), Equals(0));
  assert_root_node(expected_tree);
});
});
});
END_TEST