diff --git a/spec/runtime/document_spec.cc b/spec/runtime/document_spec.cc
index bffc8e3b..13c007b9 100644
--- a/spec/runtime/document_spec.cc
+++ b/spec/runtime/document_spec.cc
@@ -2,6 +2,7 @@
 #include "runtime/helpers/spy_reader.h"
 
 extern "C" const TSLanguage * ts_language_json();
+extern "C" const TSLanguage * ts_language_javascript();
 
 START_TEST
 
@@ -116,13 +117,13 @@ describe("Document", [&]() {
   describe("parsing", [&]() {
     TSNode *root;
 
-    describe("error handling", [&]() {
+    describe("handling errors", [&]() {
       before_each([&]() {
         ts_document_set_language(doc, ts_language_json());
       });
 
       describe("when the error occurs at the beginning of a token", [&]() {
-        it("computes the error node's size and position correctly 1", [&]() {
+        it("computes the error node's size and position correctly", [&]() {
           ts_document_set_input_string(doc, " [123, @@@@@, true]");
           AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
             "(DOCUMENT (array (number) (ERROR '@') (true)))"));
@@ -130,37 +131,47 @@ describe("Document", [&]() {
           root = ts_document_root_node(doc);
           TSNode *array = ts_node_child(root, 0);
           TSNode *error = ts_node_child(array, 1);
+          TSNode *last = ts_node_child(array, 2);
 
           AssertThat(ts_node_name(error), Equals("error"));
           AssertThat(ts_node_pos(error), Equals(string(" [123,").length()))
           AssertThat(ts_node_size(error), Equals(string(" @@@@@").length()))
 
+          AssertThat(ts_node_name(last), Equals("true"));
+          AssertThat(ts_node_pos(last), Equals(string(" [123, @@@@@, ").length()))
+
+          ts_node_release(last);
           ts_node_release(error);
           ts_node_release(array);
         });
       });
 
       describe("when the error occurs in the middle of a token", [&]() {
-        it("computes the error node's size and position correctly 2", [&]() {
-          ts_document_set_input_string(doc, " [123, total nonsense, true]");
+        it("computes the error node's size and position correctly", [&]() {
+          ts_document_set_input_string(doc, " [123, faaaaalse, true]");
           AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
-            "(DOCUMENT (array (number) (ERROR 'o') (true)))"));
+            "(DOCUMENT (array (number) (ERROR 'a') (true)))"));
 
           root = ts_document_root_node(doc);
           TSNode *array = ts_node_child(root, 0);
           TSNode *error = ts_node_child(array, 1);
+          TSNode *last = ts_node_child(array, 2);
 
           AssertThat(ts_node_name(error), Equals("error"));
           AssertThat(ts_node_pos(error), Equals(string(" [123,").length()))
-          AssertThat(ts_node_size(error), Equals(string(" total nonsense").length()))
+          AssertThat(ts_node_size(error), Equals(string(" faaaaalse").length()))
 
+          AssertThat(ts_node_name(last), Equals("true"));
+          AssertThat(ts_node_pos(last), Equals(string(" [123, faaaaalse, ").length()))
+
+          ts_node_release(last);
           ts_node_release(error);
           ts_node_release(array);
         });
       });
 
       describe("when the error occurs after one or more tokens", [&]() {
-        it("computes the error node's size and position correctly 3", [&]() {
+        it("computes the error node's size and position correctly", [&]() {
           ts_document_set_input_string(doc, " [123, true false, true]");
           AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
             "(DOCUMENT (array (number) (ERROR 'f') (true)))"));
@@ -168,15 +179,89 @@ describe("Document", [&]() {
           root = ts_document_root_node(doc);
           TSNode *array = ts_node_child(root, 0);
           TSNode *error = ts_node_child(array, 1);
+          TSNode *last = ts_node_child(array, 2);
 
           AssertThat(ts_node_name(error), Equals("error"));
           AssertThat(ts_node_pos(error), Equals(string(" [123,").length()))
           AssertThat(ts_node_size(error), Equals(string(" true false").length()))
 
+          AssertThat(ts_node_name(last), Equals("true"));
+          AssertThat(ts_node_pos(last), Equals(string(" [123, true false, ").length()))
+
+          ts_node_release(last);
           ts_node_release(error);
           ts_node_release(array);
         });
       });
+
+      describe("when the error is an empty string", [&]() {
+        it("computes the error node's size and position correctly", [&]() {
+          ts_document_set_input_string(doc, " [123, , true]");
+          AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
+            "(DOCUMENT (array (number) (ERROR ',') (true)))"));
+
+          root = ts_document_root_node(doc);
+          TSNode *array = ts_node_child(root, 0);
+          TSNode *error = ts_node_child(array, 1);
+          TSNode *last = ts_node_child(array, 2);
+
+          AssertThat(ts_node_name(error), Equals("error"));
+          AssertThat(ts_node_pos(error), Equals(string(" [123,").length()))
+          AssertThat(ts_node_size(error), Equals(string(" ").length()))
+
+          AssertThat(ts_node_name(last), Equals("true"));
+          AssertThat(ts_node_pos(last), Equals(string(" [123, , ").length()))
+
+          ts_node_release(last);
+          ts_node_release(error);
+          ts_node_release(array);
+        });
+      });
+    });
+
+    describe("handling ubiquitous tokens", [&]() {
+
+      // In the javascript example grammar, ASI works by using newlines as
+      // terminators in statements, but also as ubiquitous tokens.
+      before_each([&]() {
+        ts_document_set_language(doc, ts_language_javascript());
+      });
+
+      describe("when the token appears as part of a grammar rule", [&]() {
+        it("is incorporated into the tree", [&]() {
+          ts_document_set_input_string(doc, "fn()\n");
+          AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
+            "(DOCUMENT (program (expression_statement (function_call (identifier)))))"));
+        });
+      });
+
+      describe("when the token appears somewhere else", [&]() {
+        it("is incorporated into the tree", [&]() {
+          ts_document_set_input_string(doc,
+            "fn()\n"
+            " .otherFn();");
+          AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
+            "(DOCUMENT (program "
+              "(expression_statement (function_call "
+                "(property_access (function_call (identifier)) (identifier))))))"));
+        });
+
+        describe("when several ubiquitous tokens appear in a row", [&]() {
+          it("is incorporated into the tree", [&]() {
+            ts_document_set_input_string(doc,
+              "fn()\n\n"
+              "// This is a comment"
+              "\n\n"
+              ".otherFn();");
+            AssertThat(ts_node_string(ts_document_root_node(doc)), Equals(
+              "(DOCUMENT (program "
+                "(expression_statement (function_call "
+                  "(property_access (function_call (identifier)) "
+                    "(comment) "
+                    "(identifier))))))"));
+          });
+        });
+      });
     });
   });
 });
diff --git a/src/runtime/parser.c b/src/runtime/parser.c
index ee555fab..54beae41 100644
--- a/src/runtime/parser.c
+++ b/src/runtime/parser.c
@@ -110,8 +110,16 @@ static void reduce(TSParser *parser, TSSymbol symbol, size_t child_count) {
 }
 
 static int reduce_extra(TSParser *parser, TSSymbol symbol) {
-  TSTree *top_node = ts_stack_top_node(&parser->stack);
-  if (top_node->symbol == symbol && !ts_tree_is_extra(top_node)) {
+  TSTree *last_node = NULL;
+  /* Look past any extra nodes on top of the stack to the last non-extra one. */
+  TS_STACK_FROM_TOP(parser->stack, entry, i) {
+    if (!ts_tree_is_extra(entry->node)) {
+      last_node = entry->node;
+      break;
+    }
+  }
+
+  if (last_node && last_node->symbol == symbol) {
     reduce(parser, symbol, 1);
     ts_tree_set_extra(parser->lookahead);
     return 1;
@@ -136,28 +144,13 @@ static int handle_error(TSParser *parser) {
 
   for (;;) {
 
-    /*
-     * If there is no state in the stack for which we can recover with the
-     * current lookahead token, advance to the next token. If no characters
-     * were consumed, advance the lexer to the next character.
-     */
-    size_t prev_position = ts_lexer_position(&parser->lexer);
-    lex(parser, ts_lex_state_error);
-    if (ts_lexer_position(&parser->lexer) == prev_position)
-      if (!ts_lexer_advance(&parser->lexer)) {
-        DEBUG_PARSE("FAIL TO RECOVER");
-        ts_stack_push(&parser->stack, 0, error);
-        ts_tree_release(error);
-        return 0;
-      }
-
     /*
      * Unwind the parse stack until a state is found in which an error is
      * expected and the current lookahead token is expected afterwards.
      */
     size_t error_start_pos = last_token_end;
-    for (size_t i = parser->stack.size - 1; i + 1 > 0; i--) {
-      TSStateId state = parser->stack.entries[i].state;
+    TS_STACK_FROM_TOP(parser->stack, entry, i) {
+      TSStateId state = entry->state;
       TSParseAction action_on_error =
           actions_for_state(parser->language, state)[ts_builtin_sym_error];
 
@@ -168,7 +161,8 @@ static int handle_error(TSParser *parser) {
         if (action_after_error.type != TSParseActionTypeError) {
           DEBUG_PARSE("RECOVER %u", state_after_error);
 
-          error->size = ts_lexer_position(&parser->lexer) - error_start_pos - 1;
+          size_t current_position = ts_lexer_position(&parser->lexer);
+          error->size = current_position - 1 - error_start_pos;
           ts_stack_shrink(&parser->stack, i + 1);
           ts_stack_push(&parser->stack, state_after_error, error);
           ts_tree_release(error);
@@ -176,9 +170,25 @@ static int handle_error(TSParser *parser) {
         }
       }
 
-      TSTree *removed_tree = parser->stack.entries[i].node;
+      TSTree *removed_tree = entry->node;
       error_start_pos -= ts_tree_total_size(removed_tree);
     }
+
+    /*
+     * If there is no state in the stack for which we can recover with the
+     * current lookahead token, advance to the next token. If no characters
+     * were consumed, advance the lexer to the next character.
+     */
+    size_t prev_position = ts_lexer_position(&parser->lexer);
+    lex(parser, ts_lex_state_error);
+    parser->lookahead->padding = 0;
+    if (ts_lexer_position(&parser->lexer) == prev_position)
+      if (!ts_lexer_advance(&parser->lexer)) {
+        DEBUG_PARSE("FAIL TO RECOVER");
+        ts_stack_push(&parser->stack, 0, error);
+        ts_tree_release(error);
+        return 0;
+      }
   }
 }
 
@@ -256,8 +266,13 @@ const TSTree *ts_parser_parse(TSParser *parser, TSInput input,
         break;
 
       case TSParseActionTypeReduceExtra:
-        DEBUG_PARSE("REDUCE EXTRA");
-        reduce_extra(parser, action.data.symbol);
+        if (reduce_extra(parser, action.data.symbol)) {
+          DEBUG_PARSE("REDUCE EXTRA");
+        } else {
+          DEBUG_PARSE("ERROR");
+          if (!handle_error(parser))
+            return get_root(parser);
+        }
         break;
 
       case TSParseActionTypeAccept:
diff --git a/src/runtime/stack.c b/src/runtime/stack.c
index e272157c..c66beb39 100644
--- a/src/runtime/stack.c
+++ b/src/runtime/stack.c
@@ -6,10 +6,10 @@ static size_t INITIAL_STACK_SIZE = 100;
 static TSStateId INITIAL_STATE = 0;
 
 TSStack ts_stack_make() {
-  TSStack result = {
-    .entries = calloc(INITIAL_STACK_SIZE, sizeof(*result.entries)), .size = 0,
-    .capacity = INITIAL_STACK_SIZE,
-  };
+  TSStack result = { .size = 0,
+                     .capacity = INITIAL_STACK_SIZE,
+                     .entries =
+                         calloc(INITIAL_STACK_SIZE, sizeof(*result.entries)), };
   return result;
 }
 
@@ -33,7 +33,8 @@ TSTree *ts_stack_top_node(const TSStack *stack) {
 void ts_stack_push(TSStack *stack, TSStateId state, TSTree *node) {
   if (stack->size == stack->capacity) {
     stack->capacity *= 2;
-    stack->entries = realloc(stack->entries, stack->capacity * sizeof(*stack->entries));
+    stack->entries =
+        realloc(stack->entries, stack->capacity * sizeof(*stack->entries));
   }
   stack->entries[stack->size].state = state;
   stack->entries[stack->size].node = node;
diff --git a/src/runtime/stack.h b/src/runtime/stack.h
index 2ba2582a..adaab82a 100644
--- a/src/runtime/stack.h
+++ b/src/runtime/stack.h
@@ -7,14 +7,15 @@ extern "C" {
 
 #include "tree_sitter/parser.h"
 
+typedef struct {
+  TSTree *node;
+  TSStateId state;
+} TSStackEntry;
+
 typedef struct {
   size_t size;
   size_t capacity;
-  struct {
-    TSTree *node;
-    TSStateId state;
-    int is_extra;
-  } *entries;
+  TSStackEntry *entries;
 } TSStack;
 
 TSStack ts_stack_make();
@@ -25,6 +26,23 @@ TSStateId ts_stack_top_state(const TSStack *stack);
 TSTree *ts_stack_top_node(const TSStack *stack);
 size_t ts_stack_right_position(const TSStack *stack);
 
+/*
+ * Iterate over the stack's entries from the top of the stack downward. In
+ * the loop body, `entry` points at the current TSStackEntry and `index` is
+ * its position in `stack.entries`.
+ *
+ * `entry` starts one past the top entry and is only stepped down after
+ * `index > 0` has been checked, so the loop never forms a pointer before the
+ * start of the array and an empty stack produces zero iterations.
+ *
+ * This expands to a declaration followed by a `for` statement: use it at
+ * most once per scope, and never as the unbraced body of an `if` or loop.
+ */
+#define TS_STACK_FROM_TOP(stack, entry, index)               \
+  size_t index = (stack).size;                               \
+  for (TSStackEntry *entry = (stack).entries + (stack).size; \
+       index > 0 && (entry--, index--, 1);)
+
 #ifdef __cplusplus
 }
 #endif