diff --git a/cli/src/tests/helpers/scope_sequence.rs b/cli/src/tests/helpers/scope_sequence.rs index 2f904025..b204c568 100644 --- a/cli/src/tests/helpers/scope_sequence.rs +++ b/cli/src/tests/helpers/scope_sequence.rs @@ -58,7 +58,7 @@ impl ScopeSequence { let mut position = Point { row: 0, column: 0 }; for (i, stack) in self.0.iter().enumerate() { let other_stack = &other.0[i]; - if *stack != *other_stack { + if *stack != *other_stack && ![b'\r', b'\n'].contains(&text[i]) { let containing_range = known_changed_ranges .iter() .find(|range| range.start_point <= position && position < range.end_point); diff --git a/lib/src/language.h b/lib/src/language.h index 1b65f25f..7234685e 100644 --- a/lib/src/language.h +++ b/lib/src/language.h @@ -283,6 +283,31 @@ static inline void ts_language_aliases_for_symbol( } } +static inline void ts_language_write_symbol_as_dot_string( + const TSLanguage *self, + FILE *f, + TSSymbol symbol +) { + const char *name = ts_language_symbol_name(self, symbol); + for (const char *c = name; *c; c++) { + switch (*c) { + case '"': + case '\\': + fputc('\\', f); + fputc(*c, f); + break; + case '\n': + fputs("\\n", f); + break; + case '\t': + fputs("\\t", f); + break; + default: + fputc(*c, f); + break; + } + } +} #ifdef __cplusplus } diff --git a/lib/src/parser.c b/lib/src/parser.c index 6fad1664..a45204e1 100644 --- a/lib/src/parser.c +++ b/lib/src/parser.c @@ -160,7 +160,7 @@ static void ts_parser__log(TSParser *self) { if (self->dot_graph_file) { fprintf(self->dot_graph_file, "graph {\nlabel=\""); for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) { - if (*c == '"') fputc('\\', self->dot_graph_file); + if (*c == '"' || *c == '\\') fputc('\\', self->dot_graph_file); fputc(*c, self->dot_graph_file); } fprintf(self->dot_graph_file, "\"\n}\n\n"); @@ -393,8 +393,8 @@ static Subtree ts_parser__lex( return NULL_SUBTREE; } - Length start_position = ts_stack_position(self->stack, version); - Subtree external_token = 
ts_stack_last_external_token(self->stack, version); + const Length start_position = ts_stack_position(self->stack, version); + const Subtree external_token = ts_stack_last_external_token(self->stack, version); const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state @@ -408,6 +408,8 @@ static Subtree ts_parser__lex( Length error_start_position = length_zero(); Length error_end_position = length_zero(); uint32_t lookahead_end_byte = 0; + uint32_t external_scanner_state_len = 0; + bool external_scanner_state_changed = false; ts_lexer_reset(&self->lexer, start_position); for (;;) { @@ -429,22 +431,36 @@ static Subtree ts_parser__lex( ); ts_lexer_finish(&self->lexer, &lookahead_end_byte); - // Zero-length external tokens are generally allowed, but they're not - // allowed right after a syntax error. This is for two reasons: - // 1. After a syntax error, the lexer is looking for any possible token, - // as opposed to the specific set of tokens that are valid in some - // parse state. In this situation, it's very easy for an external - // scanner to produce unwanted zero-length tokens. - // 2. The parser sometimes inserts *missing* tokens to recover from - // errors. These tokens are also zero-length. If we allow more - // zero-length tokens to be created after missing tokens, it - // can lead to infinite loops. Forbidding zero-length tokens - // right at the point of error recovery is a conservative strategy - // for preventing this kind of infinite loop. 
- if (found_token && ( - self->lexer.token_end_position.bytes > current_position.bytes || - (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) - )) { + if (found_token) { + external_scanner_state_len = self->language->external_scanner.serialize( + self->external_scanner_payload, + self->lexer.debug_buffer + ); + external_scanner_state_changed = !ts_external_scanner_state_eq( + ts_subtree_external_scanner_state(external_token), + self->lexer.debug_buffer, + external_scanner_state_len + ); + + // When recovering from an error, ignore any zero-length external tokens + // unless they have changed the external scanner's state. This helps to + // avoid infinite loops which could otherwise occur, because the lexer is + // looking for any possible token, instead of looking for the specific set of + // tokens that are valid in some parse state. + if ( + self->lexer.token_end_position.bytes == current_position.bytes && + (error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) && + !external_scanner_state_changed + ) { + LOG( + "ignore_empty_external_token symbol:%s", + SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]) + ) + found_token = false; + } + } + + if (found_token) { found_external_token = true; called_get_column = self->lexer.did_get_column; break; @@ -508,11 +524,6 @@ static Subtree ts_parser__lex( parse_state, self->language ); - - LOG_LOOKAHEAD( - SYM_NAME(ts_subtree_symbol(result)), - ts_subtree_total_size(result).bytes - ); } else { if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { self->lexer.token_start_position = self->lexer.token_end_position; @@ -554,23 +565,20 @@ static Subtree ts_parser__lex( ); if (found_external_token) { - unsigned length = self->language->external_scanner.serialize( - self->external_scanner_payload, - self->lexer.debug_buffer - ); + MutableSubtree mut_result = ts_subtree_to_mut_unsafe(result); ts_external_scanner_state_init( - 
&((SubtreeHeapData *)result.ptr)->external_scanner_state, + &mut_result.ptr->external_scanner_state, self->lexer.debug_buffer, - length + external_scanner_state_len ); + mut_result.ptr->has_external_scanner_state_change = external_scanner_state_changed; } - - LOG_LOOKAHEAD( - SYM_NAME(ts_subtree_symbol(result)), - ts_subtree_total_size(result).bytes - ); } + LOG_LOOKAHEAD( + SYM_NAME(ts_subtree_symbol(result)), + ts_subtree_total_size(result).bytes + ); return result; } @@ -1205,6 +1213,15 @@ static void ts_parser__recover( return; } + if ( + did_recover && + ts_subtree_has_external_scanner_state_change(lookahead) + ) { + ts_stack_halt(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + return; + } + // If the parser is still in the error state at the end of the file, just wrap everything // in an ERROR node and terminate. if (ts_subtree_is_eof(lookahead)) { @@ -1539,6 +1556,13 @@ static bool ts_parser__advance( continue; } + // A non-terminal extra rule was reduced and merged into an existing + // stack version. This version can be discarded. + if (!lookahead.ptr) { + ts_stack_halt(self->stack, version); + return true; + } + // If there were no parse actions for the current lookahead token, then // it is not valid in this state. 
If the current lookahead token is a // keyword, then switch to treating it as the normal word token if that @@ -1928,6 +1952,7 @@ TSTree *ts_parser_parse( } } while (version_count != 0); + assert(self->finished_tree.ptr); ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); LOG("done"); LOG_TREE(self->finished_tree); diff --git a/lib/src/stack.c b/lib/src/stack.c index 2a11abd8..6962a157 100644 --- a/lib/src/stack.c +++ b/lib/src/stack.c @@ -846,11 +846,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) fprintf(f, "label=\""); bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree); if (quoted) fprintf(f, "'"); - const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree)); - for (const char *c = name; *c; c++) { - if (*c == '\"' || *c == '\\') fprintf(f, "\\"); - fprintf(f, "%c", *c); - } + ts_language_write_symbol_as_dot_string(language, f, ts_subtree_symbol(link.subtree)); if (quoted) fprintf(f, "'"); fprintf(f, "\""); fprintf( diff --git a/lib/src/subtree.c b/lib/src/subtree.c index 1e53f7d2..d6cd2d71 100644 --- a/lib/src/subtree.c +++ b/lib/src/subtree.c @@ -21,8 +21,6 @@ typedef struct { #define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX #define TS_MAX_TREE_POOL_SIZE 32 -static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; - // ExternalScannerState void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) { @@ -58,11 +56,10 @@ const char *ts_external_scanner_state_data(const ExternalScannerState *self) { } } -bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) { - return a == b || ( - a->length == b->length && - !memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length) - ); +bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *buffer, unsigned length) { + return + a->length == length && + 
memcmp(ts_external_scanner_state_data(a), buffer, length) == 0; } // SubtreeArray @@ -214,6 +211,7 @@ Subtree ts_subtree_new_leaf( .fragile_right = false, .has_changes = false, .has_external_tokens = has_external_tokens, + .has_external_scanner_state_change = false, .depends_on_column = depends_on_column, .is_missing = false, .is_keyword = is_keyword, @@ -381,6 +379,7 @@ void ts_subtree_summarize_children( self.ptr->node_count = 1; self.ptr->has_external_tokens = false; self.ptr->depends_on_column = false; + self.ptr->has_external_scanner_state_change = false; self.ptr->dynamic_precedence = 0; uint32_t structural_index = 0; @@ -398,6 +397,10 @@ void ts_subtree_summarize_children( self.ptr->depends_on_column = true; } + if (ts_subtree_has_external_scanner_state_change(child)) { + self.ptr->has_external_scanner_state_change = true; + } + if (i == 0) { self.ptr->padding = ts_subtree_padding(child); self.ptr->size = ts_subtree_size(child); @@ -521,6 +524,7 @@ MutableSubtree ts_subtree_new_node( .visible = metadata.visible, .named = metadata.named, .has_changes = false, + .has_external_scanner_state_change = false, .fragile_left = fragile, .fragile_right = fragile, .is_keyword = false, @@ -830,18 +834,6 @@ static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) { return snprintf(s, n, "%d", c); } -static void ts_subtree__write_dot_string(FILE *f, const char *string) { - for (const char *c = string; *c; c++) { - if (*c == '"') { - fputs("\\\"", f); - } else if (*c == '\n') { - fputs("\\n", f); - } else { - fputc(*c, f); - } - } -} - static const char *ROOT_FIELD = "__ROOT__"; static size_t ts_subtree__write_to_string( @@ -971,7 +963,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, TSSymbol symbol = alias_symbol ? 
alias_symbol : subtree_symbol; uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self); fprintf(f, "tree_%p [label=\"", (void *)self); - ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol)); + ts_language_write_symbol_as_dot_string(language, f, symbol); fprintf(f, "\""); if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext"); @@ -1024,14 +1016,26 @@ void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE * fprintf(f, "}\n"); } -bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) { - const ExternalScannerState *state1 = &empty_state; - const ExternalScannerState *state2 = &empty_state; - if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) { - state1 = &self.ptr->external_scanner_state; +const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self) { + static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; + if ( + self.ptr && + !self.data.is_inline && + self.ptr->has_external_tokens && + self.ptr->child_count == 0 + ) { + return &self.ptr->external_scanner_state; + } else { + return &empty_state; } - if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) { - state2 = &other.ptr->external_scanner_state; - } - return ts_external_scanner_state_eq(state1, state2); +} + +bool ts_subtree_external_scanner_state_eq(Subtree a, Subtree b) { + const ExternalScannerState *state_a = ts_subtree_external_scanner_state(a); + const ExternalScannerState *state_b = ts_subtree_external_scanner_state(b); + return ts_external_scanner_state_eq( + state_a, + ts_external_scanner_state_data(state_b), + state_b->length + ); } diff --git a/lib/src/subtree.h b/lib/src/subtree.h index 5e246882..4a02b8fe 100644 --- a/lib/src/subtree.h +++ b/lib/src/subtree.h @@ -114,7 +114,7 @@ typedef struct { Length size; uint32_t lookahead_bytes; uint32_t error_cost; - uint32_t child_count; + uint16_t child_count; TSSymbol 
symbol; TSStateId parse_state; @@ -125,6 +125,7 @@ typedef struct { bool fragile_right : 1; bool has_changes : 1; bool has_external_tokens : 1; + bool has_external_scanner_state_change : 1; bool depends_on_column: 1; bool is_missing : 1; bool is_keyword : 1; @@ -135,8 +136,8 @@ typedef struct { uint32_t visible_child_count; uint32_t named_child_count; uint32_t node_count; - uint32_t repeat_depth; int32_t dynamic_precedence; + uint16_t repeat_depth; uint16_t production_id; struct { TSSymbol symbol; @@ -174,6 +175,8 @@ typedef struct { void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned); const char *ts_external_scanner_state_data(const ExternalScannerState *); +bool ts_external_scanner_state_eq(const ExternalScannerState *a, const char *, unsigned); +void ts_external_scanner_state_delete(ExternalScannerState *self); void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); void ts_subtree_array_clear(SubtreePool *, SubtreeArray *); @@ -206,6 +209,7 @@ Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all); void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *); Subtree ts_subtree_last_external_token(Subtree); +const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self); bool ts_subtree_external_scanner_state_eq(Subtree, Subtree); #define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name) @@ -331,6 +335,10 @@ static inline bool ts_subtree_has_external_tokens(Subtree self) { return self.data.is_inline ? false : self.ptr->has_external_tokens; } +static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) { + return self.data.is_inline ? false : self.ptr->has_external_scanner_state_change; +} + static inline bool ts_subtree_depends_on_column(Subtree self) { return self.data.is_inline ? 
false : self.ptr->depends_on_column; } diff --git a/script/test b/script/test index df7f643b..619c190f 100755 --- a/script/test +++ b/script/test @@ -14,11 +14,9 @@ OPTIONS -a Compile C code with the Clang static analyzer - -l Run only the corpus tests for the given language - -e Run only the corpus tests whose name contain the given string - -t Run only the given trial number of randomized test + -i Run the given number of iterations of randomized tests (default 10) -s Set the seed used to control random behavior @@ -36,7 +34,7 @@ export RUST_BACKTRACE=full mode=normal test_flags="" -while getopts "adDghl:e:s:t:" option; do +while getopts "adDghl:e:s:i:" option; do case ${option} in h) usage @@ -62,6 +60,9 @@ while getopts "adDghl:e:s:t:" option; do s) export TREE_SITTER_SEED=${OPTARG} ;; + i) + export TREE_SITTER_ITERATIONS=${OPTARG} + ;; d) export TREE_SITTER_LOG=1 ;; diff --git a/test/fixtures/error_corpus/python_errors.txt b/test/fixtures/error_corpus/python_errors.txt index 957a4c86..edabb510 100644 --- a/test/fixtures/error_corpus/python_errors.txt +++ b/test/fixtures/error_corpus/python_errors.txt @@ -1,29 +1,111 @@ -========================================== -errors in if statements -========================================== +============================================= +incomplete condition in if statement +============================================= if a is: - print b - print c + print b + print c +print d --- (module - (if_statement (identifier) (ERROR) (block - (print_statement (identifier)) - (print_statement (identifier))))) + (if_statement + condition: (identifier) + (ERROR) + consequence: (block + (print_statement argument: (identifier)) + (print_statement argument: (identifier)))) + (print_statement argument: (identifier))) ========================================== -errors in function definitions +extra colon in function definition ========================================== def a():: b c +d --- (module - (function_definition (identifier) 
(parameters) (ERROR) (block - (expression_statement (identifier)) - (expression_statement (identifier))))) + (function_definition + name: (identifier) + parameters: (parameters) + (ERROR) + body: (block + (expression_statement (identifier)) + (expression_statement (identifier)))) + (expression_statement (identifier))) + +======================================================== +stray if keyword in function definition +======================================================== + +def a(): + if + +--- + +(module + (function_definition + name: (identifier) + parameters: (parameters) + (ERROR) + body: (block))) + +======================================================== +incomplete if statement in function definition +======================================================== + +def a(): + if a + +--- + +(module + (function_definition + name: (identifier) + parameters: (parameters) + (ERROR (identifier)) + body: (block))) + +======================================================== +incomplete expression before triple-quoted string +======================================================== + +def a(): + b. + """ + c + """ + +--- + +(module + (function_definition + name: (identifier) + parameters: (parameters) + (ERROR (identifier)) + body: (block + (expression_statement (string))))) + +=========================================== +incomplete definition in class definition +=========================================== + +class A: + def + +b + +--- + +(module + (class_definition + name: (identifier) + (ERROR) + body: (block)) + (expression_statement + (identifier))) \ No newline at end of file