feat: add error information in the progress callback

This allows users to bail out of parsing from the progress callback as
soon as an error has *definitely* been detected, i.e. once all possible
stack versions have a non-zero error cost.

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
Allan Clements 2025-01-25 01:21:04 -05:00 committed by Amaan Qureshi
parent ca087d2c07
commit cda634a1c4
9 changed files with 87 additions and 4 deletions

View file

@ -1029,6 +1029,35 @@ fn test_parsing_with_timeout_during_balancing() {
});
}
#[test]
fn test_parsing_with_timeout_when_error_detected() {
    let mut parser = Parser::new();
    parser.set_language(&get_language("json")).unwrap();

    // The input never ends: "[" at offset 0, "0," for every requested offset up to and
    // including 1000, and an invalid chunk for any offset beyond that.
    let invalid_chunk = "!,";
    let mut last_offset = 0;

    let tree = parser.parse_with_options(
        &mut |byte_index, _| {
            if byte_index == 0 {
                "["
            } else if byte_index <= 1000 {
                "0,"
            } else {
                invalid_chunk
            }
        },
        None,
        Some(ParseOptions::new().progress_callback(&mut |state| {
            // Remember how far the parser got, and cancel as soon as it reports an error.
            last_offset = state.current_byte_offset();
            state.has_error()
        })),
    );

    // The progress callback also fires at the end of parsing, but the point of this test is
    // that parsing stops right where the error is detected: the last offset reported to the
    // callback must be 1000 plus the length of the invalid chunk (i.e. 1002), and no tree may
    // be produced.
    assert_eq!(last_offset, 1000 + invalid_chunk.len());
    assert!(tree.is_none());
}
// Included Ranges
#[test]

View file

@ -82,6 +82,7 @@ pub struct TSInput {
pub struct TSParseState {
/// NOTE(review): presumably the caller-supplied payload handed back to the
/// progress callback -- confirm against the C header.
pub payload: *mut ::core::ffi::c_void,
/// Byte offset in the document that the parser had reached when this state
/// was last refreshed for the progress callback.
pub current_byte_offset: u32,
/// Whether the parser has encountered an error during parsing.
pub has_error: bool,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]

View file

@ -147,6 +147,11 @@ impl ParseState {
pub const fn current_byte_offset(&self) -> usize {
unsafe { self.0.as_ref() }.current_byte_offset as usize
}
/// Returns the `has_error` flag of the underlying parse state, i.e. whether
/// the parser has reported an error to the progress callback.
#[must_use]
pub const fn has_error(&self) -> bool {
    // NOTE(review): this dereference assumes `self.0` points at a live parse
    // state for as long as `self` is borrowed -- confirm where `ParseState`
    // values are constructed.
    let raw_state = unsafe { self.0.as_ref() };
    raw_state.has_error
}
}
/// A stateful object that is passed into a [`QueryProgressCallback`]

View file

@ -23,9 +23,9 @@ mergeInto(LibraryManager.library, {
}
},
tree_sitter_progress_callback(currentOffset) {
tree_sitter_progress_callback(currentOffset, hasError) {
if (Module.currentProgressCallback) {
return Module.currentProgressCallback({ currentOffset });
return Module.currentProgressCallback({ currentOffset, hasError });
}
return false;
},

View file

@ -139,7 +139,8 @@ extern void tree_sitter_log_callback(
);
extern bool tree_sitter_progress_callback(
uint32_t current_offset
uint32_t current_offset,
bool has_error
);
extern bool tree_sitter_query_progress_callback(
@ -178,7 +179,7 @@ static void call_log_callback(
static bool progress_callback(
TSParseState *state
) {
return tree_sitter_progress_callback(state->current_byte_offset);
return tree_sitter_progress_callback(state->current_byte_offset, state->has_error);
}
static bool query_progress_callback(

View file

@ -53,6 +53,9 @@ export interface ParseOptions {
export interface ParseState {
/** The byte offset in the document that the parser is at. */
currentOffset: number;
/**
 * Whether the parser has encountered an error during parsing.
 *
 * A progress callback can return this value to stop parsing as soon as an
 * error has been detected.
 */
hasError: boolean;
}
/**

View file

@ -413,5 +413,34 @@ describe('Parser', () => {
{ progressCallback },
)).toBeNull();
});
it('times out when an error is detected', { timeout: 5000 }, () => {
  parser.setLanguage(JSON);

  const erroneousCode = '!,';
  let offset = 0;

  // Record how far the parser got, and cancel as soon as it reports an error.
  const progressCallback = (state: ParseState) => {
    offset = state.currentOffset;
    return state.hasError;
  };

  const tree = parser.parse(
    // Endless input: '[' at index 0, '0,' for indices 1 through 999, and the
    // erroneous code for every index after that.
    (index) => {
      if (index === 0) return '[';
      return index >= 1 && index < 1000 ? '0,' : erroneousCode;
    },
    null,
    { progressCallback },
  );

  // The progress callback also fires at the end of parsing, but the point of this test is that
  // parsing stops right where the error is detected: the last offset reported to the callback
  // must be 1000 plus the length of the erroneous code. In this WASM test the expected value is
  // doubled because JavaScript strings are UTF-16 encoded.
  expect(offset).toBe((1000 + erroneousCode.length) * 2);
  expect(tree).toBeNull();
});
});
});

View file

@ -94,6 +94,7 @@ typedef struct TSInput {
typedef struct TSParseState {
// NOTE(review): presumably the caller-supplied payload handed back to the
// progress callback -- confirm against TSParseOptions.
void *payload;
// Byte offset in the document that the parser had reached when this state was
// last refreshed for the progress callback.
uint32_t current_byte_offset;
// Whether the parser has encountered an error during parsing.
bool has_error;
} TSParseState;
typedef struct TSParseOptions {

View file

@ -116,6 +116,7 @@ struct TSParser {
unsigned included_range_difference_index;
bool has_scanner_error;
bool canceled_balancing;
bool has_error;
};
typedef struct {
@ -1419,6 +1420,16 @@ static void ts_parser__recover(
self->stack, version, ts_subtree_last_external_token(lookahead)
);
}
bool has_error = true;
for (unsigned i = 0; i < ts_stack_version_count(self->stack); i++) {
ErrorStatus status = ts_parser__version_status(self, i);
if (!status.is_in_error) {
has_error = false;
break;
}
}
self->has_error = has_error;
}
static void ts_parser__handle_error(
@ -1525,6 +1536,7 @@ static bool ts_parser__check_progress(TSParser *self, Subtree *lookahead, const
}
if (self->parse_options.progress_callback && position != NULL) {
self->parse_state.current_byte_offset = *position;
self->parse_state.has_error = self->has_error;
}
if (
self->operation_count == 0 &&
@ -1929,6 +1941,7 @@ TSParser *ts_parser_new(void) {
self->timeout_duration = 0;
self->language = NULL;
self->has_scanner_error = false;
self->has_error = false;
self->canceled_balancing = false;
self->external_scanner_payload = NULL;
self->end_clock = clock_null();
@ -2066,6 +2079,7 @@ void ts_parser_reset(TSParser *self) {
}
self->accept_count = 0;
self->has_scanner_error = false;
self->has_error = false;
self->parse_options = (TSParseOptions) {0};
self->parse_state = (TSParseState) {0};
}