feat: allow external scanners to use the logger

Co-authored-by: Amaan Qureshi <amaanq12@gmail.com>
This commit is contained in:
Ron Panduwana 2024-08-18 01:46:28 +07:00 committed by GitHub
parent fec6c77da8
commit 2bb20fe2fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 40 additions and 0 deletions

View file

@ -1422,6 +1422,30 @@ if foo && bar || baz {}
parser.parse(&input, Some(&tree)).unwrap();
}
/// Checks that a message emitted by an external scanner reaches the parser's
/// logger callback as a `LogType::Lex` entry.
///
/// The test builds the `external_tokens` fixture grammar, installs a logger
/// that watches for the text "Found a percent string", parses an input that
/// contains a percent string, and asserts that the message was observed.
#[test]
fn test_parsing_with_scanner_logging() {
    // Generate and load the parser for the `external_tokens` fixture grammar.
    let dir = fixtures_dir().join("test_grammars").join("external_tokens");
    let grammar_path = dir.join("grammar.js");
    let grammar_json = load_grammar_file(&grammar_path, None).unwrap();
    let (grammar_name, parser_code) = generate_parser_for_grammar(&grammar_json).unwrap();

    let mut parser = Parser::new();
    parser
        .set_language(&get_test_language(&grammar_name, &parser_code, Some(&dir)))
        .unwrap();

    // The flag only ever flips from `false` to `true`, once the expected
    // lex-level message has been seen.
    let mut found = false;
    parser.set_logger(Some(Box::new(|log_type, message| {
        found |= log_type == LogType::Lex && message == "Found a percent string";
    })));

    parser
        .parse("x + %(sup (external) scanner?)", None)
        .unwrap();

    assert!(found);
}
const fn simple_range(start: usize, end: usize) -> Range {
Range {
start_byte: start,

View file

@ -862,6 +862,7 @@ This function is responsible for recognizing external tokens. It should return `
* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this function by reading from the start of the line.
* **`bool (*is_at_included_range_start)(const TSLexer *)`** - A function for checking whether the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), the scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`.
* **`bool (*eof)(const TSLexer *)`** - A function for determining whether the lexer is at the end of the file. The value of `lookahead` will be `0` at the end of a file, but this function should be used instead of checking for that value because the `0` or "NUL" value is also a valid character that could be present in the file being parsed.
* **`void (*log)(const TSLexer *, const char * format, ...)`** - A `printf`-like function for logging. The log is viewable through e.g. `tree-sitter parse --debug` or the browser's console after checking the `log` option in the [Playground](./playground).
The third argument to the `scan` function is an array of booleans that indicates which of the external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic.

View file

@ -3,6 +3,7 @@
#include "./subtree.h"
#include "./length.h"
#include "./unicode.h"
#include <stdarg.h>
#define LOG(message, character) \
if (self->logger.log) { \
@ -284,6 +285,17 @@ static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
}
}
// Logging callback installed as the `log` member of the lexer's `TSLexer`
// interface. Formats the `printf`-style arguments into the lexer's debug
// buffer and hands the resulting string to the configured logger as a
// `TSLogTypeLex` message. Does nothing when no logger callback is set.
static void ts_lexer__log(const TSLexer *_self, const char *fmt, ...) {
  Lexer *self = (Lexer *)_self;

  // Without a logger there is nowhere to send the message, so skip formatting.
  if (!self->logger.log) return;

  va_list args;
  va_start(args, fmt);
  // The size limit passed to vsnprintf bounds how much of the formatted
  // message is written into the debug buffer.
  vsnprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, fmt, args);
  va_end(args);

  self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer);
}
void ts_lexer_init(Lexer *self) {
*self = (Lexer) {
.data = {
@ -295,6 +307,7 @@ void ts_lexer_init(Lexer *self) {
.get_column = ts_lexer__get_column,
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
.eof = ts_lexer__eof,
.log = ts_lexer__log,
.lookahead = 0,
.result_symbol = 0,
},

View file

@ -47,6 +47,7 @@ struct TSLexer {
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
void (*log)(const TSLexer *, const char *, ...);
};
typedef enum {

View file

@ -77,6 +77,7 @@ bool tree_sitter_external_tokens_external_scanner_scan(
for (;;) {
if (scanner->depth == 0) {
lexer->log(lexer, "Found a percent string");
lexer->result_symbol = percent_string;
return true;
}