diff --git a/include/tree_sitter/parser.h b/include/tree_sitter/parser.h index e1fcff9a..bb04a7db 100644 --- a/include/tree_sitter/parser.h +++ b/include/tree_sitter/parser.h @@ -10,14 +10,20 @@ extern "C" { #include #include "tree_sitter/runtime.h" -typedef struct TSTree TSTree; - #define ts_lex_state_error 0 #define TS_DEBUG_BUFFER_SIZE 512 -typedef struct TSLexer { - TSInput input; +typedef struct TSTree TSTree; +typedef unsigned short TSStateId; +typedef struct TSLexer { + // Public + void (*start_fn)(struct TSLexer *, TSStateId); + void (*start_token_fn)(struct TSLexer *); + bool (*advance_fn)(struct TSLexer *, TSStateId); + TSTree *(*accept_fn)(struct TSLexer *, TSSymbol, int, const char *); + + // Private const char *chunk; size_t chunk_start; size_t chunk_size; @@ -29,15 +35,11 @@ typedef struct TSLexer { size_t lookahead_size; int32_t lookahead; - TSTree *(*accept_fn)(struct TSLexer *, TSSymbol, int); - bool (*advance_fn)(struct TSLexer *); - + TSInput input; TSDebugger debugger; char debug_buffer[TS_DEBUG_BUFFER_SIZE]; } TSLexer; -typedef unsigned short TSStateId; - typedef enum { TSParseActionTypeError, TSParseActionTypeShift, @@ -67,44 +69,34 @@ struct TSLanguage { TSTree *(*lex_fn)(TSLexer *, TSStateId); }; -#define DEBUG_LEX(...) \ - if (lexer->debugger.debug_fn) { \ - snprintf(lexer->debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \ - lexer->debugger.debug_fn(lexer->debugger.data, lexer->debug_buffer); \ +/* + * Lexer Macros + */ + +#define START_LEXER() \ + lexer->start_fn(lexer, lex_state); \ + int32_t lookahead; \ + next_state: \ + lookahead = lexer->lookahead; + +#define START_TOKEN() lexer->start_token_fn(lexer); + +#define ADVANCE(state_index) \ + { \ + lexer->advance_fn(lexer, state_index); \ + lex_state = state_index; \ + goto next_state; \ } -#define START_LEXER() \ - DEBUG_LEX("start state:%d", lex_state); \ - int32_t lookahead; \ - next_state: \ - lookahead = lexer->lookahead; \ - DEBUG_LEX((0 < lookahead &&lookahead < 255 ? "lookahead char:'%c'" \ - : "lookahead char:%d"), \ - lookahead); +#define ACCEPT_TOKEN(symbol) \ + return lexer->accept_fn(lexer, symbol, ts_hidden_symbol_flags[symbol], \ + ts_symbol_names[symbol]); -#define START_TOKEN() \ - DEBUG_LEX("start_token chars:%lu", lexer->current_position.chars); \ - lexer->token_start_position = lexer->current_position; +#define LEX_ERROR() ACCEPT_TOKEN(ts_builtin_sym_error); -#define ADVANCE(state_index) \ - { \ - DEBUG_LEX("advance state:%d", state_index); \ - lexer->advance_fn(lexer); \ - lex_state = state_index; \ - goto next_state; \ - } - -#define ACCEPT_TOKEN(symbol) \ - { \ - DEBUG_LEX("accept_token sym:%s", ts_symbol_names[symbol]); \ - return lexer->accept_fn(lexer, symbol, ts_hidden_symbol_flags[symbol]); \ - } - -#define LEX_ERROR() \ - { \ - DEBUG_LEX("error"); \ - return lexer->accept_fn(lexer, ts_builtin_sym_error, 0); \ - } +/* + * Parse Table Macros + */ #define SHIFT(to_state_value) \ { \ diff --git a/src/runtime/lexer.c b/src/runtime/lexer.c index b3409615..3bd2bcdc 100644 --- a/src/runtime/lexer.c +++ b/src/runtime/lexer.c @@ -5,54 +5,66 @@ #include "runtime/debugger.h" #include "utf8proc.h" +#define DEBUG(...) \ + if (lexer->debugger.debug_fn) { \ + snprintf(lexer->debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \ + lexer->debugger.debug_fn(lexer->debugger.data, lexer->debug_buffer); \ + } + static const char *empty_chunk = ""; -static void ts_lexer_read_next_chunk(TSLexer *lexer) { +static void read_next_chunk(TSLexer *lexer) { TSInput input = lexer->input; if (lexer->current_position.bytes != lexer->chunk_start + lexer->chunk_size) input.seek_fn(input.data, lexer->current_position); + lexer->chunk_start = lexer->current_position.bytes; lexer->chunk = input.read_fn(input.data, &lexer->chunk_size); if (!lexer->chunk_size) lexer->chunk = empty_chunk; } -static bool advance(TSLexer *lexer) { - - /* - * Return false if the Lexer has already reached the end of the input. - */ - if (lexer->chunk == empty_chunk) - return false; - - /* - * Increment the Lexer's position. - */ - if (lexer->lookahead_size) { - lexer->current_position.bytes += lexer->lookahead_size; - lexer->current_position.chars += 1; - } - - /* - * Request a new chunk of text from the Input if the Lexer has reached - * the end of the current chunk. - */ - if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) { - ts_lexer_read_next_chunk(lexer); - } - - /* - * Read the next unicode character from the current chunk of text. - */ +static void read_lookahead(TSLexer *lexer) { size_t position_in_chunk = lexer->current_position.bytes - lexer->chunk_start; lexer->lookahead_size = utf8proc_iterate( (const uint8_t *)lexer->chunk + position_in_chunk, lexer->chunk_size - position_in_chunk + 1, &lexer->lookahead); + DEBUG((0 < lexer->lookahead && lexer->lookahead < 256) ? "lookahead char:'%c'" + : "lookahead char:%d", + lexer->lookahead); +} + +static void start(TSLexer *lexer, TSStateId lex_state) { + DEBUG("start_lex state:%d", lex_state); +} + +static void start_token(TSLexer *lexer) { + DEBUG("start_token chars:%lu", lexer->current_position.chars); + lexer->token_start_position = lexer->current_position; +} + +static bool advance(TSLexer *lexer, TSStateId state) { + DEBUG("advance state:%d", state); + + if (lexer->chunk == empty_chunk) + return false; + + if (lexer->lookahead_size) { + lexer->current_position.bytes += lexer->lookahead_size; + lexer->current_position.chars += 1; + } + + if (lexer->current_position.bytes >= lexer->chunk_start + lexer->chunk_size) + read_next_chunk(lexer); + + read_lookahead(lexer); return true; } -static TSTree *accept(TSLexer *lexer, TSSymbol symbol, int is_hidden) { +static TSTree *accept(TSLexer *lexer, TSSymbol symbol, int is_hidden, + const char *symbol_name) { + DEBUG("accept_token sym:%s", symbol_name); TSLength size = ts_length_sub(lexer->current_position, lexer->token_start_position); TSLength padding = @@ -64,14 +76,15 @@ static TSTree *accept(TSLexer *lexer, TSSymbol symbol, int is_hidden) { } /* - * The `advance` and `accept` methods are stored as fields on the Lexer so - * that generated parsers can call them without needing to be linked against - * this library. + * The lexer's methods are stored as struct fields so that generated parsers + * can call them without needing to be linked against this library. */ + TSLexer ts_lexer_make() { - TSLexer result = (TSLexer) { .advance_fn = advance, + TSLexer result = (TSLexer) { .start_fn = start, + .start_token_fn = start_token, + .advance_fn = advance, .accept_fn = accept, - .debugger = ts_debugger_null(), .chunk = NULL, .chunk_start = 0, .chunk_size = 0, @@ -79,17 +92,14 @@ TSLexer ts_lexer_make() { .token_start_position = ts_length_zero(), .token_end_position = ts_length_zero(), .lookahead = 0, - .lookahead_size = 0, }; + .lookahead_size = 0, + .debugger = ts_debugger_null() }; return result; } void ts_lexer_reset(TSLexer *lexer, TSLength position) { - lexer->lookahead = 0; - lexer->lookahead_size = 0; - lexer->token_end_position = position; lexer->current_position = position; - ts_lexer_read_next_chunk(lexer); - - lexer->advance_fn(lexer); + read_next_chunk(lexer); + read_lookahead(lexer); } diff --git a/src/runtime/parser.c b/src/runtime/parser.c index 17cbabac..eb001c50 100644 --- a/src/runtime/parser.c +++ b/src/runtime/parser.c @@ -13,7 +13,7 @@ * Debugging */ -#define DEBUG_PARSE(...) \ +#define DEBUG(...) \ if (parser->debugger.debug_fn) { \ snprintf(parser->lexer.debug_buffer, TS_DEBUG_BUFFER_SIZE, __VA_ARGS__); \ parser->debugger.debug_fn(parser->debugger.data, \ @@ -50,7 +50,7 @@ static TSLength break_down_left_stack(TSParser *parser, TSInputEdit edit) { if (left_subtree_end.chars < edit.position && !children) break; - DEBUG_PARSE("pop_left sym:%s", SYM_NAME(node->symbol)); + DEBUG("pop_left sym:%s", SYM_NAME(node->symbol)); parser->stack.size--; left_subtree_end = ts_length_sub(left_subtree_end, ts_tree_total_size(node)); @@ -62,7 +62,7 @@ static TSLength break_down_left_stack(TSParser *parser, TSInputEdit edit) { TSStateId next_state = action.type == TSParseActionTypeShift ? action.data.to_state : state; - DEBUG_PARSE("push_left sym:%s", SYM_NAME(child->symbol)); + DEBUG("push_left sym:%s", SYM_NAME(child->symbol)); ts_stack_push(&parser->stack, next_state, child); left_subtree_end = ts_length_add(left_subtree_end, ts_tree_total_size(child)); @@ -74,14 +74,14 @@ static TSLength break_down_left_stack(TSParser *parser, TSInputEdit edit) { if (right_subtree_start < edit.position + edit.chars_inserted) break; - DEBUG_PARSE("push_right sym:%s", SYM_NAME(child->symbol)); + DEBUG("push_right sym:%s", SYM_NAME(child->symbol)); ts_stack_push(&parser->right_stack, 0, child); } ts_tree_release(node); } - DEBUG_PARSE("reuse_left chars:%lu", left_subtree_end.chars); + DEBUG("reuse_left chars:%lu", left_subtree_end.chars); return left_subtree_end; } @@ -111,7 +111,7 @@ static TSTree *break_down_right_stack(TSParser *parser) { size_t child_count; TSTree **children = ts_tree_children(node, &child_count); - DEBUG_PARSE("pop_right %s", SYM_NAME(node->symbol)); + DEBUG("pop_right %s", SYM_NAME(node->symbol)); stack->size--; right_subtree_start += ts_tree_total_size(node).chars; @@ -120,7 +120,7 @@ static TSTree *break_down_right_stack(TSParser *parser) { break; TSTree *child = children[i]; - DEBUG_PARSE("push_right sym:%s", SYM_NAME(child->symbol)); + DEBUG("push_right sym:%s", SYM_NAME(child->symbol)); ts_stack_push(stack, 0, child); right_subtree_start -= ts_tree_total_size(child).chars; } @@ -133,7 +133,7 @@ static TSTree *get_next_node(TSParser *parser, TSStateId lex_state) { TSTree *node; if ((node = break_down_right_stack(parser))) { - DEBUG_PARSE("reuse sym:%s", SYM_NAME(node->symbol)); + DEBUG("reuse sym:%s", SYM_NAME(node->symbol)); parser->lexer.lookahead = 0; parser->lexer.lookahead_size = 0; @@ -141,7 +141,7 @@ static TSTree *get_next_node(TSParser *parser, TSStateId lex_state) { ts_length_add(parser->lexer.current_position, ts_tree_total_size(node)); } else { node = parser->language->lex_fn(&parser->lexer, lex_state); - DEBUG_PARSE("lex sym:%s", SYM_NAME(node->symbol)); + DEBUG("lex sym:%s", SYM_NAME(node->symbol)); } return node; @@ -224,7 +224,7 @@ static int handle_error(TSParser *parser) { parser->language, state_after_error, parser->lookahead->symbol); if (action_after_error.type != TSParseActionTypeError) { - DEBUG_PARSE("recover state:%u", state_after_error); + DEBUG("recover state:%u", state_after_error); ts_stack_shrink(&parser->stack, entry - parser->stack.entries + 1); parser->lookahead->padding = ts_length_zero(); @@ -242,7 +242,7 @@ static int handle_error(TSParser *parser) { * current lookahead token, advance to the next token. If no characters * were consumed, advance the lexer to the next character. */ - DEBUG_PARSE("lex_again"); + DEBUG("skip_token"); TSLength prev_position = parser->lexer.current_position; if (parser->lookahead) ts_tree_release(parser->lookahead); @@ -253,8 +253,8 @@ static int handle_error(TSParser *parser) { * just skip it. If the end of input is reached, exit. */ if (ts_length_eq(parser->lexer.current_position, prev_position)) - if (!parser->lexer.advance_fn(&parser->lexer)) { - DEBUG_PARSE("fail_to_recover"); + if (!parser->lexer.advance_fn(&parser->lexer, 0)) { + DEBUG("fail_to_recover"); resize_error(parser, error); ts_stack_push(&parser->stack, 0, error); @@ -323,11 +323,11 @@ const TSTree *ts_parser_parse(TSParser *parser, TSInput input, TSLength position; if (edit) { - DEBUG_PARSE("edit pos:%lu inserted:%lu deleted:%lu", edit->position, + DEBUG("edit pos:%lu inserted:%lu deleted:%lu", edit->position, edit->chars_inserted, edit->chars_removed); position = break_down_left_stack(parser, *edit); } else { - DEBUG_PARSE("new_parse"); + DEBUG("new_parse"); ts_stack_shrink(&parser->stack, 0); position = ts_length_zero(); } @@ -346,37 +346,37 @@ const TSTree *ts_parser_parse(TSParser *parser, TSInput input, switch (action.type) { case TSParseActionTypeShift: if (parser->lookahead->symbol == ts_builtin_sym_error) { - DEBUG_PARSE("error"); + DEBUG("error"); if (!handle_error(parser)) return finish(parser); } else { - DEBUG_PARSE("shift state:%u", action.data.to_state); + DEBUG("shift state:%u", action.data.to_state); shift(parser, action.data.to_state); } break; case TSParseActionTypeShiftExtra: - DEBUG_PARSE("shift_extra"); + DEBUG("shift_extra"); shift_extra(parser); break; case TSParseActionTypeReduce: - DEBUG_PARSE("reduce sym:%s count:%u", SYM_NAME(action.data.symbol), + DEBUG("reduce sym:%s count:%u", SYM_NAME(action.data.symbol), action.data.child_count); reduce(parser, action.data.symbol, action.data.child_count); break; case TSParseActionTypeReduceExtra: - DEBUG_PARSE("reduce_extra sym:%s", SYM_NAME(action.data.symbol)); + DEBUG("reduce_extra sym:%s", SYM_NAME(action.data.symbol)); reduce_extra(parser, action.data.symbol); break; case TSParseActionTypeAccept: - DEBUG_PARSE("accept"); + DEBUG("accept"); return finish(parser); case TSParseActionTypeError: - DEBUG_PARSE("error"); + DEBUG("error"); if (!handle_error(parser)) return finish(parser); break;