From 75aa295b6655ead644cd17bc7002a469d475257f Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Mon, 27 Sep 2021 16:52:14 -0400 Subject: [PATCH 1/9] get_column now counts codepoints --- lib/src/lexer.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index 033f93e9..aef92f19 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -233,10 +233,28 @@ static void ts_lexer__mark_end(TSLexer *_self) { static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; + + uint32_t goal_byte = self->current_position.bytes; + self->did_get_column = true; - return self->current_position.extent.column; + self->current_position.bytes -= self->current_position.extent.column; + self->current_position.extent.column = 0; + + if (self->current_position.bytes < self->chunk_start) { + ts_lexer__get_chunk(self); + } + + uint32_t result = 0; + ts_lexer__get_lookahead(_self); + while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self)) { + ts_lexer__advance(_self, false); + result++; + } + + return result; } + // Is the lexer at a boundary between two disjoint included ranges of // source code? This is exposed as an API because some languages' external // scanners need to perform custom actions at these boundaries. 
From 0a52e90b01528d08a8f64a806e1f07cd550091dc Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Mon, 27 Sep 2021 17:04:22 -0400 Subject: [PATCH 2/9] Fixed pointer type --- lib/src/lexer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index aef92f19..c94a80b4 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -245,7 +245,7 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { } uint32_t result = 0; - ts_lexer__get_lookahead(_self); + ts_lexer__get_lookahead(self); while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self)) { ts_lexer__advance(_self, false); result++; From ace81f6267daf9580f365270b7779decf0a7093d Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Wed, 13 Oct 2021 15:47:53 -0400 Subject: [PATCH 3/9] Don't log when counting codepoints --- lib/src/lexer.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index c94a80b4..e72297fe 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -152,18 +152,8 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. -static void ts_lexer__advance(TSLexer *_self, bool skip) { - Lexer *self = (Lexer *)_self; - if (!self->chunk) return; - - if (skip) { - LOG("skip", self->data.lookahead); - } else { - LOG("consume", self->data.lookahead); - } - +// Intended to be called only from functions below that control logging. +static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; if (self->data.lookahead == '\n') { @@ -205,6 +195,27 @@ static void ts_lexer__advance(TSLexer *_self, bool skip) { } } +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. 
+static void ts_lexer__advance(TSLexer *_self, bool skip) { + Lexer *self = (Lexer *)_self; + if (!self->chunk) return; + + if (skip) { + LOG("skip", self->data.lookahead); + } else { + LOG("consume", self->data.lookahead); + } + + ts_lexer__do_advance(self, skip); +} + +// Advance without logging. +static void ts_lexer__advance_no_log(Lexer *self, bool skip) { + if (!self->chunk) return; + ts_lexer__do_advance(self, skip); +} + // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. static void ts_lexer__mark_end(TSLexer *_self) { @@ -247,7 +258,7 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { uint32_t result = 0; ts_lexer__get_lookahead(self); while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self)) { - ts_lexer__advance(_self, false); + ts_lexer__advance_no_log(self, false); result++; } From bfb692d2f7ca64fe6943e4b8454cc133b49a0977 Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Fri, 7 Jan 2022 10:16:20 -0500 Subject: [PATCH 4/9] Improve diff --- lib/src/lexer.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index e72297fe..b90bba1b 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -152,6 +152,27 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +static void ts_lexer__advance(TSLexer *_self, bool skip) { + Lexer *self = (Lexer *)_self; + if (!self->chunk) return; + + if (skip) { + LOG("skip", self->data.lookahead); + } else { + LOG("consume", self->data.lookahead); + } + + ts_lexer__do_advance(self, skip); +} + +// Advance without logging. +static void ts_lexer__advance_no_log(Lexer *self, bool skip) { + if (!self->chunk) return; + ts_lexer__do_advance(self, skip); +} + // Intended to be called only from functions below that control logging. 
static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { @@ -195,27 +216,6 @@ static void ts_lexer__do_advance(Lexer *self, bool skip) { } } -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. -static void ts_lexer__advance(TSLexer *_self, bool skip) { - Lexer *self = (Lexer *)_self; - if (!self->chunk) return; - - if (skip) { - LOG("skip", self->data.lookahead); - } else { - LOG("consume", self->data.lookahead); - } - - ts_lexer__do_advance(self, skip); -} - -// Advance without logging. -static void ts_lexer__advance_no_log(Lexer *self, bool skip) { - if (!self->chunk) return; - ts_lexer__do_advance(self, skip); -} - // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. static void ts_lexer__mark_end(TSLexer *_self) { @@ -265,7 +265,6 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { return result; } - // Is the lexer at a boundary between two disjoint included ranges of // source code? This is exposed as an API because some languages' external // scanners need to perform custom actions at these boundaries. From 3ab6d1b937e5106e90d0967c922765618f8896fd Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Fri, 7 Jan 2022 10:17:53 -0500 Subject: [PATCH 5/9] Improve diff further --- lib/src/lexer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/src/lexer.c b/lib/src/lexer.c index b90bba1b..0f94b309 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -152,6 +152,12 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } +// Advance without logging. +static void ts_lexer__advance_no_log(Lexer *self, bool skip) { + if (!self->chunk) return; + ts_lexer__do_advance(self, skip); +} + // Advance to the next character in the source code, retrieving a new // chunk of source code if needed. 
static void ts_lexer__advance(TSLexer *_self, bool skip) { @@ -167,12 +173,6 @@ static void ts_lexer__advance(TSLexer *_self, bool skip) { ts_lexer__do_advance(self, skip); } -// Advance without logging. -static void ts_lexer__advance_no_log(Lexer *self, bool skip) { - if (!self->chunk) return; - ts_lexer__do_advance(self, skip); -} - // Intended to be called only from functions below that control logging. static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { From 80c34d62ab1de5d0d7faf45919bca9341f9b1521 Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Fri, 7 Jan 2022 10:36:25 -0500 Subject: [PATCH 6/9] Fixed rust build, updated docs --- docs/section-3-creating-parsers.md | 2 +- lib/src/lexer.c | 44 +++++++++++++++--------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/section-3-creating-parsers.md b/docs/section-3-creating-parsers.md index f5f7c933..05824e22 100644 --- a/docs/section-3-creating-parsers.md +++ b/docs/section-3-creating-parsers.md @@ -674,7 +674,7 @@ This function is responsible for recognizing external tokens. It should return ` * **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above. * **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace. * **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. 
You can call `mark_end` multiple times to increase the size of the token. -* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of bytes (not characters) since the start of the current line. +* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this function by reading from the start of the line, so the cost of each call grows with the length of the line up to the current position. * **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`. The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic. diff --git a/lib/src/lexer.c b/lib/src/lexer.c index 0f94b309..57dc55d5 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -152,28 +152,7 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } -// Advance without logging. -static void ts_lexer__advance_no_log(Lexer *self, bool skip) { - if (!self->chunk) return; - ts_lexer__do_advance(self, skip); -} - -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. 
-static void ts_lexer__advance(TSLexer *_self, bool skip) { - Lexer *self = (Lexer *)_self; - if (!self->chunk) return; - - if (skip) { - LOG("skip", self->data.lookahead); - } else { - LOG("consume", self->data.lookahead); - } - - ts_lexer__do_advance(self, skip); -} - -// Intended to be called only from functions below that control logging. +// Intended to be called only from functions that control logging. static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; @@ -216,6 +195,27 @@ static void ts_lexer__do_advance(Lexer *self, bool skip) { } } +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +static void ts_lexer__advance(TSLexer *_self, bool skip) { + Lexer *self = (Lexer *)_self; + if (!self->chunk) return; + + if (skip) { + LOG("skip", self->data.lookahead); + } else { + LOG("consume", self->data.lookahead); + } + + ts_lexer__do_advance(self, skip); +} + +// Advance without logging. +static void ts_lexer__advance_no_log(Lexer *self, bool skip) { + if (!self->chunk) return; + ts_lexer__do_advance(self, skip); +} + // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. 
static void ts_lexer__mark_end(TSLexer *_self) { From 5a6530a413d1dd780db24e84f804d5197110d3c7 Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Tue, 11 Jan 2022 12:05:37 -0500 Subject: [PATCH 7/9] Added tests --- lib/src/lexer.c | 10 +- .../README.md | 1 + .../corpus.txt | 93 +++++++++++++++++++ .../grammar.js | 17 ++++ .../scanner.c | 83 +++++++++++++++++ 5 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 test/fixtures/test_grammars/external_unicode_column_alignment/README.md create mode 100644 test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt create mode 100644 test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js create mode 100644 test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c diff --git a/lib/src/lexer.c b/lib/src/lexer.c index 57dc55d5..f2c10fbd 100644 --- a/lib/src/lexer.c +++ b/lib/src/lexer.c @@ -210,12 +210,6 @@ static void ts_lexer__advance(TSLexer *_self, bool skip) { ts_lexer__do_advance(self, skip); } -// Advance without logging. -static void ts_lexer__advance_no_log(Lexer *self, bool skip) { - if (!self->chunk) return; - ts_lexer__do_advance(self, skip); -} - // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. 
static void ts_lexer__mark_end(TSLexer *_self) { @@ -257,8 +251,8 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) { uint32_t result = 0; ts_lexer__get_lookahead(self); - while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self)) { - ts_lexer__advance_no_log(self, false); + while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) { + ts_lexer__do_advance(self, false); result++; } diff --git a/test/fixtures/test_grammars/external_unicode_column_alignment/README.md b/test/fixtures/test_grammars/external_unicode_column_alignment/README.md new file mode 100644 index 00000000..8fe141d2 --- /dev/null +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/README.md @@ -0,0 +1 @@ +This tests that `get_column` correctly counts codepoints since start of line. \ No newline at end of file diff --git a/test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt b/test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt new file mode 100644 index 00000000..de7a5f24 --- /dev/null +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt @@ -0,0 +1,93 @@ +======================== +Single list, no boxes +======================== + +- +- +- + +---------------------- + +(expression + (list + (list_item) + (list_item) + (list_item) + ) +) + +======================== +Two lists, no boxes +======================== + + - + - + - + - + - + +---------------------- + +(expression + (list + (list_item) + (list_item) + (list_item) + ) + (list + (list_item) + (list_item) + ) +) + +======================== +List with boxes +======================== + + - +□- + - + +---------------------- + +(expression + (list + (list_item) + (list_item) + (list_item) + ) +) + +======================== +Multiple lists with boxes +======================== + + - +□ □- + □ - +□□□□□□- +□ □ □ - + - +□□□ - +□□□- +□ □- + +---------------------- + +(expression + (list + (list_item) + (list_item) + 
(list_item) + ) + (list + (list_item) + (list_item) + (list_item) + (list_item) + ) + (list + (list_item) + (list_item) + ) +) diff --git a/test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js b/test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js new file mode 100644 index 00000000..3016b31d --- /dev/null +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js @@ -0,0 +1,17 @@ +module.exports = grammar({ + name: "external_unicode_column_alignment", + + externals: $ => [ + $._start_list, + $.list_item, + $._end_list + ], + + extras: $ => [/\s/, '□'], + + rules: { + expression: $ => repeat($.list), + + list: $ => seq($._start_list, repeat1($.list_item), $._end_list) + } +}) diff --git a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c new file mode 100644 index 00000000..13d9f9db --- /dev/null +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c @@ -0,0 +1,83 @@ +#include + +enum { + LIST_START, + LIST_ITEM, + LIST_END +}; + +typedef struct { + int32_t column; +} Scanner; + +void *tree_sitter_external_unicode_column_alignment_external_scanner_create() { + Scanner *scanner = malloc(sizeof(Scanner)); + *scanner = (Scanner){ + .column = -1 + }; + return scanner; +} + +void tree_sitter_external_unicode_column_alignment_external_scanner_destroy(void *payload) { + free(payload); +} + +unsigned tree_sitter_external_unicode_column_alignment_external_scanner_serialize( + void *payload, + char *buffer +) { + Scanner *scanner = payload; + unsigned copied = sizeof(int32_t); + memcpy(buffer, &(scanner->column), copied); + return copied; +} + +void tree_sitter_external_unicode_column_alignment_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length +) { + Scanner *scanner = payload; + scanner->column = -1; + if (length > 0) { + memcpy(&(scanner->column), buffer, 
sizeof(int32_t)); + } +} + +bool tree_sitter_external_unicode_column_alignment_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *whitelist +) { + Scanner *scanner = payload; + // 9633 is the int equivalent of □ (U+25A1) + while (iswspace(lexer->lookahead) || 9633 == lexer->lookahead) { + lexer->advance(lexer, true); + } + if ('-' == lexer->lookahead) { + const int32_t column = lexer->get_column(lexer); + if (-1 == scanner->column) { + lexer->result_symbol = LIST_START; + scanner->column = column; + return true; + } else { + if (column == scanner->column) { + lexer->result_symbol = LIST_ITEM; + lexer->advance(lexer, false); + return true; + } else { + lexer->result_symbol = LIST_END; + scanner->column = -1; + return true; + } + } + } + + if (lexer->eof(lexer) && -1 != scanner->column) { + lexer->result_symbol = LIST_END; + scanner->column = -1; + return true; + } + + return false; +} From 69ff091a87671f25fe822b5120eedb8d88d32eb1 Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Tue, 11 Jan 2022 12:31:41 -0500 Subject: [PATCH 8/9] Added includes for macos --- .../test_grammars/external_unicode_column_alignment/scanner.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c index 13d9f9db..86fb270b 100644 --- a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c @@ -1,4 +1,6 @@ #include +#include +#include enum { LIST_START, From e1ee261181238f3648717f753b1f98f787fc836f Mon Sep 17 00:00:00 2001 From: Andrew Helwer Date: Tue, 11 Jan 2022 19:15:36 -0500 Subject: [PATCH 9/9] Changed decimal unicode codepoint to hex --- .../test_grammars/external_unicode_column_alignment/scanner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c index 86fb270b..a9e98735 100644 --- a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c +++ b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c @@ -52,8 +52,8 @@ bool tree_sitter_external_unicode_column_alignment_external_scanner_scan( const bool *whitelist ) { Scanner *scanner = payload; - // 9633 is the int equivalent of □ (U+25A1) - while (iswspace(lexer->lookahead) || 9633 == lexer->lookahead) { + // U+25A1 is the Unicode codepoint for □ (WHITE SQUARE) + while (iswspace(lexer->lookahead) || 0x25A1 == lexer->lookahead) { lexer->advance(lexer, true); } if ('-' == lexer->lookahead) {