Merge pull request #1581 from tlaplus-community/get-codepoint-column

`get_column` now counts codepoints instead of bytes
2022-01-12 10:49:44 -08:00 · 2022-01-12 10:49:44 -08:00 · 9df064c9fe
commit 9df064c9fe
parent bf210f0c9e e1ee261181
6 changed files with 232 additions and 14 deletions
--- a/docs/section-3-creating-parsers.md
+++ b/docs/section-3-creating-parsers.md
@ -674,7 +674,7 @@ This function is responsible for recognizing external tokens. It should return `
 * **`TSSymbol result_symbol`** - The symbol that was recognized. Your scan function should *assign* to this field one of the values from the `TokenType` enum, described above.
 * **`void (*advance)(TSLexer *, bool skip)`** - A function for advancing to the next character. If you pass `true` for the second argument, the current character will be treated as whitespace.
 * **`void (*mark_end)(TSLexer *)`** - A function for marking the end of the recognized token. This allows matching tokens that require multiple characters of lookahead. By default (if you don't call `mark_end`), any character that you moved past using the `advance` function will be included in the size of the token. But once you call `mark_end`, then any later calls to `advance` will *not* increase the size of the returned token. You can call `mark_end` multiple times to increase the size of the token.
-* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of bytes (not characters) since the start of the current line.
+* **`uint32_t (*get_column)(TSLexer *)`** - A function for querying the current column position of the lexer. It returns the number of codepoints since the start of the current line. The codepoint position is recalculated on every call to this function by reading from the start of the line.
 * **`bool (*is_at_included_range_start)(TSLexer *)`** - A function for checking if the parser has just skipped some characters in the document. When parsing an embedded document using the `ts_parser_set_included_ranges` function (described in the [multi-language document section][multi-language-section]), your scanner may want to apply some special behavior when moving to a disjoint part of the document. For example, in [EJS documents][ejs], the JavaScript parser uses this function to enable inserting automatic semicolon tokens in between the code directives, delimited by `<%` and `%>`.

 The third argument to the `scan` function is an array of booleans that indicates which of your external tokens are currently expected by the parser. You should only look for a given token if it is valid according to this array. At the same time, you cannot backtrack, so you may need to combine certain pieces of logic.
--- a/lib/src/lexer.c
+++ b/lib/src/lexer.c
@ -152,18 +152,8 @@ static void ts_lexer_goto(Lexer *self, Length position) {
  }
 }

-// Advance to the next character in the source code, retrieving a new
-// chunk of source code if needed.
-static void ts_lexer__advance(TSLexer *_self, bool skip) {
-  Lexer *self = (Lexer *)_self;
-  if (!self->chunk) return;
-
-  if (skip) {
-    LOG("skip", self->data.lookahead);
-  } else {
-    LOG("consume", self->data.lookahead);
-  }
-
+// Intended to be called only from functions that control logging.
+static void ts_lexer__do_advance(Lexer *self, bool skip) {
  if (self->lookahead_size) {
    self->current_position.bytes += self->lookahead_size;
    if (self->data.lookahead == '\n') {
@ -205,6 +195,21 @@ static void ts_lexer__advance(TSLexer *_self, bool skip) {
  }
 }

+// Advance to the next character in the source code, retrieving a new
+// chunk of source code if needed. Logs the step, then delegates to ts_lexer__do_advance.
+static void ts_lexer__advance(TSLexer *_self, bool skip) {
+  Lexer *self = (Lexer *)_self;
+  if (!self->chunk) return;  // returns immediately when there is no current chunk
+
+  if (skip) {  // the log label distinguishes skipped characters from consumed ones
+    LOG("skip", self->data.lookahead);
+  } else {
+    LOG("consume", self->data.lookahead);
+  }
+  
+  ts_lexer__do_advance(self, skip);  // the non-logging helper performs the position update
+}
+
 // Mark that a token match has completed. This can be called multiple
 // times if a longer match is found later.
 static void ts_lexer__mark_end(TSLexer *_self) {
@ -233,8 +238,25 @@ static void ts_lexer__mark_end(TSLexer *_self) {

 static uint32_t ts_lexer__get_column(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;
+  // Returns the count of advance steps (codepoints, per the docs) between the line start and the current position.
+  uint32_t goal_byte = self->current_position.bytes;  // byte offset to walk back up to
+  
  self->did_get_column = true;
-  return self->current_position.extent.column;
+  self->current_position.bytes -= self->current_position.extent.column;  // rewind: extent.column is used as a byte distance from the line start
+  self->current_position.extent.column = 0;
+
+  if (self->current_position.bytes < self->chunk_start) {  // rewound to before the loaded chunk
+    ts_lexer__get_chunk(self);  // presumably loads the chunk covering the rewound position - confirm
+  }
+
+  uint32_t result = 0;
+  ts_lexer__get_lookahead(self);  // presumably decodes the character at the rewound position - confirm
+  while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) {
+    ts_lexer__do_advance(self, false);  // non-logging advance, so the re-scan emits no log entries
+    result++;  // one per advance step taken
+  }
+
+  return result;
 }

 // Is the lexer at a boundary between two disjoint included ranges of
--- a/test/fixtures/test_grammars/external_unicode_column_alignment/README.md
+++ b/test/fixtures/test_grammars/external_unicode_column_alignment/README.md
@ -0,0 +1 @@
+This tests that `get_column` correctly counts codepoints since the start of the line.
--- a/test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt
+++ b/test/fixtures/test_grammars/external_unicode_column_alignment/corpus.txt
@ -0,0 +1,93 @@
+========================
+Single list, no boxes
+========================
+
+-
+-
+-
+
+----------------------
+
+(expression
+  (list
+    (list_item)
+    (list_item)
+    (list_item)
+  )
+)
+
+========================
+Two lists, no boxes
+========================
+
+ -
+ -
+ -
+  -
+  -
+
+----------------------
+
+(expression
+  (list
+    (list_item)
+    (list_item)
+    (list_item)
+  )
+  (list
+    (list_item)
+    (list_item)
+  )
+)
+
+========================
+List with boxes
+========================
+
+ -
+□-
+ -
+
+----------------------
+
+(expression
+  (list
+    (list_item)
+    (list_item)
+    (list_item)
+  )
+)
+
+========================
+Multiple lists with boxes
+========================
+
+   -
+□ □-
+ □ -
+□□□□□□-
+□ □ □ -
+      -
+□□□   -
+□□□-
+□ □-
+
+----------------------
+
+(expression
+  (list
+    (list_item)
+    (list_item)
+    (list_item)
+  )
+  (list
+    (list_item)
+    (list_item)
+    (list_item)
+    (list_item)
+  )
+  (list
+    (list_item)
+    (list_item)
+  )
+)
--- a/test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js
+++ b/test/fixtures/test_grammars/external_unicode_column_alignment/grammar.js
@ -0,0 +1,17 @@
+module.exports = grammar({  // test grammar: a document is a sequence of lists delimited by external tokens
+  name: "external_unicode_column_alignment",
+
+  externals: $ => [  // tokens supplied by the external scanner; order appears to mirror the enum in scanner.c - confirm
+    $._start_list,
+    $.list_item,
+    $._end_list
+  ],
+
+  extras: $ => [/\s/, '□'],  // whitespace and the box character U+25A1 are treated as extras
+
+  rules: {
+    expression: $ => repeat($.list),  // zero or more lists
+    
+    list: $ => seq($._start_list, repeat1($.list_item), $._end_list)  // start marker, one or more items, end marker
+  }
+})
--- a/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c
+++ b/test/fixtures/test_grammars/external_unicode_column_alignment/scanner.c
@ -0,0 +1,85 @@
+#include <tree_sitter/parser.h>
+#include <wctype.h>
+#include <string.h>
+
+enum {  // external token kinds that the scan function assigns to lexer->result_symbol
+  LIST_START,  // emitted at a '-' when no list is open; records that column
+  LIST_ITEM,  // emitted at a '-' whose column equals the open list's column
+  LIST_END  // emitted at a '-' in a different column, or at end of input while a list is open
+};
+
+typedef struct {  // scanner state passed to every callback as `payload`
+  int32_t column;  // column recorded when the current list started, or -1 when no list is open
+} Scanner;
+
+// Allocates the scanner state. The column starts at -1, the value the
+// scan function treats as "no list currently open".
+void *tree_sitter_external_unicode_column_alignment_external_scanner_create() {
+  Scanner *state = malloc(sizeof *state);
+  state->column = -1;
+  return state;
+}
+
+void tree_sitter_external_unicode_column_alignment_external_scanner_destroy(void *payload) {  // releases the scanner state
+  free(payload);  // payload is presumably the pointer returned by the create function - confirm
+}
+
+// Writes the scanner's column into `buffer` and returns the number of
+// bytes written, which is always the size of one int32_t.
+unsigned tree_sitter_external_unicode_column_alignment_external_scanner_serialize(
+  void *payload,
+  char *buffer
+) {
+  const Scanner *state = payload;
+  memcpy(buffer, &state->column, sizeof(int32_t));
+  return sizeof(int32_t);
+}
+
+// Restores the scanner state from `buffer`, which holds `length` bytes and
+// is expected to have been written by the matching serialize function.
+// The column is reset to -1 (no list open) unless the buffer contains a
+// complete int32_t column value.
+void tree_sitter_external_unicode_column_alignment_external_scanner_deserialize(
+  void *payload,
+  const char *buffer,
+  unsigned length
+) {
+  Scanner *scanner = payload;
+  scanner->column = -1;
+  // Copy only when a full column value is present; a shorter non-empty
+  // buffer would otherwise be read past its end.
+  if (length >= sizeof(scanner->column)) {
+    memcpy(&(scanner->column), buffer, sizeof(scanner->column));
+  }
+}
+
+// Recognizes list tokens by the column of each '-'.
+//
+// Leading whitespace and box characters are skipped. At a '-':
+//   - with no list open, emits LIST_START and records the column
+//     (the '-' itself is not consumed);
+//   - at the recorded column, consumes the '-' and emits LIST_ITEM;
+//   - at any other column, emits LIST_END and clears the recorded column.
+// At end of input with a list still open, emits LIST_END.
+// Returns false when no token was recognized. `whitelist` is not consulted.
+bool tree_sitter_external_unicode_column_alignment_external_scanner_scan(
+  void *payload,
+  TSLexer *lexer,
+  const bool *whitelist
+) {
+  Scanner *state = payload;
+
+  // U+25A1 is unicode codepoint □
+  for (;;) {
+    if (!iswspace(lexer->lookahead) && lexer->lookahead != 0x25A1) break;
+    lexer->advance(lexer, true);
+  }
+
+  if (lexer->lookahead == '-') {
+    const int32_t dash_column = lexer->get_column(lexer);
+
+    if (state->column == -1) {
+      lexer->result_symbol = LIST_START;
+      state->column = dash_column;
+      return true;
+    }
+
+    if (state->column == dash_column) {
+      lexer->result_symbol = LIST_ITEM;
+      lexer->advance(lexer, false);
+      return true;
+    }
+
+    lexer->result_symbol = LIST_END;
+    state->column = -1;
+    return true;
+  }
+
+  if (lexer->eof(lexer) && state->column != -1) {
+    lexer->result_symbol = LIST_END;
+    state->column = -1;
+    return true;
+  }
+
+  return false;
+}
				`@ -0,0 +1 @@`
				This tests that `get_column` correctly counts codepoints since start of line.