From e29d3714f7ee821bb717ad4222bf5280ec7a67a9 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Thu, 11 Mar 2021 11:25:10 -0800
Subject: [PATCH] Fix behavior of Lexer.get_column when at EOF

---
 lib/src/lexer.c                               | 115 +++++++--------
 .../uses_current_column/corpus.txt            |  76 ++++++++++
 .../uses_current_column/grammar.json          |  69 +++++++++
 .../uses_current_column/scanner.c             | 133 ++++++++++++++++++
 4 files changed, 337 insertions(+), 56 deletions(-)
 create mode 100644 test/fixtures/test_grammars/uses_current_column/corpus.txt
 create mode 100644 test/fixtures/test_grammars/uses_current_column/grammar.json
 create mode 100644 test/fixtures/test_grammars/uses_current_column/scanner.c

diff --git a/lib/src/lexer.c b/lib/src/lexer.c
index 08e90a8c..f349d76f 100644
--- a/lib/src/lexer.c
+++ b/lib/src/lexer.c
@@ -102,6 +102,56 @@ static void ts_lexer__get_lookahead(Lexer *self) {
   }
 }
 
+// Position the lexer at `position`, or at the start of the next included range
+// when `position` falls in a gap; past the last range, enter the EOF state.
+static void ts_lexer_goto(Lexer *self, Length position) {
+  self->current_position = position;
+  bool found_included_range = false;
+  // Move to the first valid position at or after the given position.
+  for (unsigned i = 0; i < self->included_range_count; i++) {
+    TSRange *included_range = &self->included_ranges[i];
+    if (included_range->end_byte > position.bytes) {
+      if (included_range->start_byte > position.bytes) {
+        self->current_position = (Length) {
+          .bytes = included_range->start_byte,
+          .extent = included_range->start_point,
+        };
+      }
+      self->current_included_range_index = i;
+      found_included_range = true;
+      break;
+    }
+  }
+
+  if (found_included_range) {
+    // The position actually adopted may be a range start beyond `position`,
+    // so test that one: drop the cached chunk if it does not contain it.
+    if (self->chunk && (
+      self->current_position.bytes < self->chunk_start ||
+      self->current_position.bytes >= self->chunk_start + self->chunk_size
+    )) {
+      ts_lexer__clear_chunk(self);
+    }
+
+    self->lookahead_size = 0;
+    self->data.lookahead = '\0';
+  }
+
+  // Otherwise `position` lies beyond every included range: enter the EOF state at
+  // the end of the last range (assumes at least one included range exists).
+  else {
+    self->current_included_range_index = self->included_range_count;
+    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
+    self->current_position = (Length) {
+      .bytes = last_included_range->end_byte,
+      .extent = last_included_range->end_point,
+    };
+    ts_lexer__clear_chunk(self);
+    self->lookahead_size = 1;
+    self->data.lookahead = '\0';
+  }
+}
+
 // Advance to the next character in the source code, retrieving a new
 // chunk of source code if needed.
 static void ts_lexer__advance(TSLexer *_self, bool skip) {
@@ -185,12 +235,15 @@ static uint32_t ts_lexer__get_column(TSLexer *_self) {
   Lexer *self = (Lexer *)_self;
   uint32_t goal_byte = self->current_position.bytes;
 
-  self->current_position.bytes -= self->current_position.extent.column;
-  self->current_position.extent.column = 0;
-
-  if (self->current_position.bytes < self->chunk_start) {
-    ts_lexer__get_chunk(self);
-  }
+  ts_lexer_goto(self, (Length) {
+    .bytes = self->current_position.bytes - self->current_position.extent.column,
+    .extent = {
+      .row = self->current_position.extent.row,
+      .column = 0,
+    }
+  });
+  if (!self->chunk_size) ts_lexer__get_chunk(self);
+  if (!self->lookahead_size) ts_lexer__get_lookahead(self);
 
   uint32_t result = 0;
   while (self->current_position.bytes < goal_byte) {
@@ -247,56 +300,6 @@ void ts_lexer_delete(Lexer *self) {
   ts_free(self->included_ranges);
 }
 
-static void ts_lexer_goto(Lexer *self, Length position) {
-  self->current_position = position;
-  bool found_included_range = false;
-
-  // Move to the first valid position at or after the given position.
-  for (unsigned i = 0; i < self->included_range_count; i++) {
-    TSRange *included_range = &self->included_ranges[i];
-    if (included_range->end_byte > position.bytes) {
-      if (included_range->start_byte > position.bytes) {
-        self->current_position = (Length) {
-          .bytes = included_range->start_byte,
-          .extent = included_range->start_point,
-        };
-      }
-
-      self->current_included_range_index = i;
-      found_included_range = true;
-      break;
-    }
-  }
-
-  if (found_included_range) {
-    // If the current position is outside of the current chunk of text,
-    // then clear out the current chunk of text.
-    if (self->chunk && (
-      position.bytes < self->chunk_start ||
-      position.bytes >= self->chunk_start + self->chunk_size
-    )) {
-      ts_lexer__clear_chunk(self);
-    }
-
-    self->lookahead_size = 0;
-    self->data.lookahead = '\0';
-  }
-
-  // If the given position is beyond any of included ranges, move to the EOF
-  // state - past the end of the included ranges.
-  else {
-    self->current_included_range_index = self->included_range_count;
-    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
-    self->current_position = (Length) {
-      .bytes = last_included_range->end_byte,
-      .extent = last_included_range->end_point,
-    };
-    ts_lexer__clear_chunk(self);
-    self->lookahead_size = 1;
-    self->data.lookahead = '\0';
-  }
-}
-
 void ts_lexer_set_input(Lexer *self, TSInput input) {
   self->input = input;
   ts_lexer__clear_chunk(self);
diff --git a/test/fixtures/test_grammars/uses_current_column/corpus.txt b/test/fixtures/test_grammars/uses_current_column/corpus.txt
new file mode 100644
index 00000000..9638e25e
--- /dev/null
+++ b/test/fixtures/test_grammars/uses_current_column/corpus.txt
@@ -0,0 +1,76 @@
+===============
+Simple blocks
+===============
+
+do a
+   e
+f
+
+---
+
+(block
+  (do_expression (block
+    (identifier)
+    (identifier)))
+  (identifier))
+
+=====================
+Nested blocks
+=====================
+
+a = do b
+       c + do e
+              f
+              g
+       h
+i
+
+---
+
+(block
+  (binary_expression
+    (identifier)
+    (do_expression (block
+      (identifier)
+      (binary_expression
+        (identifier)
+        (do_expression (block
+          (identifier)
+          (identifier)
+          (identifier))))
+      (identifier))))
+  (identifier))
+
+===============================
+Blocks with leading newlines
+===============================
+
+do
+
+
+   a = b
+   do
+      c
+      d
+   e
+   f
+
+---
+
+(block
+  (do_expression (block
+    (binary_expression (identifier) (identifier))
+    (do_expression (block
+      (identifier)
+      (identifier)))
+    (identifier)
+    (identifier))))
+
+=====================
+Unterminated blocks
+=====================
+
+do
+---
+
+(ERROR)
diff --git a/test/fixtures/test_grammars/uses_current_column/grammar.json b/test/fixtures/test_grammars/uses_current_column/grammar.json
new file mode 100644
index 00000000..90c740b6
--- /dev/null
+++ b/test/fixtures/test_grammars/uses_current_column/grammar.json
@@ -0,0 +1,69 @@
+{
+  "name": "uses_current_column",
+
+  "externals": [
+    {"type": "SYMBOL", "name": "_indent"},
+    {"type": "SYMBOL", "name": "_dedent"},
+    {"type": "SYMBOL", "name": "_newline"}
+  ],
+
+  "extras": [
+    {"type": "PATTERN", "value": "\\s"}
+  ],
+
+  "rules": {
+    "block": {
+      "type": "REPEAT1",
+      "content": {"type": "SYMBOL", "name": "_statement"}
+    },
+
+    "_statement": {
+      "type": "SEQ",
+      "members": [
+        {"type": "SYMBOL", "name": "_expression"},
+        {"type": "SYMBOL", "name": "_newline"}
+      ]
+    },
+
+    "_expression": {
+      "type": "CHOICE",
+      "members": [
+        {"type": "SYMBOL", "name": "do_expression"},
+        {"type": "SYMBOL", "name": "binary_expression"},
+        {"type": "SYMBOL", "name": "identifier"}
+      ]
+    },
+
+    "do_expression": {
+      "type": "SEQ",
+      "members": [
+        {"type": "STRING", "value": "do"},
+        {"type": "SYMBOL", "name": "_indent"},
+        {"type": "SYMBOL", "name": "block"},
+        {"type": "SYMBOL", "name": "_dedent"}
+      ]
+    },
+
+    "binary_expression": {
+      "type": "PREC_LEFT",
+      "value": 1,
+      "content": {
+        "type": "SEQ",
+        "members": [
+          {"type": "SYMBOL", "name": "_expression"},
+          {
+            "type": "CHOICE",
+            "members": [
+              {"type": "STRING", "value": "="},
+              {"type": "STRING", "value": "+"},
+              {"type": "STRING", "value": "-"}
+            ]
+          },
+          {"type": "SYMBOL", "name": "_expression"}
+        ]
+      }
+    },
+
+    "identifier": {"type": "PATTERN", "value": "\\w+"}
+  }
+}
diff --git a/test/fixtures/test_grammars/uses_current_column/scanner.c b/test/fixtures/test_grammars/uses_current_column/scanner.c
new file mode 100644
index 00000000..efd27f9f
--- /dev/null
+++ b/test/fixtures/test_grammars/uses_current_column/scanner.c
@@ -0,0 +1,133 @@
+#include <stdlib.h>
+#include <wctype.h>
+#include <tree_sitter/parser.h>
+
+enum TokenType {  // external tokens, in the same order as "externals" in grammar.json
+  INDENT,   // _indent
+  DEDENT,   // _dedent
+  NEWLINE,  // _newline
+};
+
+typedef struct {
+  uint8_t queued_dedent_count;  // dedents detected by a scan but not yet returned
+  uint8_t indent_count;         // number of entries currently in use in `indents`
+  int8_t indents[32];           // stack of indent columns; entry 0 is initialized to 0
+} Scanner;
+
+void *tree_sitter_uses_current_column_external_scanner_create() {
+  Scanner *scanner = malloc(sizeof(Scanner));  // heap-allocated scanner state
+  scanner->indents[0] = 0;                     // base indent level: column 0
+  scanner->indent_count = 1;                   // only the base level is on the stack
+  scanner->queued_dedent_count = 0;            // nothing pending yet
+  return scanner;
+}
+
+void tree_sitter_uses_current_column_external_scanner_destroy(void *payload) {
+  free(payload);  // presumably the Scanner returned by the create function — confirm with caller
+}
+
+// Save the scanner state into buffer: the queued dedent count in byte 0,
+// then one byte per indent stack entry. Returns the number of bytes written.
+unsigned tree_sitter_uses_current_column_external_scanner_serialize(void *payload, char *buffer) {
+  const Scanner *scanner = payload;
+  unsigned length = 0;
+  buffer[length++] = scanner->queued_dedent_count;
+  for (; length <= scanner->indent_count; length++) {
+    buffer[length] = scanner->indents[length - 1];
+  }
+  return length;
+}
+
+// Restore state from buffer (byte 0: queued dedent count, then the indent stack).
+// An empty buffer resets the scanner to one indent level at column 0.
+void tree_sitter_uses_current_column_external_scanner_deserialize(
+  void *payload, const char *buffer, unsigned length
+) {
+  Scanner *scanner = payload;
+  if (length == 0) {
+    scanner->indents[0] = 0;
+    scanner->indent_count = 1;
+    scanner->queued_dedent_count = 0;
+    return;
+  }
+  scanner->queued_dedent_count = buffer[0];
+  scanner->indent_count = length - 1;
+  for (unsigned n = 0; n < scanner->indent_count; n++) {
+    scanner->indents[n] = buffer[n + 1];
+  }
+}
+
+// Scan for one external token (INDENT, DEDENT or NEWLINE) permitted by
+// valid_symbols; on success set lexer->result_symbol and return true.
+bool tree_sitter_uses_current_column_external_scanner_scan(
+  void *payload, TSLexer *lexer, const bool *valid_symbols
+) {
+  Scanner *self = (Scanner *)payload;
+  lexer->mark_end(lexer);  // mark the token end before any lookahead is consumed
+
+  // If dedents were found in a previous run, and are valid now,
+  // then return a dedent.
+  if (self->queued_dedent_count > 0 && valid_symbols[DEDENT]) {
+    lexer->result_symbol = DEDENT;
+    self->queued_dedent_count--;
+    return true;
+  }
+
+  // If an indent is valid, then add an entry to the indent stack
+  // for the current column, and return an indent.
+  if (valid_symbols[INDENT]) {
+    while (iswspace(lexer->lookahead)) {
+      lexer->advance(lexer, false);
+    }
+    uint32_t column = lexer->get_column(lexer);
+    if (column <= self->indents[self->indent_count - 1]) {
+      return false;
+    }
+    // Push the column itself; the dedent check below compares against it.
+    self->indents[self->indent_count++] = column;
+    lexer->result_symbol = INDENT;
+    return true;
+  }
+
+  // If at the end of a statement, then get the current indent
+  // level and pop some number of entries off of the indent stack.
+  if (valid_symbols[NEWLINE] || valid_symbols[DEDENT]) {
+    while (lexer->lookahead == ' ') {
+      lexer->advance(lexer, false);
+    }
+
+    if (lexer->lookahead == '\n') {
+      lexer->advance(lexer, false);
+
+      uint32_t next_column = 0;  // spaces seen since the most recent newline
+      for (;;) {
+        if (lexer->lookahead == ' ') {
+          next_column++;
+          lexer->advance(lexer, false);
+        } else if (lexer->lookahead == '\n') {
+          next_column = 0;
+          lexer->advance(lexer, false);
+        } else {
+          break;
+        }
+      }
+
+      unsigned dedent_count = 0;
+      while (next_column < self->indents[self->indent_count - 1]) {
+        dedent_count++;
+        self->indent_count--;
+      }
+
+      if (dedent_count > 0 && valid_symbols[DEDENT]) {
+        lexer->result_symbol = DEDENT;
+        return true;
+      } else if (valid_symbols[NEWLINE]) {
+        self->queued_dedent_count += dedent_count;
+        lexer->result_symbol = NEWLINE;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}