Fix parse error when reusing a node at the end of an included range

This commit is contained in:
Max Brunsfeld 2022-11-11 16:34:57 -08:00
parent 618146260e
commit d07f864815
2 changed files with 46 additions and 25 deletions

View file

@ -951,7 +951,9 @@ fn test_parsing_with_included_range_containing_mismatched_positions() {
parser.set_included_ranges(&[range_to_parse]).unwrap();
let html_tree = parser.parse(source_code, None).unwrap();
let html_tree = parser
.parse_with(&mut chunked_input(source_code, 3), None)
.unwrap();
assert_eq!(html_tree.root_node().range(), range_to_parse);
@ -1078,7 +1080,9 @@ fn test_parsing_with_a_newly_excluded_range() {
// Parse HTML including the template directive, which will cause an error
let mut parser = Parser::new();
parser.set_language(get_language("html")).unwrap();
let mut first_tree = parser.parse(&source_code, None).unwrap();
let mut first_tree = parser
.parse_with(&mut chunked_input(&source_code, 3), None)
.unwrap();
// Insert code at the beginning of the document.
let prefix = "a very very long line of plain text. ";
@ -1113,7 +1117,9 @@ fn test_parsing_with_a_newly_excluded_range() {
},
])
.unwrap();
let tree = parser.parse(&source_code, Some(&first_tree)).unwrap();
let tree = parser
.parse_with(&mut chunked_input(&source_code, 3), Some(&first_tree))
.unwrap();
assert_eq!(
tree.root_node().to_sexp(),
@ -1164,7 +1170,9 @@ fn test_parsing_with_a_newly_included_range() {
parser
.set_included_ranges(&[simple_range(range1_start, range1_end)])
.unwrap();
let tree = parser.parse(source_code, None).unwrap();
let tree = parser
.parse_with(&mut chunked_input(&source_code, 3), None)
.unwrap();
assert_eq!(
tree.root_node().to_sexp(),
concat!(
@ -1181,7 +1189,9 @@ fn test_parsing_with_a_newly_included_range() {
simple_range(range3_start, range3_end),
])
.unwrap();
let tree2 = parser.parse(&source_code, Some(&tree)).unwrap();
let tree2 = parser
.parse_with(&mut chunked_input(&source_code, 3), Some(&tree))
.unwrap();
assert_eq!(
tree2.root_node().to_sexp(),
concat!(
@ -1289,3 +1299,7 @@ fn simple_range(start: usize, end: usize) -> Range {
end_point: Point::new(0, end),
}
}
/// Builds a tree-sitter input callback that serves `text` in chunks of at
/// most `size` bytes, starting at the requested byte `offset`.
///
/// The `Point` argument supplied by the parser is ignored; chunking is purely
/// byte-offset based. Returns an empty slice once `offset` reaches (or
/// passes) the end of the text, which signals end-of-input to the parser.
///
/// Slices `text.as_bytes()` rather than the `str` itself, so a chunk boundary
/// landing inside a multi-byte UTF-8 character cannot panic, and clamps both
/// ends of the range so an out-of-range `offset` yields an empty slice
/// instead of panicking.
fn chunked_input<'a>(text: &'a str, size: usize) -> impl FnMut(usize, Point) -> &'a [u8] {
    let bytes = text.as_bytes();
    move |offset, _| {
        let start = offset.min(bytes.len());
        let end = (offset + size).min(bytes.len());
        &bytes[start..end]
    }
}

View file

@ -104,13 +104,16 @@ static void ts_lexer__get_lookahead(Lexer *self) {
static void ts_lexer_goto(Lexer *self, Length position) {
self->current_position = position;
bool found_included_range = false;
// Move to the first valid position at or after the given position.
bool found_included_range = false;
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *included_range = &self->included_ranges[i];
if (included_range->end_byte > position.bytes) {
if (included_range->start_byte >= position.bytes) {
if (
included_range->end_byte > self->current_position.bytes &&
included_range->end_byte > included_range->start_byte
) {
if (included_range->start_byte >= self->current_position.bytes) {
self->current_position = (Length) {
.bytes = included_range->start_byte,
.extent = included_range->start_point,
@ -127,8 +130,8 @@ static void ts_lexer_goto(Lexer *self, Length position) {
// If the current position is outside of the current chunk of text,
// then clear out the current chunk of text.
if (self->chunk && (
position.bytes < self->chunk_start ||
position.bytes >= self->chunk_start + self->chunk_size
self->current_position.bytes < self->chunk_start ||
self->current_position.bytes >= self->chunk_start + self->chunk_size
)) {
ts_lexer__clear_chunk(self);
}
@ -164,27 +167,31 @@ static void ts_lexer__do_advance(Lexer *self, bool skip) {
}
}
const TSRange *current_range = NULL;
if (self->current_included_range_index < self->included_range_count) {
current_range = &self->included_ranges[self->current_included_range_index];
if (self->current_position.bytes == current_range->end_byte) {
self->current_included_range_index++;
if (self->current_included_range_index < self->included_range_count) {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
} else {
current_range = NULL;
}
const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
while (
self->current_position.bytes >= current_range->end_byte ||
current_range->end_byte == current_range->start_byte
) {
self->current_included_range_index++;
if (self->current_included_range_index < self->included_range_count) {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
} else {
current_range = NULL;
break;
}
}
if (skip) self->token_start_position = self->current_position;
if (current_range) {
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
if (
self->current_position.bytes < self->chunk_start ||
self->current_position.bytes >= self->chunk_start + self->chunk_size
) {
ts_lexer__get_chunk(self);
}
ts_lexer__get_lookahead(self);