From 8e1dbb4617cffbb79e05c446c4bda7eb7922d6cc Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Sun, 8 Sep 2024 14:26:34 -0400 Subject: [PATCH] fix: properly handle utf8 code points for highlight and tag assertions (cherry picked from commit 6f050f0da51db6a97a8e59210a35d8d9feb541e6) --- Cargo.lock | 57 +++++++++++++++++++--------- Cargo.toml | 1 + cli/Cargo.toml | 1 + cli/src/query.rs | 10 ++--- cli/src/query_testing.rs | 51 ++++++++++++++++++++++--- cli/src/test_highlight.rs | 11 ++++-- cli/src/test_tags.rs | 11 ++++-- cli/src/tests/test_highlight_test.rs | 26 ++++++++----- cli/src/tests/test_tags_test.rs | 16 ++++---- 9 files changed, 131 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d6182c3..b1d35340 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,7 +99,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", @@ -125,6 +125,17 @@ dependencies = [ "objc2", ] +[[package]] +name = "bstr" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -145,9 +156,9 @@ checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "cc" -version = "1.1.19" +version = "1.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d74707dde2ba56f86ae90effb3b43ddd369504387e718014de010cec7959800" +checksum = "45bcde016d64c21da4be18b655631e5ab6d3107607e71a73a9f53eb48aae23fb" dependencies = [ "jobserver", "libc", @@ -391,7 +402,7 @@ dependencies = [ "cranelift-codegen", "cranelift-entity", "cranelift-frontend", - "itertools", + "itertools 0.12.1", "log", "smallvec", "wasmparser", @@ -677,6 +688,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -914,9 +934,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.3" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "crc32fast", "hashbrown 0.14.5", @@ -1036,9 +1056,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874" +checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" dependencies = [ "cc", ] @@ -1084,9 +1104,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" dependencies = [ "bitflags", ] @@ -1152,9 +1172,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ "bitflags", "errno", @@ -1279,9 +1299,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.76" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -1386,9 +1406,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.20" +version = "0.22.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583c44c02ad26b0c3f3066fe629275e50627026c51ac2e595cca4c230ce1ce1d" +checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" dependencies = [ "indexmap", "serde", @@ -1446,6 +1466,7 @@ version = "0.23.0" dependencies = [ "anstyle", "anyhow", + "bstr", "clap", "ctor", "ctrlc", @@ -1558,9 +1579,9 @@ checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" diff --git a/Cargo.toml b/Cargo.toml index 1c95c66a..7594e80b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ strip = false [workspace.dependencies] anstyle = "1.0.8" anyhow = "1.0.89" +bstr = "1.10.0" cc = "1.1.19" clap = { version = "4.5.17", features = [ "cargo", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 493503ce..34fa31db 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -27,6 +27,7 @@ wasm = ["tree-sitter/wasm", "tree-sitter-loader/wasm"] [dependencies] anstyle.workspace = true anyhow.workspace = true +bstr.workspace = true clap.workspace = true ctor.workspace = true ctrlc.workspace = true diff --git a/cli/src/query.rs b/cli/src/query.rs index bffa0588..f32c5450 100644 --- a/cli/src/query.rs +++ b/cli/src/query.rs @@ -9,7 +9,7 @@ use std::{ use anyhow::{Context, Result}; use tree_sitter::{Language, Parser, Point, Query, QueryCursor}; -use crate::query_testing; +use crate::query_testing::{self, to_utf8_point}; #[allow(clippy::too_many_arguments)] pub fn query_files_at_paths( @@ -70,8 +70,8 @@ pub fn query_files_at_paths( } results.push(query_testing::CaptureInfo { name: (*capture_name).to_string(), - start: capture.node.start_position(), - end: capture.node.end_position(), + start: to_utf8_point(capture.node.start_position(), source_code.as_slice()), + end: to_utf8_point(capture.node.end_position(), source_code.as_slice()), }); } } else { @@ -100,8 +100,8 @@ pub fn query_files_at_paths( } results.push(query_testing::CaptureInfo { name: (*capture_name).to_string(), - start: capture.node.start_position(), - end: capture.node.end_position(), + start: to_utf8_point(capture.node.start_position(), source_code.as_slice()), + end: to_utf8_point(capture.node.end_position(), source_code.as_slice()), }); } } diff --git a/cli/src/query_testing.rs b/cli/src/query_testing.rs index 3456dbe6..258770d1 100644 --- a/cli/src/query_testing.rs +++ b/cli/src/query_testing.rs @@ -1,6 +1,7 @@ use std::fs; use anyhow::{anyhow, Result}; +use bstr::{BStr, ByteSlice}; use lazy_static::lazy_static; use regex::Regex; use tree_sitter::{Language, Parser, Point}; @@ -9,16 +10,56 @@ lazy_static! { static ref CAPTURE_NAME_REGEX: Regex = Regex::new("[\\w_\\-.]+").unwrap(); } +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct Utf8Point { + pub row: usize, + pub column: usize, +} + +impl std::fmt::Display for Utf8Point { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "({}, {})", self.row, self.column) + } +} + +impl Utf8Point { + pub const fn new(row: usize, column: usize) -> Self { + Self { row, column } + } +} + +pub fn to_utf8_point(point: Point, source: &[u8]) -> Utf8Point { + if point.column == 0 { + return Utf8Point::new(point.row, 0); + } + + let bstr = BStr::new(source); + let line = bstr.lines_with_terminator().nth(point.row).unwrap(); + let mut utf8_column = 0; + + for (_, grapheme_end, _) in line.grapheme_indices() { + utf8_column += 1; + if grapheme_end >= point.column { + break; + } + } + + Utf8Point { + row: point.row, + column: utf8_column, + } +} + #[derive(Debug, Eq, PartialEq)] pub struct CaptureInfo { pub name: String, - pub start: Point, - pub end: Point, + pub start: Utf8Point, + pub end: Utf8Point, } #[derive(Debug, PartialEq, Eq)] pub struct Assertion { - pub position: Point, + pub position: Utf8Point, pub negative: bool, pub expected_capture_name: String, } @@ -27,7 +68,7 @@ impl Assertion { #[must_use] pub fn new(row: usize, col: usize, negative: bool, expected_capture_name: String) -> Self { Self { - position: Point::new(row, col), + position: Utf8Point::new(row, col), negative, expected_capture_name, } @@ -103,7 +144,7 @@ pub fn parse_position_comments( { assertion_ranges.push((node.start_position(), node.end_position())); result.push(Assertion { - position, + position: to_utf8_point(position, source), negative, expected_capture_name: mat.as_str().to_string(), }); diff --git a/cli/src/test_highlight.rs b/cli/src/test_highlight.rs index 541d98fd..34be438f 100644 --- a/cli/src/test_highlight.rs +++ b/cli/src/test_highlight.rs @@ -7,7 +7,7 @@ use tree_sitter_highlight::{Highlight, HighlightConfiguration, HighlightEvent, H use tree_sitter_loader::{Config, Loader}; use super::{ - query_testing::{parse_position_comments, Assertion}, + query_testing::{parse_position_comments, to_utf8_point, Assertion, Utf8Point}, test::paint, util, }; @@ -141,7 +141,7 @@ fn test_highlights_indented( } pub fn iterate_assertions( assertions: &[Assertion], - highlights: &[(Point, Point, Highlight)], + highlights: &[(Utf8Point, Utf8Point, Highlight)], highlight_names: &[String], ) -> Result { // Iterate through all of the highlighting assertions, checking each one against the @@ -224,7 +224,7 @@ pub fn get_highlight_positions( highlighter: &mut Highlighter, highlight_config: &HighlightConfiguration, source: &[u8], -) -> Result> { +) -> Result> { let mut row = 0; let mut column = 0; let mut byte_offset = 0; @@ -261,7 +261,10 @@ pub fn get_highlight_positions( } } if let Some(highlight) = highlight_stack.last() { - result.push((start_position, Point::new(row, column), *highlight)); + let utf8_start_position = to_utf8_point(start_position, source.as_bytes()); + let utf8_end_position = + to_utf8_point(Point::new(row, column), source.as_bytes()); + result.push((utf8_start_position, utf8_end_position, *highlight)); } } } diff --git a/cli/src/test_tags.rs b/cli/src/test_tags.rs index c5a1dc02..5b290bda 100644 --- a/cli/src/test_tags.rs +++ b/cli/src/test_tags.rs @@ -2,12 +2,11 @@ use std::{fs, path::Path}; use anstyle::AnsiColor; use anyhow::{anyhow, Result}; -use tree_sitter::Point; use tree_sitter_loader::{Config, Loader}; use tree_sitter_tags::{TagsConfiguration, TagsContext}; use super::{ - query_testing::{parse_position_comments, Assertion}, + query_testing::{parse_position_comments, to_utf8_point, Assertion, Utf8Point}, test::paint, util, }; @@ -168,7 +167,7 @@ pub fn get_tag_positions( tags_context: &mut TagsContext, tags_config: &TagsConfiguration, source: &[u8], -) -> Result> { +) -> Result> { let (tags_iter, _has_error) = tags_context.generate_tags(tags_config, source, None)?; let tag_positions = tags_iter .filter_map(std::result::Result::ok) @@ -179,7 +178,11 @@ pub fn get_tag_positions( } else { format!("reference.{tag_postfix}") }; - (tag.span.start, tag.span.end, tag_name) + ( + to_utf8_point(tag.span.start, source), + to_utf8_point(tag.span.end, source), + tag_name, + ) }) .collect(); Ok(tag_positions) diff --git a/cli/src/tests/test_highlight_test.rs b/cli/src/tests/test_highlight_test.rs index 8699c2a6..054e33f8 100644 --- a/cli/src/tests/test_highlight_test.rs +++ b/cli/src/tests/test_highlight_test.rs @@ -1,9 +1,9 @@ -use tree_sitter::{Parser, Point}; +use tree_sitter::Parser; use tree_sitter_highlight::{Highlight, Highlighter}; use super::helpers::fixtures::{get_highlight_config, get_language, test_loader}; use crate::{ - query_testing::{parse_position_comments, Assertion}, + query_testing::{parse_position_comments, Assertion, Utf8Point}, test_highlight::get_highlight_positions, }; @@ -28,6 +28,9 @@ fn test_highlight_test_with_basic_test() { " // ^ variable", " // ^ !variable", "};", + "var y̆y̆y̆y̆ = function() {}", + " // ^ function", + " // ^ keyword", ] .join("\n"); @@ -40,6 +43,8 @@ fn test_highlight_test_with_basic_test() { Assertion::new(1, 11, false, String::from("keyword")), Assertion::new(4, 9, false, String::from("variable")), Assertion::new(4, 11, true, String::from("variable")), + Assertion::new(8, 5, false, String::from("function")), + Assertion::new(8, 11, false, String::from("keyword")), ] ); @@ -50,13 +55,16 @@ fn test_highlight_test_with_basic_test() { assert_eq!( highlight_positions, &[ - (Point::new(1, 0), Point::new(1, 3), Highlight(2)), // "var" - (Point::new(1, 4), Point::new(1, 7), Highlight(0)), // "abc" - (Point::new(1, 10), Point::new(1, 18), Highlight(2)), // "function" - (Point::new(1, 19), Point::new(1, 20), Highlight(1)), // "d" - (Point::new(4, 2), Point::new(4, 8), Highlight(2)), // "return" - (Point::new(4, 9), Point::new(4, 10), Highlight(1)), // "d" - (Point::new(4, 13), Point::new(4, 14), Highlight(1)), // "e" + (Utf8Point::new(1, 0), Utf8Point::new(1, 3), Highlight(2)), // "var" + (Utf8Point::new(1, 4), Utf8Point::new(1, 7), Highlight(0)), // "abc" + (Utf8Point::new(1, 10), Utf8Point::new(1, 18), Highlight(2)), // "function" + (Utf8Point::new(1, 19), Utf8Point::new(1, 20), Highlight(1)), // "d" + (Utf8Point::new(4, 2), Utf8Point::new(4, 8), Highlight(2)), // "return" + (Utf8Point::new(4, 9), Utf8Point::new(4, 10), Highlight(1)), // "d" + (Utf8Point::new(4, 13), Utf8Point::new(4, 14), Highlight(1)), // "e" + (Utf8Point::new(8, 0), Utf8Point::new(8, 3), Highlight(2)), // "var" + (Utf8Point::new(8, 4), Utf8Point::new(8, 8), Highlight(0)), // "y̆y̆y̆y̆" + (Utf8Point::new(8, 11), Utf8Point::new(8, 19), Highlight(2)), // "function" ] ); } diff --git a/cli/src/tests/test_tags_test.rs b/cli/src/tests/test_tags_test.rs index 5e7bf9c9..5f7b88fc 100644 --- a/cli/src/tests/test_tags_test.rs +++ b/cli/src/tests/test_tags_test.rs @@ -1,9 +1,9 @@ -use tree_sitter::{Parser, Point}; +use tree_sitter::Parser; use tree_sitter_tags::TagsContext; use super::helpers::fixtures::{get_language, get_tags_config}; use crate::{ - query_testing::{parse_position_comments, Assertion}, + query_testing::{parse_position_comments, Assertion, Utf8Point}, test_tags::get_tag_positions, }; @@ -43,18 +43,18 @@ fn test_tags_test_with_basic_test() { tag_positions, &[ ( - Point::new(1, 4), - Point::new(1, 7), + Utf8Point::new(1, 4), + Utf8Point::new(1, 7), "definition.function".to_string() ), ( - Point::new(3, 8), - Point::new(3, 11), + Utf8Point::new(3, 8), + Utf8Point::new(3, 11), "reference.call".to_string() ), ( - Point::new(5, 11), - Point::new(5, 12), + Utf8Point::new(5, 11), + Utf8Point::new(5, 12), "reference.call".to_string() ), ]