From 1704c604bf663801876572fe08b746e787cd7fdb Mon Sep 17 00:00:00 2001
From: Will Lillis <will.lillis24@gmail.com>
Date: Sun, 20 Jul 2025 20:18:33 -0400
Subject: [PATCH] feat(test): allow cst as expected output of test case

---
 crates/cli/src/parse.rs                      | 139 ++++++++--------
 crates/cli/src/test.rs                       | 166 +++++++++++++++----
 docs/src/creating-parsers/5-writing-tests.md |   6 +-
 3 files changed, 210 insertions(+), 101 deletions(-)

diff --git a/crates/cli/src/parse.rs b/crates/cli/src/parse.rs
index d6966f72..2badad79 100644
--- a/crates/cli/src/parse.rs
+++ b/crates/cli/src/parse.rs
@@ -1,6 +1,6 @@
 use std::{
     fmt, fs,
-    io::{self, StdoutLock, Write},
+    io::{self, Write},
     path::{Path, PathBuf},
     sync::atomic::{AtomicUsize, Ordering},
     time::{Duration, Instant},
@@ -501,53 +501,7 @@ pub fn parse_file_at_path(
         }
 
         if opts.output == ParseOutput::Cst {
-            let lossy_source_code = String::from_utf8_lossy(&source_code);
-            let total_width = lossy_source_code
-                .lines()
-                .enumerate()
-                .map(|(row, col)| {
-                    (row as f64).log10() as usize + (col.len() as f64).log10() as usize + 1
-                })
-                .max()
-                .unwrap_or(1);
-            let mut indent_level = 1;
-            let mut did_visit_children = false;
-            let mut in_error = false;
-            loop {
-                if did_visit_children {
-                    if cursor.goto_next_sibling() {
-                        did_visit_children = false;
-                    } else if cursor.goto_parent() {
-                        did_visit_children = true;
-                        indent_level -= 1;
-                        if !cursor.node().has_error() {
-                            in_error = false;
-                        }
-                    } else {
-                        break;
-                    }
-                } else {
-                    cst_render_node(
-                        opts,
-                        &mut cursor,
-                        &source_code,
-                        &mut stdout,
-                        total_width,
-                        indent_level,
-                        in_error,
-                    )?;
-                    if cursor.goto_first_child() {
-                        did_visit_children = false;
-                        indent_level += 1;
-                        if cursor.node().has_error() {
-                            in_error = true;
-                        }
-                    } else {
-                        did_visit_children = true;
-                    }
-                }
-            }
-            cursor.reset(tree.root_node());
+            render_cst(&source_code, &tree, &mut cursor, opts, &mut stdout)?;
             println!();
         }
 
@@ -781,6 +735,61 @@ const fn escape_invisible(c: char) -> Option<&'static str> {
     })
 }
 
+pub fn render_cst<'a, 'b: 'a>(
+    source_code: &[u8],
+    tree: &'b Tree,
+    cursor: &mut TreeCursor<'a>,
+    opts: &ParseFileOptions,
+    out: &mut impl Write,
+) -> Result<()> {
+    let lossy_source_code = String::from_utf8_lossy(source_code);
+    let total_width = lossy_source_code
+        .lines()
+        .enumerate()
+        .map(|(row, col)| (row as f64).log10() as usize + (col.len() as f64).log10() as usize + 1)
+        .max()
+        .unwrap_or(1);
+    let mut indent_level = 1;
+    let mut did_visit_children = false;
+    let mut in_error = false;
+    loop {
+        if did_visit_children {
+            if cursor.goto_next_sibling() {
+                did_visit_children = false;
+            } else if cursor.goto_parent() {
+                did_visit_children = true;
+                indent_level -= 1;
+                if !cursor.node().has_error() {
+                    in_error = false;
+                }
+            } else {
+                break;
+            }
+        } else {
+            cst_render_node(
+                opts,
+                cursor,
+                source_code,
+                out,
+                total_width,
+                indent_level,
+                in_error,
+            )?;
+            if cursor.goto_first_child() {
+                did_visit_children = false;
+                indent_level += 1;
+                if cursor.node().has_error() {
+                    in_error = true;
+                }
+            } else {
+                did_visit_children = true;
+            }
+        }
+    }
+    cursor.reset(tree.root_node());
+    Ok(())
+}
+
 fn render_node_text(source: &str) -> String {
     source
         .chars()
@@ -796,7 +805,7 @@ fn render_node_text(source: &str) -> String {
 
 fn write_node_text(
     opts: &ParseFileOptions,
-    stdout: &mut StdoutLock<'static>,
+    out: &mut impl Write,
     cursor: &TreeCursor,
     is_named: bool,
     source: &str,
@@ -812,7 +821,7 @@ fn write_node_text(
 
     if !is_named {
         write!(
-            stdout,
+            out,
             "{}{}{}",
             paint(quote_color, &String::from(quote)),
             paint(color, &render_node_text(source)),
@@ -838,7 +847,7 @@ fn write_node_text(
             let formatted_line = render_line_feed(line, opts);
             if !opts.no_ranges {
                 write!(
-                    stdout,
+                    out,
                     "{}{}{}{}{}{}",
                     if multiline { "\n" } else { "" },
                     if multiline {
@@ -857,7 +866,7 @@ fn write_node_text(
                 )?;
             } else {
                 write!(
-                    stdout,
+                    out,
                     "\n{}{}{}{}",
                     "  ".repeat(indent_level + 1),
                     paint(quote_color, &String::from(quote)),
@@ -920,7 +929,7 @@ fn cst_render_node(
     opts: &ParseFileOptions,
     cursor: &mut TreeCursor,
     source_code: &[u8],
-    stdout: &mut StdoutLock<'static>,
+    out: &mut impl Write,
     total_width: usize,
     indent_level: usize,
     in_error: bool,
@@ -929,13 +938,13 @@ fn cst_render_node(
     let is_named = node.is_named();
     if !opts.no_ranges {
         write!(
-            stdout,
+            out,
             "{}",
             render_node_range(opts, cursor, is_named, false, total_width, node.range())
         )?;
     }
     write!(
-        stdout,
+        out,
         "{}{}",
         "  ".repeat(indent_level),
         if in_error && !node.has_error() {
@@ -947,14 +956,14 @@ fn cst_render_node(
     if is_named {
         if let Some(field_name) = cursor.field_name() {
             write!(
-                stdout,
+                out,
                 "{}",
                 paint(opts.parse_theme.field, &format!("{field_name}: "))
             )?;
         }
 
         if node.has_error() || node.is_error() {
-            write!(stdout, "{}", paint(opts.parse_theme.error, "•"))?;
+            write!(out, "{}", paint(opts.parse_theme.error, "•"))?;
         }
 
         let kind_color = if node.is_error() {
@@ -964,13 +973,13 @@ fn cst_render_node(
         } else {
             opts.parse_theme.node_kind
         };
-        write!(stdout, "{} ", paint(kind_color, node.kind()))?;
+        write!(out, "{} ", paint(kind_color, node.kind()))?;
 
         if node.child_count() == 0 {
             // Node text from a pattern or external scanner
             write_node_text(
                 opts,
-                stdout,
+                out,
                 cursor,
                 is_named,
                 &String::from_utf8_lossy(&source_code[node.start_byte()..node.end_byte()]),
@@ -979,17 +988,13 @@ fn cst_render_node(
             )?;
         }
     } else if node.is_missing() {
-        write!(stdout, "{}: ", paint(opts.parse_theme.missing, "MISSING"))?;
-        write!(
-            stdout,
-            "\"{}\"",
-            paint(opts.parse_theme.missing, node.kind())
-        )?;
+        write!(out, "{}: ", paint(opts.parse_theme.missing, "MISSING"))?;
+        write!(out, "\"{}\"", paint(opts.parse_theme.missing, node.kind()))?;
     } else {
         // Terminal literals, like "fn"
         write_node_text(
             opts,
-            stdout,
+            out,
             cursor,
             is_named,
             node.kind(),
@@ -997,7 +1002,7 @@ fn cst_render_node(
             (total_width, indent_level),
         )?;
     }
-    writeln!(stdout)?;
+    writeln!(out)?;
 
     Ok(())
 }
diff --git a/crates/cli/src/test.rs b/crates/cli/src/test.rs
index e128bc80..a0de1072 100644
--- a/crates/cli/src/test.rs
+++ b/crates/cli/src/test.rs
@@ -23,7 +23,9 @@ use tree_sitter::{format_sexp, Language, LogType, Parser, Query, Tree};
 use walkdir::WalkDir;
 
 use super::util;
-use crate::parse::Stats;
+use crate::parse::{
+    render_cst, ParseDebugType, ParseFileOptions, ParseOutput, ParseStats, ParseTheme, Stats,
+};
 
 static HEADER_REGEX: LazyLock<ByteRegex> = LazyLock::new(|| {
     ByteRegexBuilder::new(
@@ -82,6 +84,7 @@ pub struct TestAttributes {
     pub platform: bool,
     pub fail_fast: bool,
     pub error: bool,
+    pub cst: bool,
     pub languages: Vec<Box<str>>,
 }
 
@@ -102,6 +105,7 @@ impl Default for TestAttributes {
             platform: true,
             fail_fast: false,
             error: false,
+            cst: false,
             languages: vec!["".into()],
         }
     }
@@ -246,22 +250,27 @@ pub fn run_tests_at_path(parser: &mut Parser, opts: &mut TestOptions) -> Result<
                 if opts.color {
                     print_diff_key();
                 }
-                for (i, (name, actual, expected)) in failures.iter().enumerate() {
+                for (i, (name, actual, expected, is_cst)) in failures.iter().enumerate() {
                     if expected == "NO ERROR" {
                         println!("\n  {}. {name}:\n", i + 1);
                         println!("  Expected an ERROR node, but got:");
-                        println!(
-                            "  {}",
-                            paint(
-                                opts.color.then_some(AnsiColor::Red),
-                                &format_sexp(actual, 2)
-                            )
-                        );
+                        let actual = if *is_cst {
+                            actual
+                        } else {
+                            &format_sexp(actual, 2)
+                        };
+                        println!("  {}", paint(opts.color.then_some(AnsiColor::Red), actual));
                     } else {
                         println!("\n  {}. {name}:", i + 1);
-                        let actual = format_sexp(actual, 2);
-                        let expected = format_sexp(expected, 2);
-                        print_diff(&actual, &expected, opts.color);
+                        if *is_cst {
+                            print_diff(actual, expected, opts.color);
+                        } else {
+                            print_diff(
+                                &format_sexp(actual, 2),
+                                &format_sexp(expected, 2),
+                                opts.color,
+                            );
+                        }
                     }
                 }
             }
@@ -348,6 +357,8 @@ pub fn paint(color: Option<impl Into<Color>>, text: &str) -> String {
     format!("{style}{text}{style:#}")
 }
 
+// TODO: Move the ridiculous tuple arguments into structs
+
 /// This will return false if we want to "fail fast". It will bail and not parse any more tests.
 #[allow(clippy::too_many_arguments)]
 fn run_tests(
@@ -355,7 +366,9 @@ fn run_tests(
     test_entry: TestEntry,
     opts: &mut TestOptions,
     mut indent_level: u32,
-    failures: &mut Vec<(String, String, String)>,
+    // (name, actual, expected, is_cst)
+    failures: &mut Vec<(String, String, String, bool)>,
+    // (name, input, output, attributes_str, header_delim_len, divider_delim_len) — TODO confirm
     corrected_entries: &mut Vec<(String, String, String, String, usize, usize)>,
     has_parse_errors: &mut bool,
 ) -> Result<bool> {
@@ -431,7 +444,11 @@ fn run_tests(
                         opts.stats.successful_parses += 1;
                         if opts.update {
                             let input = String::from_utf8(input.clone()).unwrap();
-                            let output = format_sexp(&output, 0);
+                            let output = if attributes.cst {
+                                output.clone()
+                            } else {
+                                format_sexp(&output, 0)
+                            };
                             corrected_entries.push((
                                 name.clone(),
                                 input,
@@ -445,7 +462,11 @@ fn run_tests(
                         if opts.update {
                             let input = String::from_utf8(input.clone()).unwrap();
                             // Keep the original `expected` output if the actual output has no error
-                            let output = format_sexp(&output, 0);
+                            let output = if attributes.cst {
+                                output.clone()
+                            } else {
+                                format_sexp(&output, 0)
+                            };
                             corrected_entries.push((
                                 name.clone(),
                                 input,
@@ -461,10 +482,16 @@ fn run_tests(
                             opts.test_num,
                             paint(opts.color.then_some(AnsiColor::Red), &name),
                         )?;
+                        let actual = if attributes.cst {
+                            render_test_cst(&input, &tree)?
+                        } else {
+                            tree.root_node().to_sexp()
+                        };
                         failures.push((
                             name.clone(),
-                            tree.root_node().to_sexp(),
+                            actual,
                             "NO ERROR".to_string(),
+                            attributes.cst,
                         ));
                     }
 
@@ -472,8 +499,12 @@ fn run_tests(
                         return Ok(false);
                     }
                 } else {
-                    let mut actual = tree.root_node().to_sexp();
-                    if !(opts.show_fields || has_fields) {
+                    let mut actual = if attributes.cst {
+                        render_test_cst(&input, &tree)?
+                    } else {
+                        tree.root_node().to_sexp()
+                    };
+                    if !(attributes.cst || opts.show_fields || has_fields) {
                         actual = strip_sexp_fields(&actual);
                     }
 
@@ -487,7 +518,11 @@ fn run_tests(
                         opts.stats.successful_parses += 1;
                         if opts.update {
                             let input = String::from_utf8(input.clone()).unwrap();
-                            let output = format_sexp(&output, 0);
+                            let output = if attributes.cst {
+                                actual
+                            } else {
+                                format_sexp(&output, 0)
+                            };
                             corrected_entries.push((
                                 name.clone(),
                                 input,
@@ -500,8 +535,11 @@ fn run_tests(
                     } else {
                         if opts.update {
                             let input = String::from_utf8(input.clone()).unwrap();
-                            let expected_output = format_sexp(&output, 0);
-                            let actual_output = format_sexp(&actual, 0);
+                            let (expected_output, actual_output) = if attributes.cst {
+                                (output.clone(), actual.clone())
+                            } else {
+                                (format_sexp(&output, 0), format_sexp(&actual, 0))
+                            };
 
                             // Only bail early before updating if the actual is not the output,
                             // sometimes users want to test cases that
@@ -544,7 +582,7 @@ fn run_tests(
                                 paint(opts.color.then_some(AnsiColor::Red), &name),
                             )?;
                         }
-                        failures.push((name.clone(), actual, output.clone()));
+                        failures.push((name.clone(), actual, output.clone(), attributes.cst));
 
                         if attributes.fail_fast {
                             return Ok(false);
@@ -657,6 +695,28 @@ fn run_tests(
     Ok(true)
 }
 
+/// Renders `tree` for a test entry via `render_cst` (ranges on, empty theme); output is trimmed.
+fn render_test_cst(input: &[u8], tree: &Tree) -> Result<String> {
+    let mut rendered_cst: Vec<u8> = Vec::new();
+    let mut cursor = tree.walk();
+    let opts = ParseFileOptions {
+        edits: &[],
+        output: ParseOutput::Cst,
+        stats: &mut ParseStats::default(),
+        print_time: false,
+        timeout: 0,
+        debug: ParseDebugType::Quiet,
+        debug_graph: false,
+        cancellation_flag: None,
+        encoding: None,
+        open_log: false,
+        no_ranges: false,
+        parse_theme: &ParseTheme::empty(),
+    };
+    render_cst(input, tree, &mut cursor, &opts, &mut rendered_cst)?;
+    Ok(String::from_utf8_lossy(&rendered_cst).trim().to_string())
+}
+
 // Parse time is interpreted in ns before converting to ms to avoid truncation issues
 // Parse rates often have several outliers, leading to a large standard deviation. Taking
 // the log of these rates serves to "flatten" out the distribution, yielding a more
@@ -776,8 +836,8 @@ fn parse_test_content(name: String, content: &str, file_path: Option<PathBuf>) -
             .name("suffix2")
             .map(|m| String::from_utf8_lossy(m.as_bytes()));
 
-        let (mut skip, mut platform, mut fail_fast, mut error, mut languages) =
-            (false, None, false, false, vec![]);
+        let (mut skip, mut platform, mut fail_fast, mut error, mut cst, mut languages) =
+            (false, None, false, false, false, vec![]);
 
         let test_name_and_markers = c
             .name("test_name_and_markers")
@@ -818,6 +878,7 @@ fn parse_test_content(name: String, content: &str, file_path: Option<PathBuf>) -
                         languages.push(lang.into());
                     }
                 }
+                ":cst" => (seen_marker, cst) = (true, true),
                 _ if !seen_marker => {
                     test_name.push_str(line);
                 }
@@ -858,6 +919,7 @@ fn parse_test_content(name: String, content: &str, file_path: Option<PathBuf>) -
                     platform: platform.unwrap_or(true),
                     fail_fast,
                     error,
+                    cst,
                     languages,
                 },
             ))
@@ -910,16 +972,22 @@ fn parse_test_content(name: String, content: &str, file_path: Option<PathBuf>) -
                         input.pop();
                     }
 
-                    // Remove all comments
-                    let output = COMMENT_REGEX.replace_all(output, "").to_string();
+                    let (output, has_fields) = if prev_attributes.cst {
+                        (output.trim().to_string(), false)
+                    } else {
+                        // Remove all comments
+                        let output = COMMENT_REGEX.replace_all(output, "").to_string();
 
-                    // Normalize the whitespace in the expected output.
-                    let output = WHITESPACE_REGEX.replace_all(output.trim(), " ");
-                    let output = output.replace(" )", ")");
+                        // Normalize the whitespace in the expected output.
+                        let output = WHITESPACE_REGEX.replace_all(output.trim(), " ");
+                        let output = output.replace(" )", ")");
 
-                    // Identify if the expected output has fields indicated. If not, then
-                    // fields will not be checked.
-                    let has_fields = SEXP_FIELD_REGEX.is_match(&output);
+                        // Identify if the expected output has fields indicated. If not, then
+                        // fields will not be checked.
+                        let has_fields = SEXP_FIELD_REGEX.is_match(&output);
+
+                        (output, has_fields)
+                    };
 
                     let file_name = if let Some(ref path) = file_path {
                         path.file_name().map(|n| n.to_string_lossy().to_string())
@@ -1493,6 +1561,7 @@ a
                         platform: true,
                         fail_fast: false,
                         error: false,
+                        cst: false,
                         languages: vec!["".into()]
                     },
                     file_name: None,
@@ -1522,6 +1591,16 @@ Test with bad platform marker
 a
 ---
 (b)
+
+====================
+Test with cst marker
+:cst
+====================
+1
+---
+0:0 - 1:0   source_file
+0:0 - 0:1   expression
+0:0 - 0:1     number_literal `1`
 ",
                 std::env::consts::OS,
                 if std::env::consts::OS == "linux" {
@@ -1552,6 +1631,7 @@ a
                             platform: true,
                             fail_fast: true,
                             error: false,
+                            cst: false,
                             languages: vec!["".into()]
                         },
                         file_name: None,
@@ -1573,9 +1653,31 @@ a
                             platform: false,
                             fail_fast: false,
                             error: false,
+                            cst: false,
                             languages: vec!["foo".into()]
                         },
                         file_name: None,
+                    },
+                    TestEntry::Example {
+                        name: "Test with cst marker".to_string(),
+                        input: b"1".to_vec(),
+                        output: "0:0 - 1:0   source_file
+0:0 - 0:1   expression
+0:0 - 0:1     number_literal `1`"
+                            .to_string(),
+                        header_delim_len: 20,
+                        divider_delim_len: 3,
+                        has_fields: false,
+                        attributes_str: ":cst".to_string(),
+                        attributes: TestAttributes {
+                            skip: false,
+                            platform: true,
+                            fail_fast: false,
+                            error: false,
+                            cst: true,
+                            languages: vec!["".into()]
+                        },
+                        file_name: None,
                     }
                 ]
             }
diff --git a/docs/src/creating-parsers/5-writing-tests.md b/docs/src/creating-parsers/5-writing-tests.md
index b1011968..7ed483b1 100644
--- a/docs/src/creating-parsers/5-writing-tests.md
+++ b/docs/src/creating-parsers/5-writing-tests.md
@@ -99,8 +99,8 @@ you can repeat the attribute on a new line.
 
 The following attributes are available:
 
-* `:skip` — This attribute will skip the test when running `tree-sitter test`.
-  This is useful when you want to temporarily disable running a test without deleting it.
+* `:cst` — This attribute specifies that the expected output should be in the form of a CST instead of the normal S-expression. This
+CST matches the format given by `tree-sitter parse --cst`.
 * `:error` — This attribute will assert that the parse tree contains an error. It's useful to just validate that a certain
 input is invalid without displaying the whole parse tree, as such you should omit the parse tree below the `---` line.
 * `:fail-fast` — This attribute will stop the testing additional tests if the test marked with this attribute fails.
@@ -109,6 +109,8 @@ multi-parser repos, such as XML and DTD, or Typescript and TSX. The default pars
 the `grammars` field in the `tree-sitter.json` config file, so having a way to pick a second or even third parser is useful.
 * `:platform(PLATFORM)` — This attribute specifies the platform on which the test should run. It is useful to test platform-specific
 behavior (e.g. Windows newlines are different from Unix). This attribute must match up with Rust's [`std::env::consts::OS`][constants].
+* `:skip` — This attribute will skip the test when running `tree-sitter test`.
+This is useful when you want to temporarily disable running a test without deleting it.
 
 Examples using attributes: