From 310a9f0704aeb8d9b1e32ff2bf9b6bd03c8032eb Mon Sep 17 00:00:00 2001
From: Amaan Qureshi <amaanq12@gmail.com>
Date: Sat, 2 Nov 2024 00:53:21 -0400
Subject: [PATCH] fix: disallow tokens that match the empty string

---
 .../src/prepare_grammar/expand_tokens.rs      |  12 ++
 .../src/prepare_grammar/flatten_grammar.rs    |  10 +-
 cli/generate/src/rules.rs                     |  10 ++
 cli/src/tests/async_context_test.rs           |   6 -
 cli/src/tests/parser_test.rs                  | 113 +++++++++++++++++-
 5 files changed, 140 insertions(+), 11 deletions(-)
diff --git a/cli/generate/src/prepare_grammar/expand_tokens.rs b/cli/generate/src/prepare_grammar/expand_tokens.rs
index 0a8a6e5a..84d05981 100644
--- a/cli/generate/src/prepare_grammar/expand_tokens.rs
+++ b/cli/generate/src/prepare_grammar/expand_tokens.rs
@@ -1,4 +1,5 @@
 use anyhow::{anyhow, Context, Result};
+use indoc::indoc;
 use regex_syntax::{
     hir::{Class, Hir, HirKind},
     ParserBuilder,
@@ -56,6 +57,19 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGram
 
     let mut variables = Vec::new();
     for (i, variable) in grammar.variables.into_iter().enumerate() {
+        // A token that can match the empty string is rejected outright; this
+        // check has no exemption for the grammar's start rule.
+        if variable.rule.is_empty() {
+            return Err(anyhow!(
+                indoc! {"
+                The token `{}` matches the empty string.
+                Tree-sitter does not support tokens that match the empty string,
+                even when the token is used only as the grammar's start rule.
+            "},
+                variable.name
+            ));
+        }
+
         let is_immediate_token = match &variable.rule {
             Rule::Metadata { params, .. } => params.is_main_token,
             _ => false,
diff --git a/cli/generate/src/prepare_grammar/flatten_grammar.rs b/cli/generate/src/prepare_grammar/flatten_grammar.rs
index e01bc0b0..86eb0c73 100644
--- a/cli/generate/src/prepare_grammar/flatten_grammar.rs
+++ b/cli/generate/src/prepare_grammar/flatten_grammar.rs
@@ -1,4 +1,5 @@
 use anyhow::{anyhow, Result};
+use indoc::indoc;
 
 use super::ExtractedSyntaxGrammar;
 use crate::{
@@ -197,11 +198,12 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxG
         for production in &variable.productions {
             if production.steps.is_empty() && symbol_is_used(&variables, symbol) {
                 return Err(anyhow!(
-                    "The rule `{}` matches the empty string.
+                    indoc! {"
+                The rule `{}` matches the empty string.
 
-Tree-sitter does not support syntactic rules that match the empty string
-unless they are used only as the grammar's start rule.
-",
+                Tree-sitter does not support syntactic rules that match the empty string
+                unless they are used only as the grammar's start rule.
+                "},
                     variable.name
                 ));
             }
diff --git a/cli/generate/src/rules.rs b/cli/generate/src/rules.rs
index 7657df88..6a922b31 100644
--- a/cli/generate/src/rules.rs
+++ b/cli/generate/src/rules.rs
@@ -149,6 +149,25 @@ impl Rule {
     pub const fn seq(rules: Vec<Self>) -> Self {
         Self::Seq(rules)
     }
+
+    /// Returns `true` if this rule can match the empty string.
+    ///
+    /// The check is structural: `Blank` and an empty `String` literal match
+    /// nothing, a `Choice` is nullable if any alternative is, and a `Seq` is
+    /// nullable only if every element is. `Pattern` contents are not analyzed
+    /// and symbols are not resolved, so those variants always report `false`.
+    pub fn is_empty(&self) -> bool {
+        match self {
+            // `Blank` is the empty rule, so it trivially matches the empty
+            // string; reporting `false` here would hide `choice(x, blank)`.
+            Self::Blank => true,
+            Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
+            Self::String(string) => string.is_empty(),
+            Self::Metadata { rule, .. } | Self::Repeat(rule) => rule.is_empty(),
+            Self::Choice(rules) => rules.iter().any(Self::is_empty),
+            Self::Seq(rules) => rules.iter().all(Self::is_empty),
+        }
+    }
 }
 
 impl Alias {
diff --git a/cli/src/tests/async_context_test.rs b/cli/src/tests/async_context_test.rs
index cb2345cc..edcd5e4c 100644
--- a/cli/src/tests/async_context_test.rs
+++ b/cli/src/tests/async_context_test.rs
@@ -22,7 +22,6 @@ fn test_node_in_fut() {
         let root_ref = &root;
 
         let fut_val_fn = || async {
-            // eprintln!("fut_val_fn: {}", root.child(0).unwrap().kind());
             yield_now().await;
             root.child(0).unwrap().kind()
         };
@@ -30,7 +29,6 @@ fn test_node_in_fut() {
         yield_now().await;
 
         let fut_ref_fn = || async {
-            // eprintln!("fut_ref_fn: {}", root_ref.child(0).unwrap().kind());
             yield_now().await;
             root_ref.child(0).unwrap().kind()
         };
@@ -40,13 +38,11 @@ fn test_node_in_fut() {
         assert_eq!(f1, f2);
 
         let fut_val = async {
-            // eprintln!("fut_val: {}", root.child(0).unwrap().kind());
             yield_now().await;
             root.child(0).unwrap().kind()
         };
 
         let fut_ref = async {
-            // eprintln!("fut_ref: {}", root_ref.child(0).unwrap().kind());
             yield_now().await;
             root_ref.child(0).unwrap().kind()
         };
@@ -58,7 +54,6 @@ fn test_node_in_fut() {
         f1
     })
     .join();
-    // eprintln!("pended: {pended:?}");
     assert_eq!(ret, "comment");
     assert_eq!(pended, 5);
 }
@@ -215,7 +210,6 @@ where
         match future.as_mut().poll(&mut cx) {
             Poll::Pending => pending += 1,
             Poll::Ready(r) => {
-                // eprintln!("ready, pended: {pending}");
                 break r;
             }
         }
diff --git a/cli/src/tests/parser_test.rs b/cli/src/tests/parser_test.rs
index 1f2bc6e7..e5dc15d9 100644
--- a/cli/src/tests/parser_test.rs
+++ b/cli/src/tests/parser_test.rs
@@ -1679,7 +1679,6 @@ fn test_decode_utf32() {
                     )
                 }
             } else {
-                println!("bad decode: {bytes:?}");
                 (0, 0)
             }
         }
@@ -1816,6 +1815,130 @@ fn test_decode_utf24le() {
     );
 }
 
+/// Checks that `generate_parser_for_grammar` returns an error for each grammar below.
+///
+/// NOTE(review): every grammar literal has a trailing comma after its `"rules"`
+/// object. If the grammar is loaded with a strict JSON parser, that alone makes
+/// each call fail, and these assertions would pass without exercising the
+/// empty-string check. Confirm, and drop the trailing commas if so.
+#[test]
+fn test_grammars_that_should_not_compile() {
+    // The start rule is an empty string literal.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1111",
+            "rules": {
+                "source_file": { "type": "STRING", "value": "" }
+            },
+        }
+        "#
+    )
+    .is_err());
+
+    // `identifier` is a token wrapping a REPEAT of the pattern `a`.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1271",
+            "rules": {
+                "source_file": { "type": "SYMBOL", "name": "identifier" },
+                "identifier": {
+                    "type": "TOKEN",
+                    "content": {
+                        "type": "REPEAT",
+                        "content": { "type": "PATTERN", "value": "a" }
+                    }
+                }
+            },
+        }
+        "#,
+    )
+    .is_err());
+
+    // The start rule is a token wrapping a REPEAT of the string `c`.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1156_expl_1",
+            "rules": {
+                "source_file": {
+                    "type": "TOKEN",
+                    "content": {
+                        "type": "REPEAT",
+                        "content": { "type": "STRING", "value": "c" }
+                    }
+                }
+            },
+        }
+    "#
+    )
+    .is_err());
+
+    // The start rule is a token whose content is a choice between `e` and BLANK.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1156_expl_2",
+            "rules": {
+                "source_file": {
+                    "type": "TOKEN",
+                    "content": {
+                        "type": "CHOICE",
+                        "members": [
+                            { "type": "STRING", "value": "e" },
+                            { "type": "BLANK" }
+                        ]
+                    }
+                }
+            },
+        }
+    "#
+    )
+    .is_err());
+
+    // Same shape as the REPEAT case above, but wrapped in IMMEDIATE_TOKEN.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1156_expl_3",
+            "rules": {
+                "source_file": {
+                    "type": "IMMEDIATE_TOKEN",
+                    "content": {
+                        "type": "REPEAT",
+                        "content": { "type": "STRING", "value": "p" }
+                    }
+                }
+            },
+        }
+    "#
+    )
+    .is_err());
+
+    // Same shape as the CHOICE case above, but wrapped in IMMEDIATE_TOKEN.
+    assert!(generate_parser_for_grammar(
+        r#"
+        {
+            "name": "issue_1156_expl_4",
+            "rules": {
+                "source_file": {
+                    "type": "IMMEDIATE_TOKEN",
+                    "content": {
+                        "type": "CHOICE",
+                        "members": [
+                            { "type": "STRING", "value": "r" },
+                            { "type": "BLANK" }
+                        ]
+                    }
+                }
+            },
+        }
+    "#
+    )
+    .is_err());
+}
+
 const fn simple_range(start: usize, end: usize) -> Range {
     Range {
         start_byte: start,