fix: disallow tokens that match the empty string

Amaan Qureshi 2024-11-02 00:53:21 -04:00
parent 8c802da174
commit 310a9f0704
5 changed files with 140 additions and 11 deletions
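For illustration, here is a minimal sketch of a grammar that this change rejects. It is modeled on the issue_1271 test case added below; the tree_sitter_cli::generate import path, the main wrapper, and the empty_token_example grammar name are assumptions made for the sketch, not part of this commit.

// Sketch only (not part of this commit): token(repeat(/a/)) can match the empty
// string because repeat accepts zero occurrences, so parser generation is now
// expected to fail with a "matches the empty string" error for this grammar.
use tree_sitter_cli::generate::generate_parser_for_grammar; // import path assumed

fn main() {
    let grammar_json = r#"
    {
        "name": "empty_token_example",
        "rules": {
            "source_file": { "type": "SYMBOL", "name": "identifier" },
            "identifier": {
                "type": "TOKEN",
                "content": {
                    "type": "REPEAT",
                    "content": { "type": "PATTERN", "value": "a" }
                }
            }
        }
    }
    "#;
    // After this change, the generator should report an error instead of producing a parser.
    assert!(generate_parser_for_grammar(grammar_json).is_err());
}

In grammar DSL terms, the JSON above corresponds roughly to `identifier: $ => token(repeat(/a/))`.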

View file

@@ -1,4 +1,5 @@
use anyhow::{anyhow, Context, Result};
use indoc::indoc;
use regex_syntax::{
hir::{Class, Hir, HirKind},
ParserBuilder,
@@ -56,6 +57,17 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGram
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
if variable.rule.is_empty() {
return Err(anyhow!(
indoc! {"
The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"},
variable.name
));
}
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,

View file

@@ -1,4 +1,5 @@
use anyhow::{anyhow, Result};
use indoc::indoc;
use super::ExtractedSyntaxGrammar;
use crate::{
@@ -197,11 +198,12 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxG
for production in &variable.productions {
if production.steps.is_empty() && symbol_is_used(&variables, symbol) {
return Err(anyhow!(
"The rule `{}` matches the empty string.
indoc! {"
The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
",
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"},
variable.name
));
}

View file

@@ -149,6 +149,16 @@ impl Rule {
pub const fn seq(rules: Vec<Self>) -> Self {
Self::Seq(rules)
}
pub fn is_empty(&self) -> bool {
match self {
Self::Blank | Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
Self::String(string) => string.is_empty(),
Self::Metadata { rule, .. } | Self::Repeat(rule) => rule.is_empty(),
Self::Choice(rules) => rules.iter().any(Self::is_empty),
Self::Seq(rules) => rules.iter().all(Self::is_empty),
}
}
}
impl Alias {

View file

@@ -22,7 +22,6 @@ fn test_node_in_fut() {
let root_ref = &root;
let fut_val_fn = || async {
// eprintln!("fut_val_fn: {}", root.child(0).unwrap().kind());
yield_now().await;
root.child(0).unwrap().kind()
};
@@ -30,7 +29,6 @@ fn test_node_in_fut() {
yield_now().await;
let fut_ref_fn = || async {
// eprintln!("fut_ref_fn: {}", root_ref.child(0).unwrap().kind());
yield_now().await;
root_ref.child(0).unwrap().kind()
};
@@ -40,13 +38,11 @@ fn test_node_in_fut() {
assert_eq!(f1, f2);
let fut_val = async {
// eprintln!("fut_val: {}", root.child(0).unwrap().kind());
yield_now().await;
root.child(0).unwrap().kind()
};
let fut_ref = async {
// eprintln!("fut_ref: {}", root_ref.child(0).unwrap().kind());
yield_now().await;
root_ref.child(0).unwrap().kind()
};
@@ -58,7 +54,6 @@ fn test_node_in_fut() {
f1
})
.join();
// eprintln!("pended: {pended:?}");
assert_eq!(ret, "comment");
assert_eq!(pended, 5);
}
@@ -215,7 +210,6 @@ where
match future.as_mut().poll(&mut cx) {
Poll::Pending => pending += 1,
Poll::Ready(r) => {
// eprintln!("ready, pended: {pending}");
break r;
}
}

View file

@@ -1679,7 +1679,6 @@ fn test_decode_utf32() {
)
}
} else {
println!("bad decode: {bytes:?}");
(0, 0)
}
}
@@ -1816,6 +1815,118 @@ fn test_decode_utf24le() {
);
}
#[test]
fn test_grammars_that_should_not_compile() {
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1111",
"rules": {
"source_file": { "type": "STRING", "value": "" }
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1271",
"rules": {
"source_file": { "type": "SYMBOL", "name": "identifier" },
"identifier": {
"type": "TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "PATTERN", "value": "a" }
}
}
},
}
"#,
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_1",
"rules": {
"source_file": {
"type": "TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "STRING", "value": "c" }
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_2",
"rules": {
"source_file": {
"type": "TOKEN",
"content": {
"type": "CHOICE",
"members": [
{ "type": "STRING", "value": "e" },
{ "type": "BLANK" }
]
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_3",
"rules": {
"source_file": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "STRING", "value": "p" }
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_4",
"rules": {
"source_file": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "CHOICE",
"members": [
{ "type": "STRING", "value": "r" },
{ "type": "BLANK" }
]
}
}
},
}
"#
)
.is_err());
}
const fn simple_range(start: usize, end: usize) -> Range {
Range {
start_byte: start,