fix: disallow tokens that match the empty string

Amaan Qureshi 2024-11-02 00:53:21 -04:00
parent 8c802da174
commit 310a9f0704
5 changed files with 140 additions and 11 deletions
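For illustration, here is a minimal sketch of a grammar that this change rejects. It is modeled on the issue_1271 test case added below; the tree_sitter_cli::generate import path, the main wrapper, and the empty_token_example grammar name are assumptions made for the sketch, not part of this commit.

// Sketch only (not part of this commit): token(repeat(/a/)) can match the empty
// string because repeat accepts zero occurrences, so parser generation is now
// expected to fail with a "matches the empty string" error for this grammar.
use tree_sitter_cli::generate::generate_parser_for_grammar; // import path assumed

fn main() {
    let grammar_json = r#"
    {
        "name": "empty_token_example",
        "rules": {
            "source_file": { "type": "SYMBOL", "name": "identifier" },
            "identifier": {
                "type": "TOKEN",
                "content": {
                    "type": "REPEAT",
                    "content": { "type": "PATTERN", "value": "a" }
                }
            }
        }
    }
    "#;
    // After this change, the generator should report an error instead of producing a parser.
    assert!(generate_parser_for_grammar(grammar_json).is_err());
}

In grammar DSL terms, the JSON above corresponds roughly to `identifier: $ => token(repeat(/a/))`.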

View file

@@ -1,4 +1,5 @@
use anyhow::{anyhow, Context, Result};
use indoc::indoc;
use regex_syntax::{
hir::{Class, Hir, HirKind},
ParserBuilder,
@@ -56,6 +57,17 @@ pub fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGram
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
if variable.rule.is_empty() {
return Err(anyhow!(
indoc! {"
The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"},
variable.name
));
}
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,

View file

@@ -1,4 +1,5 @@
use anyhow::{anyhow, Result};
use indoc::indoc;
use super::ExtractedSyntaxGrammar;
use crate::{
@@ -197,11 +198,12 @@ pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxG
for production in &variable.productions {
if production.steps.is_empty() && symbol_is_used(&variables, symbol) {
return Err(anyhow!(
"The rule `{}` matches the empty string.
indoc! {"
The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
",
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
"},
variable.name
));
}

View file

@@ -149,6 +149,16 @@ impl Rule {
pub const fn seq(rules: Vec<Self>) -> Self {
Self::Seq(rules)
}
pub fn is_empty(&self) -> bool {
match self {
Self::Blank | Self::Pattern(..) | Self::NamedSymbol(_) | Self::Symbol(_) => false,
Self::String(string) => string.is_empty(),
Self::Metadata { rule, .. } | Self::Repeat(rule) => rule.is_empty(),
Self::Choice(rules) => rules.iter().any(Self::is_empty),
Self::Seq(rules) => rules.iter().all(Self::is_empty),
}
}
}
impl Alias {

View file

@@ -22,7 +22,6 @@ fn test_node_in_fut() {
let root_ref = &root;
let fut_val_fn = || async {
// eprintln!("fut_val_fn: {}", root.child(0).unwrap().kind());
yield_now().await;
root.child(0).unwrap().kind()
};
@@ -30,7 +29,6 @@ fn test_node_in_fut() {
yield_now().await;
let fut_ref_fn = || async {
// eprintln!("fut_ref_fn: {}", root_ref.child(0).unwrap().kind());
yield_now().await;
root_ref.child(0).unwrap().kind()
};
@@ -40,13 +38,11 @@ fn test_node_in_fut() {
assert_eq!(f1, f2);
let fut_val = async {
// eprintln!("fut_val: {}", root.child(0).unwrap().kind());
yield_now().await;
root.child(0).unwrap().kind()
};
let fut_ref = async {
// eprintln!("fut_ref: {}", root_ref.child(0).unwrap().kind());
yield_now().await;
root_ref.child(0).unwrap().kind()
};
@@ -58,7 +54,6 @@ fn test_node_in_fut() {
f1
})
.join();
// eprintln!("pended: {pended:?}");
assert_eq!(ret, "comment");
assert_eq!(pended, 5);
}
@@ -215,7 +210,6 @@ where
match future.as_mut().poll(&mut cx) {
Poll::Pending => pending += 1,
Poll::Ready(r) => {
// eprintln!("ready, pended: {pending}");
break r;
}
}

View file

@@ -1679,7 +1679,6 @@ fn test_decode_utf32() {
)
}
} else {
println!("bad decode: {bytes:?}");
(0, 0)
}
}
@@ -1816,6 +1815,118 @@ fn test_decode_utf24le() {
);
}
#[test]
fn test_grammars_that_should_not_compile() {
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1111",
"rules": {
"source_file": { "type": "STRING", "value": "" }
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1271",
"rules": {
"source_file": { "type": "SYMBOL", "name": "identifier" },
"identifier": {
"type": "TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "PATTERN", "value": "a" }
}
}
},
}
"#,
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_1",
"rules": {
"source_file": {
"type": "TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "STRING", "value": "c" }
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_2",
"rules": {
"source_file": {
"type": "TOKEN",
"content": {
"type": "CHOICE",
"members": [
{ "type": "STRING", "value": "e" },
{ "type": "BLANK" }
]
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_3",
"rules": {
"source_file": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "REPEAT",
"content": { "type": "STRING", "value": "p" }
}
}
},
}
"#
)
.is_err());
assert!(generate_parser_for_grammar(
r#"
{
"name": "issue_1156_expl_4",
"rules": {
"source_file": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "CHOICE",
"members": [
{ "type": "STRING", "value": "r" },
{ "type": "BLANK" }
]
}
}
},
}
"#
)
.is_err());
}
const fn simple_range(start: usize, end: usize) -> Range {
Range {
start_byte: start,