From 8291d294fb0b251addc745c90863e22792f5cc28 Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Mon, 7 Jan 2019 17:57:27 -0800
Subject: [PATCH] Add test subcommand

Co-Authored-By: Timothy Clem <timothy.clem@gmail.com>
---
 Cargo.lock        |   2 +
 cli/Cargo.toml    |   2 +
 cli/src/error.rs  |  14 +++
 cli/src/loader.rs | 241 ++++++++++++++++++++++++++++++++++++++++++++++
 cli/src/main.rs   |  34 +++++--
 cli/src/test.rs   | 212 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 496 insertions(+), 9 deletions(-)
 create mode 100644 cli/src/loader.rs
 create mode 100644 cli/src/test.rs
diff --git a/Cargo.lock b/Cargo.lock
index 758dcad7..7a249312 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -641,6 +641,7 @@ dependencies = [
 name = "tree-sitter-cli"
 version = "0.1.0"
 dependencies = [
+ "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "dirs 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "hashbrown 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -648,6 +649,7 @@ dependencies = [
  "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "libloading 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "rusqlite 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde 1.0.80 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 6a9c253d..200fd2f1 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -9,6 +9,7 @@ name = "tree-sitter"
 path = "src/main.rs"
 
 [dependencies]
+ansi_term = "0.11"
 lazy_static = "1.2.0"
 smallbitvec = "2.3.0"
 clap = "2.32"
@@ -20,6 +21,7 @@ rusqlite = "0.14.0"
 serde = "1.0"
 serde_derive = "1.0"
 regex-syntax = "0.6.4"
+regex = "1"
 
 [dependencies.tree-sitter]
 path = "../lib"
diff --git a/cli/src/error.rs b/cli/src/error.rs
index 9a5801f8..1b8b1a79 100644
--- a/cli/src/error.rs
+++ b/cli/src/error.rs
@@ -1,3 +1,5 @@
+use std::io;
+
 #[derive(Debug)]
 pub struct Error(pub String);
 
@@ -22,3 +24,15 @@ impl From<serde_json::Error> for Error {
         Error(error.to_string())
     }
 }
+
+impl From<io::Error> for Error {
+    fn from(error: io::Error) -> Self {
+        Error(error.to_string())
+    }
+}
+
+impl From<String> for Error {
+    fn from(error: String) -> Self {
+        Error(error)
+    }
+}
diff --git a/cli/src/loader.rs b/cli/src/loader.rs
new file mode 100644
index 00000000..7dfb233b
--- /dev/null
+++ b/cli/src/loader.rs
@@ -0,0 +1,241 @@
+use libloading::{Library, Symbol};
+use regex::{Regex, RegexBuilder};
+use std::collections::HashMap;
+use std::fs;
+use std::io;
+use std::mem;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+use tree_sitter::{Language, PropertySheet};
+
+const PACKAGE_JSON_PATH: &'static str = "package.json";
+const PARSER_C_PATH: &'static str = "src/parser.c";
+const SCANNER_C_PATH: &'static str = "src/scanner.c";
+const SCANNER_CC_PATH: &'static str = "src/scanner.cc";
+
+#[cfg(unix)]
+const DYLIB_EXTENSION: &'static str = "so";
+
+#[cfg(windows)]
+const DYLIB_EXTENSION: &'static str = "dll";
+
+struct LanguageRepo {
+    name: String,
+    path: PathBuf,
+    language: Option<Language>,
+    configurations: Vec<LanguageConfiguration>,
+}
+
+pub struct LanguageConfiguration {
+    name: String,
+    content_regex: Option<Regex>,
+    first_line_regex: Option<Regex>,
+    file_types: Vec<String>,
+    highlight_property_sheet: Option<Result<PropertySheet, PathBuf>>,
+}
+
+pub struct Loader {
+    parser_lib_path: PathBuf,
+    language_repos: Vec<LanguageRepo>,
+    language_configuration_indices_by_file_type: HashMap<String, Vec<(usize, usize)>>,
+}
+
+unsafe impl Send for Loader {}
+unsafe impl Sync for Loader {}
+
+impl Loader {
+    pub fn new(parser_lib_path: PathBuf) -> Self {
+        Loader {
+            parser_lib_path,
+            language_repos: Vec::new(),
+            language_configuration_indices_by_file_type: HashMap::new(),
+        }
+    }
+
+    /// Registers every `tree-sitter-*` entry of each given directory. An entry that fails
+    /// to load is reported on stderr and skipped; a directory listing failure is returned.
+    pub fn find_parsers(&mut self, parser_src_paths: &[PathBuf]) -> io::Result<()> {
+        for parser_container_dir in parser_src_paths {
+            for entry in fs::read_dir(parser_container_dir)? {
+                let entry = entry?;
+                let file_name = entry.file_name();
+                let dir_name = file_name.to_str().unwrap_or("");
+                if dir_name.starts_with("tree-sitter-") {
+                    if let Err(error) = self.load_language_configurations(&entry.path()) {
+                        eprintln!("Error loading {}: {}", dir_name, error);
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    pub fn language_configuration_at_path(
+        &mut self,
+        path: &Path,
+    ) -> io::Result<Option<(Language, &LanguageConfiguration)>> {
+        let repo_index = self.load_language_configurations(path)?;
+        self.load_language_from_repo(repo_index, 0)
+    }
+
+    /// Finds the language configuration registered for the file at `path` and loads its
+    /// language.
+    ///
+    /// The lookup tries the full file name first and falls back to the file extension.
+    /// Returns `Ok(None)` when no registered configuration lists a matching file type,
+    /// and an error when the matching language cannot be loaded.
+    pub fn language_for_file_name(
+        &mut self,
+        path: &Path,
+    ) -> io::Result<Option<(Language, &LanguageConfiguration)>> {
+        let indices_by_file_type = &self.language_configuration_indices_by_file_type;
+        let file_name = path.file_name().and_then(|name| name.to_str());
+        let extension = path.extension().and_then(|extension| extension.to_str());
+        // TODO use `content-regex` to pick one
+        // Until then the first registered configuration wins. The indices are copied out
+        // so that the shared borrow of the map ends before `self` is borrowed mutably.
+        let first_match = file_name
+            .and_then(|file_name| indices_by_file_type.get(file_name))
+            .or_else(|| extension.and_then(|extension| indices_by_file_type.get(extension)))
+            .and_then(|indices| indices.first().cloned());
+
+        match first_match {
+            Some((repo_index, conf_index)) => {
+                self.load_language_from_repo(repo_index, conf_index)
+            }
+            None => Ok(None),
+        }
+    }
+
+    /// Returns the language of the repo at `repo_index` together with that repo's
+    /// configuration at `conf_index`, or `Ok(None)` if it has no such configuration.
+    ///
+    /// The language is loaded through `load_language_at_path` on first use and cached on
+    /// the repo afterwards. Panics if `repo_index` is out of bounds.
+    fn load_language_from_repo(
+        &mut self,
+        repo_index: usize,
+        conf_index: usize,
+    ) -> io::Result<Option<(Language, &LanguageConfiguration)>> {
+        let language = match self.language_repos[repo_index].language {
+            Some(cached) => cached,
+            None => {
+                let repo = &self.language_repos[repo_index];
+                let loaded = self.load_language_at_path(&repo.name, &repo.path)?;
+                self.language_repos[repo_index].language = Some(loaded);
+                loaded
+            }
+        };
+        let configurations = &self.language_repos[repo_index].configurations;
+        Ok(configurations.get(conf_index).map(|configuration| (language, configuration)))
+    }
+
+    /// Returns the `Language` for the parser in `language_path`, first compiling it into a
+    /// shared library under `parser_lib_path` when that library is missing or older than
+    /// `src/parser.c`. A compiler that cannot be run or that exits with a failure is an error.
+    fn load_language_at_path(&self, name: &str, language_path: &Path) -> io::Result<Language> {
+        let parser_c_path = language_path.join(PARSER_C_PATH);
+        let mut library_path = self.parser_lib_path.join(name);
+        library_path.set_extension(DYLIB_EXTENSION);
+        if !library_path.exists() || was_modified_more_recently(&parser_c_path, &library_path)? {
+            let compiler_name = std::env::var("CXX").unwrap_or_else(|_| "c++".to_owned());
+            let mut command = Command::new(compiler_name);
+            command.args(&["-shared", "-fPIC", "-I"]).arg(language_path.join("src"));
+            command.arg("-o").arg(&library_path).arg("-xc").arg(parser_c_path);
+            let scanner_c_path = language_path.join(SCANNER_C_PATH);
+            let scanner_cc_path = language_path.join(SCANNER_CC_PATH);
+            if scanner_c_path.exists() {
+                command.arg("-xc").arg(scanner_c_path);
+            } else if scanner_cc_path.exists() {
+                command.arg("-xc++").arg(scanner_cc_path);
+            }
+            let output = command.output()?;
+            if !output.status.success() {
+                let stderr = String::from_utf8_lossy(&output.stderr);
+                let message = format!("Failed to compile parser {}:\n{}", name, stderr);
+                return Err(io::Error::new(io::ErrorKind::Other, message));
+            }
+        }
+
+        let library = Library::new(library_path)?;
+        let language_fn_name = format!("tree_sitter_{}", name);
+        let language = unsafe {
+            let language_fn: Symbol<unsafe extern "C" fn() -> Language> =
+                library.get(language_fn_name.as_bytes())?;
+            language_fn()
+        };
+        mem::forget(library); // leaked on purpose so that the library is never unloaded
+        Ok(language)
+    }
+
+    /// Reads the `package.json` in `parser_path` and registers the directory as a new
+    /// language repo, indexing its configurations by file type. The language name is the
+    /// directory name without its `tree-sitter-` prefix, if any. Returns the repo's index.
+    fn load_language_configurations(&mut self, parser_path: &Path) -> io::Result<usize> {
+        // Report an unusable directory name as an error instead of panicking: the path may
+        // be an arbitrary working directory whose name lacks the `tree-sitter-` prefix.
+        let dir_name = parser_path
+            .file_name()
+            .and_then(|name| name.to_str())
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "Invalid parser path"))?;
+        let skip = if dir_name.starts_with("tree-sitter-") { "tree-sitter-".len() } else { 0 };
+        let name = &dir_name[skip..];
+
+        #[derive(Deserialize)]
+        struct LanguageConfigurationJSON {
+            name: String,
+            #[serde(rename = "file-types")]
+            file_types: Option<Vec<String>>,
+            #[serde(rename = "content-regex")]
+            content_regex: Option<String>,
+            #[serde(rename = "first-line-regex")]
+            first_line_regex: Option<String>,
+            highlights: Option<String>,
+        }
+
+        #[derive(Deserialize)]
+        struct PackageJSON {
+            #[serde(rename = "tree-sitter")]
+            tree_sitter: Option<Vec<LanguageConfigurationJSON>>,
+        }
+
+        let package_json_contents = fs::read_to_string(&parser_path.join(PACKAGE_JSON_PATH))?;
+        let package_json: PackageJSON = serde_json::from_str(&package_json_contents)?;
+        let build_regex = |source: String| RegexBuilder::new(&source).multi_line(true).build().ok();
+        let configurations: Vec<_> = package_json
+            .tree_sitter
+            .unwrap_or_default()
+            .into_iter()
+            .map(|conf| LanguageConfiguration {
+                name: conf.name,
+                file_types: conf.file_types.unwrap_or_default(),
+                content_regex: conf.content_regex.and_then(build_regex),
+                first_line_regex: conf.first_line_regex.and_then(build_regex),
+                highlight_property_sheet: conf.highlights.map(|d| Err(d.into())),
+            })
+            .collect();
+
+        let repo_index = self.language_repos.len();
+        for (i, configuration) in configurations.iter().enumerate() {
+            for file_type in &configuration.file_types {
+                self.language_configuration_indices_by_file_type
+                    .entry(file_type.to_string())
+                    .or_insert_with(Vec::new)
+                    .push((repo_index, i));
+            }
+        }
+
+        self.language_repos.push(LanguageRepo {
+            name: name.to_string(),
+            path: parser_path.to_owned(),
+            language: None,
+            configurations,
+        });
+
+        Ok(repo_index)
+    }
+}
+
+fn was_modified_more_recently(a: &Path, b: &Path) -> io::Result<bool> {
+    fs::metadata(a)?.modified().and_then(|time_a| Ok(time_a > fs::metadata(b)?.modified()?))
+}
diff --git a/cli/src/main.rs b/cli/src/main.rs
index fe6ffd8c..87f9e26d 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -5,14 +5,20 @@ extern crate log;
 #[macro_use]
 extern crate serde_derive;
 extern crate hashbrown;
+extern crate regex;
 extern crate serde_json;
 
 mod error;
 mod generate;
+mod loader;
 mod logger;
+mod parse;
+mod test;
 
+use self::loader::Loader;
 use clap::{App, Arg, SubCommand};
 use std::env;
+use std::path::Path;
 use std::process::exit;
 use std::usize;
 
@@ -44,15 +50,13 @@ fn run() -> error::Result<()> {
                 .about("Parse a file")
                 .arg(Arg::with_name("path").index(1)),
         )
-        .subcommand(
-            SubCommand::with_name("test")
-                .about("Run a parser's tests")
-                .arg(Arg::with_name("path").index(1).required(true))
-                .arg(Arg::with_name("line").index(2).required(true))
-                .arg(Arg::with_name("column").index(3).required(true)),
-        )
+        .subcommand(SubCommand::with_name("test").about("Run a parser's tests"))
         .get_matches();
 
+    let home_dir = dirs::home_dir().unwrap();
+    let current_dir = env::current_dir().unwrap();
+    let mut loader = Loader::new(home_dir.join(".tree-sitter"));
+
     if let Some(matches) = matches.subcommand_matches("generate") {
         if matches.is_present("log") {
             logger::init();
@@ -65,11 +69,23 @@ fn run() -> error::Result<()> {
                 ids.filter_map(|id| usize::from_str_radix(id, 10).ok())
                     .collect()
             });
-        let mut grammar_path = env::current_dir().expect("Failed to read CWD");
-        grammar_path.push("grammar.js");
+        let grammar_path = current_dir.join("grammar.js");
         let code =
             generate::generate_parser_for_grammar(&grammar_path, minimize, state_ids_to_log)?;
         println!("{}", code);
+        return Ok(());
+    }
+
+    if let Some(_matches) = matches.subcommand_matches("test") {
+        // The current directory is taken to be the grammar repo, with its tests in the
+        // `corpus` subdirectory. `loader` is the one created at the top of `run`, so the
+        // compiled parser library is placed under `~/.tree-sitter`.
+        let corpus_path = current_dir.join("corpus");
+        if let Some((language, _)) = loader.language_configuration_at_path(&current_dir)? {
+            test::run_tests_at_path(language, &corpus_path)?;
+        } else {
+            eprintln!("No language found");
+        }
     }
 
     Ok(())
diff --git a/cli/src/test.rs b/cli/src/test.rs
new file mode 100644
index 00000000..7ef63bb7
--- /dev/null
+++ b/cli/src/test.rs
@@ -0,0 +1,212 @@
+use super::error::Result;
+use ansi_term::Colour;
+use regex::bytes::{Regex as ByteRegex, RegexBuilder as ByteRegexBuilder};
+use regex::Regex;
+use std::char;
+use std::fs;
+use std::io;
+use std::path::Path;
+use std::str;
+use tree_sitter::{Language, Parser};
+
+lazy_static! {
+    static ref HEADER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^===+\r?\n([^=]*)\r?\n===+\r?\n")
+        .multi_line(true)
+        .build()
+        .unwrap();
+    static ref DIVIDER_REGEX: ByteRegex = ByteRegexBuilder::new(r"^---+\r?\n")
+        .multi_line(true)
+        .build()
+        .unwrap();
+    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum TestEntry {
+    Group {
+        name: String,
+        children: Vec<TestEntry>,
+    },
+    Example {
+        name: String,
+        input: Vec<u8>,
+        output: String,
+    },
+}
+
+/// Parses the test corpus at `path`, runs every example through a parser for `language`,
+/// and prints a per-test result line followed by a summary of any failures.
+///
+/// A failing example is only reported on stdout: the function still returns `Ok(())`.
+pub fn run_tests_at_path(language: Language, path: &Path) -> Result<()> {
+    let corpus = parse_tests(path)?;
+    let mut parser = Parser::new();
+    parser.set_language(language)?;
+    let mut failures = Vec::new();
+    let top_level = match corpus {
+        TestEntry::Group { children, .. } => children,
+        TestEntry::Example { .. } => Vec::new(),
+    };
+    for entry in top_level {
+        run_tests(&mut parser, entry, 0, &mut failures)?;
+    }
+
+    if failures.is_empty() {
+        return Ok(());
+    }
+    println!("");
+    match failures.len() {
+        1 => println!("1 failure:"),
+        count => println!("{} failures:", count),
+    }
+    for (name, actual, expected) in failures {
+        println!("\n  {}:\n    Expected: {}\n    Actual: {}", name, expected, actual);
+    }
+    Ok(())
+}
+
+/// Runs one corpus entry. An `Example` is parsed and its S-expression compared with the
+/// expected output: a ✓ or ✗ line is printed and a mismatch is appended to `failures` as
+/// `(name, actual, expected)`. A `Group` prints its name and recurses into its children
+/// one indent level deeper. Each printed line is indented two spaces per `indent_level`.
+fn run_tests(
+    parser: &mut Parser,
+    test_entry: TestEntry,
+    mut indent_level: i32,
+    failures: &mut Vec<(String, String, String)>,
+) -> Result<()> {
+    (0..indent_level).for_each(|_| print!("  "));
+    let (name, children) = match test_entry {
+        TestEntry::Group { name, children } => (name, children),
+        TestEntry::Example { name, input, output } => {
+            // The whole input is already in memory, so the read callback simply hands
+            // back everything from the requested byte offset onwards.
+            let tree = parser
+                .parse_utf8(&mut |byte_offset, _| &input[byte_offset..], None)
+                .unwrap();
+            let actual = tree.root_node().to_sexp();
+            if actual != output {
+                println!("✗ {}", Colour::Red.paint(&name));
+                failures.push((name, actual, output));
+            } else {
+                println!("✓ {}", Colour::Green.paint(&name));
+            }
+            return Ok(());
+        }
+    };
+
+    println!("{}:", name);
+    indent_level += 1;
+    children
+        .into_iter()
+        .try_for_each(|child| run_tests(parser, child, indent_level, failures))
+}
+
+/// Recursively loads the tests under `path`: a directory becomes a `Group` of its entries,
+/// any other path is read as a UTF-8 corpus file. Entries are visited in sorted order,
+/// because `fs::read_dir` makes no ordering guarantee and test output should be stable.
+pub fn parse_tests(path: &Path) -> io::Result<TestEntry> {
+    let name = path.file_name().and_then(|s| s.to_str()).unwrap_or("").to_string();
+    if !path.is_dir() {
+        return fs::read_to_string(path).map(|content| parse_test_content(name, content));
+    }
+    let mut child_paths = fs::read_dir(path)?
+        .map(|entry| entry.map(|entry| entry.path()))
+        .collect::<io::Result<Vec<_>>>()?;
+    child_paths.sort();
+    let children = child_paths
+        .iter()
+        .map(|child_path| parse_tests(child_path))
+        .collect::<io::Result<Vec<_>>>()?;
+    Ok(TestEntry::Group { name, children })
+}
+
+/// Splits the text of one corpus file into a `Group` (named `name`) of `Example`s.
+///
+/// Each example consists of a header (the test name between two lines of three or more
+/// `=`), the input text, a divider line of three or more `-`, and the expected output.
+/// The expected output is trimmed and its whitespace runs are collapsed to single
+/// spaces. A section that has no divider, or whose expected output is not valid UTF-8,
+/// produces no example.
+fn parse_test_content(name: String, content: String) -> TestEntry {
+    let bytes = content.as_bytes();
+    let mut examples = Vec::new();
+    let mut pending_name = String::new();
+    let mut body_start = 0;
+    // A zero-width sentinel "header" at end-of-input flushes the final example.
+    let end_of_input = Some((bytes.len(), bytes.len()));
+    let headers = HEADER_REGEX.find_iter(&bytes).map(|m| (m.start(), m.end()));
+    for (header_start, header_end) in headers.chain(end_of_input) {
+        // `body_start` is still 0 until the first header has been seen.
+        if body_start > 0 {
+            let body = &bytes[body_start..header_start];
+            if let Some(divider) = DIVIDER_REGEX.find(body) {
+                let (input, expected) = (&body[..divider.start()], &body[divider.end()..]);
+                if let Ok(expected) = str::from_utf8(expected) {
+                    examples.push(TestEntry::Example {
+                        name: pending_name,
+                        input: input.to_vec(),
+                        output: WHITESPACE_REGEX.replace_all(expected.trim(), " ").to_string(),
+                    });
+                }
+            }
+        }
+        pending_name = String::from_utf8_lossy(&bytes[header_start..header_end])
+            .trim_matches(|c| char::is_whitespace(c) || c == '=')
+            .to_string();
+        body_start = header_end;
+    }
+    TestEntry::Group { name, children: examples }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_test_content() {
+        let entry = parse_test_content(
+            "the-filename".to_string(),
+            r#"
+===============
+The first test
+===============
+
+a b c
+
+---
+
+(a
+    (b c))
+
+================
+The second test
+================
+d
+---
+(d)
+        "#
+            .trim()
+            .to_string(),
+        );
+
+        assert_eq!(
+            entry,
+            TestEntry::Group {
+                name: "the-filename".to_string(),
+                children: vec![
+                    TestEntry::Example {
+                        name: "The first test".to_string(),
+                        input: "\na b c\n\n".as_bytes().to_vec(),
+                        output: "(a (b c))".to_string(),
+                    },
+                    TestEntry::Example {
+                        name: "The second test".to_string(),
+                        input: "d\n".as_bytes().to_vec(),
+                        output: "(d)".to_string(),
+                    },
+                ]
+            }
+        );
+    }
+}