diff --git a/.gitignore b/.gitignore index 72e30dbe..4087ed6a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ # Compiled binaries out +fuzz-results + # Generated build config files gyp-mac-tool Makefile diff --git a/project.gyp b/project.gyp index 4e1ae182..5647c398 100644 --- a/project.gyp +++ b/project.gyp @@ -135,6 +135,10 @@ 'OTHER_CPLUSPLUSFLAGS': ['-fsanitize=address'], }, }, + 'Fuzz': { + 'cflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ], + 'ldflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ], + }, 'Release': { 'cflags': [ '-O2', '-fno-strict-aliasing' ], 'cflags!': [ '-O3', '-fstrict-aliasing' ], diff --git a/script/build-fuzzers b/script/build-fuzzers new file mode 100755 index 00000000..8e8de5a6 --- /dev/null +++ b/script/build-fuzzers @@ -0,0 +1,60 @@ +#!/bin/bash +set -e + +if [[ "$(uname -s)" != Linux ]]; then + echo "Fuzzing is only supported on Linux" + exit 1 +fi + +if [[ -z "$LIB_FUZZER_PATH" ]]; then + echo "LIB_FUZZER_PATH not set" + exit 1 +fi + +CC=${CC:-clang} +CXX=${CXX:-clang++} +LINK=${LINK:-clang++} + +CC=$CC CXX=$CXX LINK=$LINK ./script/configure + +export BUILDTYPE=Fuzz +make runtime + +CFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard" +CXXFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard" + +if [ "$#" -eq 0 ]; then + languages=$(ls test/fixtures/grammars) +else + languages="$*" +fi + +for lang in $languages; do + echo "Building $lang fuzzer..."
+ lang_dir="test/fixtures/grammars/$lang" + + # The following assumes each language is implemented as src/parser.c plus an + # optional scanner in src/scanner.{c,cc} + objects=() + + lang_scanner="${lang_dir}/src/scanner" + if [ -e "${lang_scanner}.cc" ]; then + $CXX $CXXFLAGS -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.cc" -o "${lang_scanner}.o" + objects+=("${lang_scanner}.o") + elif [ -e "${lang_scanner}.c" ]; then + $CC $CFLAGS -std=c99 -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.c" -o "${lang_scanner}.o" + objects+=("${lang_scanner}.o") + fi + + + # Compiling with -O0 speeds up the build dramatically + $CC $CFLAGS -g -O0 "-I${lang_dir}/src" "${lang_dir}/src/parser.c" -c -o "${lang_dir}/src/parser.o" + objects+=("${lang_dir}/src/parser.o") + + $CXX $CXXFLAGS -std=c++11 -Iinclude -D TSLANG="tree_sitter_$lang" \ + "test/fuzz/fuzzer.cc" "${objects[@]}" \ + out/Fuzz/obj.target/libruntime.a "$LIB_FUZZER_PATH" \ + -o "out/${lang}_fuzzer" + + python test/fuzz/gen-dict.py "${lang_dir}/src/grammar.json" > "out/$lang.dict" +done diff --git a/script/reproduce b/script/reproduce new file mode 100755 index 00000000..ee7b3a23 --- /dev/null +++ b/script/reproduce @@ -0,0 +1,15 @@ +#!/bin/bash + +set -ex + +if [ "$#" -lt 2 ]; then + echo "usage: $0 <language> <testcase>" + exit 1 +fi + +lang="$1" +testcase="$2" + +out="out" +ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \ + "${out}/${lang}_fuzzer" "$testcase" -timeout=1 -runs=100 diff --git a/script/run-fuzzer b/script/run-fuzzer new file mode 100755 index 00000000..4cdbaa26 --- /dev/null +++ b/script/run-fuzzer @@ -0,0 +1,23 @@ +#!/bin/bash + +set -ex + +if [ "$#" -lt 1 ]; then + echo "usage: $0 <language> [libFuzzer args...]" + exit 1 +fi + +lang="$1" +shift # Treat remainder of arguments as libFuzzer arguments + +# Fuzzing logs and testcases are always written to `pwd`, so `cd` there first +mkdir -p "fuzz-results/${lang}" +cd "fuzz-results/${lang}" + +# Create a corpus directory, so new discoveries are stored on disk.
These will +# then be loaded on subsequent fuzzing runs +mkdir -p corpus + +out="../../out" +ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \ + "${out}/${lang}_fuzzer" "-dict=${out}/${lang}.dict" "-artifact_prefix=${lang}_" -max_len=128 -timeout=1 "./corpus" "$@" diff --git a/test/fuzz/README.md b/test/fuzz/README.md new file mode 100644 index 00000000..ba19b10b --- /dev/null +++ b/test/fuzz/README.md @@ -0,0 +1,43 @@ +# Fuzzing tree-sitter + +The tree-sitter fuzzing support requires 1) the `libFuzzer` runtime library and 2) a recent version of clang. + +## libFuzzer + +The main fuzzing logic is implemented by `libFuzzer` which is part of the LLVM project but is not shipped by distros. It will need to be built from source but does not require building the _whole_ LLVM project. LLVM can be downloaded from llvm.org using SVN or [llvm-mirror](https://github.com/llvm-mirror/llvm) using git. `libFuzzer` can be built as, e.g.: + +``` +cd ~/src +git clone https://github.com/llvm-mirror/llvm +cd llvm/lib/Fuzzer +./build.sh +``` + +## clang + +Using libFuzzer requires a reasonably new version of `clang` and will probably _not_ work with your system-installed version. The easiest way to get started is to use the version provided by the Chromium team. Instructions are available at [libFuzzer.info](http://libfuzzer.info). + +The fuzzers can then be built with: +``` +export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin +CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \ + LIB_FUZZER_PATH=$HOME/src/llvm/lib/Fuzzer/libFuzzer.a \ + ./script/build-fuzzers +``` + +This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars`, each instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html).
Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`. + +The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments: +``` +./script/run-fuzzer <language> +``` + +which will log information to stdout. Failing testcases and a fuzz corpus will be saved to `fuzz-results/<language>`. The most important extra `libFuzzer` options are `-jobs` and `-workers`, which allow parallel fuzzing. This can be done with, e.g.: +``` +./script/run-fuzzer <language> -jobs=32 -workers=32 +``` + +The testcase can be used to reproduce the crash by running: +``` +./script/reproduce <language> <testcase> +``` diff --git a/test/fuzz/fuzzer.cc b/test/fuzz/fuzzer.cc new file mode 100644 index 00000000..c1b5095e --- /dev/null +++ b/test/fuzz/fuzzer.cc @@ -0,0 +1,27 @@ +#include <stdint.h> +#include "tree_sitter/runtime.h" + +void test_log(void *payload, TSLogType type, const char *string) { } + +TSLogger logger = { + .log = test_log, +}; + +extern "C" const TSLanguage *TSLANG(); + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + const char *str = reinterpret_cast<const char *>(data); + + TSDocument *document = ts_document_new(); + ts_document_set_language(document, TSLANG()); + ts_document_set_input_string_with_length(document, str, size); + + TSParseOptions options = {}; + options.halt_on_error = false; + ts_document_parse_with_options(document, options); + + TSNode root_node = ts_document_root_node(document); + ts_document_free(document); + + return 0; +} diff --git a/test/fuzz/gen-dict.py b/test/fuzz/gen-dict.py new file mode 100644 index 00000000..a9e07838 --- /dev/null +++ b/test/fuzz/gen-dict.py @@ -0,0 +1,31 @@ +import json +import sys + +def find_literals(literals, node): + '''Recursively find STRING literals in the grammar definition''' + + if isinstance(node, dict): + if 'type' in node and node['type'] == 'STRING' and 'value' in node: + literals.add(node['value']) + + for value in node.values(): + find_literals(literals, value) + + elif isinstance(node, list):
+ for item in node: + find_literals(literals, item) + +def main(): + '''Generate a libFuzzer / AFL dictionary from a tree-sitter grammar.json''' + with open(sys.argv[1]) as f: + grammar = json.load(f) + + literals = set() + find_literals(literals, grammar) + + for lit in sorted(literals): + if lit: + print('"%s"' % ''.join([(chr(b) if b < 128 and chr(b).isalnum() else '\\x%02x' % b) for b in bytearray(lit.encode('utf-8'))])) + +if __name__ == '__main__': + main()