From 798ef5e4dc32d0811d0da5e04abfbf8b650009b4 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Fri, 14 Jul 2017 10:42:01 -0700 Subject: [PATCH] Add libFuzzer support This adds support for fuzzing tree-sitter grammars with libFuzzer. This currently only works on Linux because of linking issues on macOS. Briefly, the AddressSanitizer library is dynamically linked into the fuzzer binary and cannot be found at runtime if built with a compiler that wasn't provided by Xcode(?). The runtime library is statically linked on Linux so this isn't a problem. --- .gitignore | 2 ++ project.gyp | 4 +++ script/build-fuzzers | 60 +++++++++++++++++++++++++++++++++++++++++++ script/reproduce | 15 +++++++++++ script/run-fuzzer | 23 +++++++++++++++++ test/fuzz/README.md | 43 +++++++++++++++++++++++++++++++ test/fuzz/fuzzer.cc | 27 +++++++++++++++++++ test/fuzz/gen-dict.py | 31 ++++++++++++++++++++++ 8 files changed, 205 insertions(+) create mode 100755 script/build-fuzzers create mode 100755 script/reproduce create mode 100755 script/run-fuzzer create mode 100644 test/fuzz/README.md create mode 100644 test/fuzz/fuzzer.cc create mode 100644 test/fuzz/gen-dict.py diff --git a/.gitignore b/.gitignore index 72e30dbe..4087ed6a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ # Compiled binaries out +fuzz-results + # Generated build config files gyp-mac-tool Makefile diff --git a/project.gyp b/project.gyp index 4e1ae182..5647c398 100644 --- a/project.gyp +++ b/project.gyp @@ -135,6 +135,10 @@ 'OTHER_CPLUSPLUSFLAGS': ['-fsanitize=address'], }, }, + 'Fuzz': { + 'cflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ], + 'ldflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ], + }, 'Release': { 'cflags': [ '-O2', '-fno-strict-aliasing' ], 'cflags!': [ '-O3', '-fstrict-aliasing' ], diff --git a/script/build-fuzzers b/script/build-fuzzers new file mode 100755 index 00000000..8e8de5a6 --- /dev/null +++ 
b/script/build-fuzzers @@ -0,0 +1,60 @@ +#!/bin/bash +set -e + +if [[ "$(uname -s)" != Linux ]]; then + echo "Fuzzing is only supported on Linux" + exit 1 +fi + +if [[ -z "$LIB_FUZZER_PATH" ]]; then + echo "LIB_FUZZER_PATH not set" + exit 1 +fi + +CC=${CC:-clang} +CXX=${CXX:-clang++} +LINK=${LINK:-clang++} + +CC=$CC CXX=$CXX LINK=$LINK ./script/configure + +export BUILDTYPE=Fuzz +make runtime + +CFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard" +CXXFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard" + +if [ "$#" -eq 0 ]; then + languages=$(ls test/fixtures/grammars) +else + languages="$*" +fi + +for lang in ${languages[@]}; do + echo "Building $lang fuzzer..." + lang_dir="test/fixtures/grammars/$lang" + + # The following assumes each language is implemented as src/parser.c plus an + # optional scanner in src/scanner.{c,cc} + objects=() + + lang_scanner="${lang_dir}/src/scanner" + if [ -e "${lang_scanner}.cc" ]; then + $CXX $CXXFLAGS -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.cc" -o "${lang_scanner}.o" + objects+=("${lang_scanner}.o") + elif [ -e "${lang_scanner}.c" ]; then + $CC $CFLAGS -std=c99 -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.c" -o "${lang_scanner}.o" + objects+=("${lang_scanner}.o") + fi + + + # Compiling with -O0 speeds up the build dramatically + $CC $CFLAGS -g -O0 "-I${lang_dir}/src" "${lang_dir}/src/parser.c" -c -o "${lang_dir}/src/parser.o" + objects+=("${lang_dir}/src/parser.o") + + $CXX $CXXFLAGS -std=c++11 -Iinclude -D TSLANG="tree_sitter_$lang" \ + "test/fuzz/fuzzer.cc" "${objects[@]}" \ + out/Fuzz/obj.target/libruntime.a "$LIB_FUZZER_PATH" \ + -o "out/${lang}_fuzzer" + + python test/fuzz/gen-dict.py "${lang_dir}/src/grammar.json" > "out/$lang.dict" +done diff --git a/script/reproduce b/script/reproduce new file mode 100755 index 00000000..ee7b3a23 --- /dev/null +++ b/script/reproduce @@ -0,0 +1,15 @@ +#!/bin/bash + +set -ex + +if [ "$#" -lt 2 ]; then + echo "usage: $0 <language> <testcase>" + exit 1 +fi + +lang="$1" 
+testcase="$2" + +out="out" +ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \ + "${out}/${lang}_fuzzer" "$testcase" -timeout=1 -runs=100 diff --git a/script/run-fuzzer b/script/run-fuzzer new file mode 100755 index 00000000..4cdbaa26 --- /dev/null +++ b/script/run-fuzzer @@ -0,0 +1,23 @@ +#!/bin/bash + +set -ex + +if [ "$#" -lt 1 ]; then + echo "usage: $0 <language> [libFuzzer-args...]" + exit 1 +fi + +lang="$1" +shift # Treat remainder of arguments as libFuzzer arguments + +# Fuzzing logs and testcases are always written to `pwd`, so `cd` there first +mkdir -p "fuzz-results/${lang}" +cd "fuzz-results/${lang}" + +# Create a corpus directory, so new discoveries are stored on disk. These will +# then be loaded on subsequent fuzzing runs +mkdir -p corpus + +out="../../out" +ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \ + "${out}/${lang}_fuzzer" "-dict=${out}/${lang}.dict" "-artifact_prefix=${lang}_" -max_len=128 -timeout=1 "./corpus" "$@" diff --git a/test/fuzz/README.md b/test/fuzz/README.md new file mode 100644 index 00000000..ba19b10b --- /dev/null +++ b/test/fuzz/README.md @@ -0,0 +1,43 @@ +# Fuzzing tree-sitter + +The tree-sitter fuzzing support requires 1) the `libFuzzer` runtime library and 2) a recent version of clang + +## libFuzzer + +The main fuzzing logic is implemented by `libFuzzer` which is part of the LLVM project but is not shipped by distros. It will need to be built from source but does not require building the _whole_ LLVM project. LLVM can be downloaded from llvm.org using SVN or [llvm-mirror](https://github.com/llvm-mirror/llvm) using git. `libFuzzer` can be built as, e.g.: + +``` +cd ~/src +git clone https://github.com/llvm-mirror/llvm +cd llvm/lib/Fuzzer +./build.sh +``` + +## clang + +Using libFuzzer requires a reasonably new version of `clang` and will probably _not_ work with your system-installed version. 
The easiest way to get started is to use the version provided by the Chromium team. Instructions are available at [libFuzzer.info](http://libfuzzer.info). + +The fuzzers can then be built with: +``` +export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin +CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \ + LIB_FUZZER_PATH=$HOME/src/llvm/lib/Fuzzer/libFuzzer.a \ + ./script/build-fuzzers +``` + +This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars`, each instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`. + +The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments: +``` +./script/run-fuzzer <language> +``` + +which will log information to stdout. Failing testcases and a fuzz corpus will be saved to `fuzz-results/<language>/`. The most important extra `libFuzzer` options are `-jobs` and `-workers` which allow parallel fuzzing. 
This can be done with, e.g.: +``` +./script/run-fuzzer <language> -jobs=32 -workers=32 +``` + +The testcase can be used to reproduce the crash by running: +``` +./script/reproduce <language> <testcase> +``` diff --git a/test/fuzz/fuzzer.cc b/test/fuzz/fuzzer.cc new file mode 100644 index 00000000..c1b5095e --- /dev/null +++ b/test/fuzz/fuzzer.cc @@ -0,0 +1,27 @@ +#include <stdint.h> +#include "tree_sitter/runtime.h" + +void test_log(void *payload, TSLogType type, const char *string) { } + +TSLogger logger = { + .log = test_log, +}; + +extern "C" const TSLanguage *TSLANG(); + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + const char *str = reinterpret_cast<const char *>(data); + + TSDocument *document = ts_document_new(); + ts_document_set_language(document, TSLANG()); + ts_document_set_input_string_with_length(document, str, size); + + TSParseOptions options = {}; + options.halt_on_error = false; + ts_document_parse_with_options(document, options); + + TSNode root_node = ts_document_root_node(document); + ts_document_free(document); + + return 0; +} diff --git a/test/fuzz/gen-dict.py b/test/fuzz/gen-dict.py new file mode 100644 index 00000000..a9e07838 --- /dev/null +++ b/test/fuzz/gen-dict.py @@ -0,0 +1,31 @@ +import json +import sys + +def find_literals(literals, node): + '''Recursively collect the values of STRING nodes in the grammar definition into the `literals` set''' + + if isinstance(node, dict): + if 'type' in node and node['type'] == 'STRING' and 'value' in node: + literals.add(node['value']) + + for value in node.values(): + find_literals(literals, value) + + elif isinstance(node, list): + for item in node: + find_literals(literals, item) + +def main(): + '''Print a libFuzzer / AFL dictionary for the tree-sitter grammar.json named by sys.argv[1], escaping every non-alphanumeric UTF-8 byte as \\xNN''' + with open(sys.argv[1]) as f: + grammar = json.load(f) + + literals = set() + find_literals(literals, grammar) + + for lit in sorted(literals): + if lit: + print('"%s"' % ''.join([(chr(b) if b < 0x80 and chr(b).isalnum() else '\\x%02x' % b) for b in bytearray(lit.encode('utf-8'))])) + +if __name__ == '__main__': + main()