Merge pull request #91 from tree-sitter/libFuzzer

Add support for fuzzing with libFuzzer
2017-07-17 11:43:01 -07:00 · 2017-07-17 11:43:01 -07:00 · 34279257f9
commit 34279257f9
parent 66dc12587a 153c2033df
9 changed files with 208 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,8 @@
 # Compiled binaries
 out

+fuzz-results
+
 # Generated build config files 
 gyp-mac-tool
 Makefile
--- a/project.gyp
+++ b/project.gyp
@ -136,6 +136,10 @@
          'OTHER_CPLUSPLUSFLAGS': ['-fsanitize=address'],
        },
      },
+      'Fuzz': {
+        'cflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ],
+        'ldflags': [ '-g', '-fsanitize=address,undefined', '-fsanitize-coverage=trace-pc-guard' ],
+      },
      'Release': {
        'cflags': [ '-O2', '-fno-strict-aliasing' ],
        'cflags!': [ '-O3', '-fstrict-aliasing' ],
--- a/script/build-fuzzers
+++ b/script/build-fuzzers
@ -0,0 +1,60 @@
+#!/bin/bash
+# Usage: LIB_FUZZER_PATH=/path/to/libFuzzer.a [CC=... CXX=... LINK=...] script/build-fuzzers [language...]
+set -e
+
+if [[ "$(uname -s)" != Linux ]]; then
+  echo "Fuzzing is only supported on Linux"
+  exit 1
+fi
+
+if [[ -z "$LIB_FUZZER_PATH" ]]; then
+  echo "LIB_FUZZER_PATH not set"
+  exit 1
+fi
+
+CC=${CC:-clang}
+CXX=${CXX:-clang++}
+LINK=${LINK:-clang++}
+
+CC=$CC CXX=$CXX LINK=$LINK ./script/configure
+
+export BUILDTYPE=Fuzz
+make runtime
+
+CFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard"
+CXXFLAGS="-fsanitize=address,undefined -fsanitize-coverage=trace-pc-guard"
+
+if [ "$#" -eq 0 ]; then  # no languages named: build a fuzzer for every fixture grammar
+  languages=($(ls test/fixtures/grammars))
+else
+  languages=("$@")
+fi
+
+for lang in "${languages[@]}"; do
+  echo "Building $lang fuzzer..."
+  lang_dir="test/fixtures/grammars/$lang"
+
+  # The following assumes each language is implemented as src/parser.c plus an
+  # optional scanner in src/scanner.{c,cc}
+  objects=()
+
+  lang_scanner="${lang_dir}/src/scanner"
+  if [ -e "${lang_scanner}.cc" ]; then
+    $CXX $CXXFLAGS -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.cc" -o "${lang_scanner}.o"
+    objects+=("${lang_scanner}.o")
+  elif [ -e "${lang_scanner}.c" ]; then
+    $CC $CFLAGS -std=c99 -g -O1 "-I${lang_dir}/src" -c "${lang_scanner}.c" -o "${lang_scanner}.o"
+    objects+=("${lang_scanner}.o")
+  fi
+
+  # Compiling with -O0 speeds up the build dramatically
+  $CC $CFLAGS -g -O0 "-I${lang_dir}/src" "${lang_dir}/src/parser.c" -c -o "${lang_dir}/src/parser.o"
+  objects+=("${lang_dir}/src/parser.o")
+
+  $CXX $CXXFLAGS -std=c++11 -Iinclude -D TSLANG="tree_sitter_$lang" \
+    "test/fuzz/fuzzer.cc" "${objects[@]}" \
+    out/Fuzz/obj.target/libruntime.a "$LIB_FUZZER_PATH" \
+    -o "out/${lang}_fuzzer"
+
+  python test/fuzz/gen-dict.py "${lang_dir}/src/grammar.json" > "out/$lang.dict"
+done
--- a/script/fetch-fixtures
+++ b/script/fetch-fixtures
@ -26,3 +26,6 @@ fetch_grammar 'json'       'origin/master'
 fetch_grammar 'c'          'origin/master'
 fetch_grammar 'cpp'        'origin/master'
 fetch_grammar 'python'     'origin/master'
+fetch_grammar 'go'         'origin/master'
+fetch_grammar 'ruby'       'origin/master'
+fetch_grammar 'typescript' 'origin/master'
--- a/script/reproduce
+++ b/script/reproduce
@ -0,0 +1,15 @@
+#!/bin/bash
+# Re-run an already built <language> fuzzer on one saved testcase to reproduce a finding
+set -ex
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $0 <language> <testcase>"
+  exit 1
+fi
+
+lang="$1"
+testcase="$2"
+# The sanitizer runtimes read their flags from ASAN_OPTIONS and UBSAN_OPTIONS
+out="out"
+ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \
+  "${out}/${lang}_fuzzer" "$testcase" -timeout=1 -runs=100
--- a/script/run-fuzzer
+++ b/script/run-fuzzer
@ -0,0 +1,23 @@
+#!/bin/bash
+# Run the built <language> fuzzer from fuzz-results/<language>, forwarding extra libFuzzer arguments
+set -ex
+
+if [ "$#" -lt 1 ]; then
+  echo "usage: $0 <language> <libFuzzer args...>"
+  exit 1
+fi
+
+lang="$1"
+shift # Treat remainder of arguments as libFuzzer arguments
+
+# Fuzzing logs and testcases are always written to `pwd`, so `cd` there first
+mkdir -p "fuzz-results/${lang}"
+cd "fuzz-results/${lang}"
+
+# Create a corpus directory, so new discoveries are stored on disk. These will
+# then be loaded on subsequent fuzzing runs
+mkdir -p corpus
+# The sanitizer runtimes read their flags from ASAN_OPTIONS and UBSAN_OPTIONS
+out="../../out"
+ASAN_OPTIONS="quarantine_size_mb=10:detect_leaks=1" UBSAN_OPTIONS="print_stacktrace=1:halt_on_error=1" \
+  "${out}/${lang}_fuzzer" "-dict=${out}/${lang}.dict" "-artifact_prefix=${lang}_" -max_len=128 -timeout=1 "./corpus" "$@"
--- a/test/fuzz/README.md
+++ b/test/fuzz/README.md
@ -0,0 +1,43 @@
+# Fuzzing tree-sitter
+
+The tree-sitter fuzzing support requires 1) the `libFuzzer` runtime library and 2) a recent version of clang
+
+## libFuzzer
+
+The main fuzzing logic is implemented by `libFuzzer` which is part of the LLVM project but is not shipped by distros. It will need to be built from source but does not require building the _whole_ LLVM project. LLVM can be downloaded from llvm.org using SVN or [llvm-mirror](https://github.com/llvm-mirror/llvm) using git. `libFuzzer` can be built as, e.g.:
+
+```
+cd ~/src
+git clone https://github.com/llvm-mirror/llvm
+cd llvm/lib/Fuzzer
+./build.sh
+```
+
+## clang
+
+Using libFuzzer requires a reasonably new version of `clang` and will probably _not_ work with your system-installed version. The easiest way to get started is to use the version provided by the Chromium team. Instructions are available at [libFuzzer.info](http://libfuzzer.info).
+
+The fuzzers can then be built with:
+```
+export CLANG_DIR=$HOME/src/third_party/llvm-build/Release+Asserts/bin
+CC="$CLANG_DIR/clang" CXX="$CLANG_DIR/clang++" LINK="$CLANG_DIR/clang++" \
+  LIB_FUZZER_PATH=$HOME/src/llvm/lib/Fuzzer/libFuzzer.a \
+  ./script/build-fuzzers
+```
+
+This will generate a separate fuzzer for each grammar defined in `test/fixtures/grammars`, each instrumented with [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) and [UndefinedBehaviorSanitizer](https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html). Individual fuzzers can be built with, for example, `./script/build-fuzzers python ruby`.
+
+The `run-fuzzer` script handles running an individual fuzzer with a sensible default set of arguments:
+```
+./script/run-fuzzer <grammar-name> <extra libFuzzer arguments...>
+```
+
+which will log information to stdout. Failing testcases and a fuzz corpus will be saved to `fuzz-results/<grammar-name>`. The most important extra `libFuzzer` options are `-jobs` and `-workers`, which allow parallel fuzzing. This can be done with, e.g.:
+```
+./script/run-fuzzer <grammar-name> -jobs=32 -workers=32
+```
+
+The testcase can be used to reproduce the crash by running:
+```
+./script/reproduce <grammar-name> <path-to-testcase>
+```
--- a/test/fuzz/fuzzer.cc
+++ b/test/fuzz/fuzzer.cc
@ -0,0 +1,27 @@
+#include <string.h>
+#include "tree_sitter/runtime.h"
+
+void test_log(void *payload, TSLogType type, const char *string) { }  // intentionally empty: every log message is discarded
+
+TSLogger logger = {
+  .log = test_log,  // NOTE(review): `logger` is not referenced anywhere else in this file -- confirm whether it should be installed on the document
+};
+
+extern "C" const TSLanguage *TSLANG();  // TSLANG is a macro; script/build-fuzzers defines it as tree_sitter_<lang> via -D
+// Fuzz entry point: parses the `size` bytes at `data` as one document with the TSLANG grammar; always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  const char *str = reinterpret_cast<const char *>(data);
+
+  TSDocument *document = ts_document_new();
+  ts_document_set_language(document, TSLANG());
+  ts_document_set_input_string_with_length(document, str, size);  // the length is passed explicitly alongside the pointer
+
+  TSParseOptions options = {};
+  options.halt_on_error = false;
+  ts_document_parse_with_options(document, options);
+
+  TSNode root_node = ts_document_root_node(document);  // value is unused; only the call itself is exercised
+  ts_document_free(document);
+
+  return 0;
+}
--- a/test/fuzz/gen-dict.py
+++ b/test/fuzz/gen-dict.py
@ -0,0 +1,31 @@
+import json
+import sys
+
+def find_literals(literals, node):
+  '''Recursively find STRING literals in the grammar definition'''
+
+  if type(node) is dict:
+    if 'type' in node and node['type'] == 'STRING' and 'value' in node:
+      literals.add(node['value'])
+
+    for key, value in node.iteritems():
+      find_literals(literals, value)
+
+  elif type(node) is list:
+    for item in node:
+      find_literals(literals, item)
+
+def main():
+  '''Generate a libFuzzer / AFL dictionary from a tree-sitter grammar.json'''
+  with open(sys.argv[1]) as f:
+    grammar = json.load(f)
+
+  literals = set()
+  find_literals(literals, grammar)
+
+  for lit in sorted(literals):
+    if lit:
+      print '"%s"' % ''.join([(c if c.isalnum() else '\\x%02x' % ord(c)) for c in lit])
+
+if __name__ == '__main__':
+  main()