From 2cfd2007b4a73bb061506e7c521570e9a0ec3f96 Mon Sep 17 00:00:00 2001 From: David Lakin Date: Wed, 8 May 2024 03:20:18 -0400 Subject: [PATCH] Update OSS-Fuzz Scripts to Use New QA-Assets Repo Structure This change is required to support the changes to the seed data repo structure introduced in: https://github.com/gitpython-developers/qa-assets/pull/2 This moves most of the seed data related build steps into the OSS-Fuzz Docker image build via `container-environment-bootstrap.sh`. This includes moving the dictionaries into that repo. The fuzzing/README.md here should be updated in a follow-up with a link to the qa-assets repo (and probably some context setting about corpora in general) but I have opted to defer that as I think the functionality added by the seed data improvements is valuable as is and shouldn't be blocked by documentation writer's block. --- fuzzing/README.md | 19 ------ fuzzing/dictionaries/fuzz_blob.dict | 1 - fuzzing/dictionaries/fuzz_config.dict | 56 ---------------- fuzzing/oss-fuzz-scripts/build.sh | 27 +------- .../container-environment-bootstrap.sh | 64 +++++++++++++++---- 5 files changed, 53 insertions(+), 114 deletions(-) delete mode 100644 fuzzing/dictionaries/fuzz_blob.dict delete mode 100644 fuzzing/dictionaries/fuzz_config.dict diff --git a/fuzzing/README.md b/fuzzing/README.md index 9d02bf72f..286f529eb 100644 --- a/fuzzing/README.md +++ b/fuzzing/README.md @@ -76,25 +76,6 @@ Contains Python files for each fuzz test. reason, fuzz tests should gracefully handle anticipated exception cases with a `try`/`except` block to avoid false positives that halt the fuzzing engine. -### Dictionaries (`dictionaries/`) - -Provides hints to the fuzzing engine about inputs that might trigger unique code paths. Each fuzz target may have a -corresponding `.dict` file. For information about dictionary syntax, refer to -the [LibFuzzer documentation on the subject](https://llvm.org/docs/LibFuzzer.html#dictionaries). 
- -**Things to Know**: - -- OSS-Fuzz loads dictionary files per fuzz target if one exists with the same name, all others are ignored. -- Most entries in the dictionary files found here are escaped hex or Unicode values that were recommended by the fuzzing - engine after previous runs. -- A default set of dictionary entries are created for all fuzz targets as part of the build process, regardless of an - existing file here. -- Development or updates to dictionaries should reflect the varied formats and edge cases relevant to the - functionalities under test. -- Example dictionaries (some of which are used to build the default dictionaries mentioned above) can be found here: - - [AFL++ dictionary repository](https://github.com/AFLplusplus/AFLplusplus/tree/stable/dictionaries#readme) - - [Google/fuzzing dictionary repository](https://github.com/google/fuzzing/tree/master/dictionaries) - ### OSS-Fuzz Scripts (`oss-fuzz-scripts/`) Includes scripts for building and integrating fuzz targets with OSS-Fuzz: diff --git a/fuzzing/dictionaries/fuzz_blob.dict b/fuzzing/dictionaries/fuzz_blob.dict deleted file mode 100644 index 7f123f830..000000000 --- a/fuzzing/dictionaries/fuzz_blob.dict +++ /dev/null @@ -1 +0,0 @@ -"\\377\\377\\377\\377\\377\\377\\377\\377" diff --git a/fuzzing/dictionaries/fuzz_config.dict b/fuzzing/dictionaries/fuzz_config.dict deleted file mode 100644 index b545ddfc8..000000000 --- a/fuzzing/dictionaries/fuzz_config.dict +++ /dev/null @@ -1,56 +0,0 @@ -"\\004\\000\\000\\000\\000\\000\\000\\000" -"\\006\\000\\000\\000\\000\\000\\000\\000" -"_validate_value_" -"\\000\\000\\000\\000\\000\\000\\000\\000" -"rem" -"__eq__" -"\\001\\000\\000\\000" -"__abstrac" -"_mutating_methods_" -"items" -"\\0021\\"" -"\\001\\000" -"\\000\\000\\000\\000" -"DEFAULT" -"getfloat" -"\\004\\000\\000\\000\\000\\000\\000\\000" -"news" -"\\037\\000\\000\\000\\000\\000\\000\\000" -"\\001\\000\\000\\000\\000\\000\\000\\037" -"\\000\\000\\000\\000\\000\\000\\000\\014" -"list" 
-"\\376\\377\\377\\377\\377\\377\\377\\377" -"items_all" -"\\004\\000\\000\\000\\000\\000\\000\\000" -"\\377\\377\\377\\377\\377\\377\\377\\014" -"\\001\\000\\000\\000" -"_acqui" -"\\000\\000\\000\\000\\000\\000\\000\\000" -"__ne__" -"__exit__" -"__modu" -"uucp" -"__str__" -"\\001\\000\\000\\000" -"\\017\\000\\000\\000\\000\\000\\000\\000" -"_has_incl" -"update" -"\\377\\377\\377\\377\\377\\377\\377\\023" -"setdef" -"setdefaul" -"\\000\\000\\000\\000" -"\\001\\000\\000\\000" -"\\001\\000" -"\\022\\000\\000\\000\\000\\000\\000\\000" -"_value_to_string" -"__abstr" -"\\001\\000\\000\\000\\000\\000\\000\\000" -"\\000\\000\\000\\000\\000\\000\\000\\022" -"\\377\\377\\377\\377" -"\\004\\000\\000\\000\\000\\000\\000\\000" -"\\000\\000\\000\\000\\000\\000\\000\\000" -"\\000\\000\\000\\000\\000\\000\\000\\037" -"\\001\\000\\000\\000\\000\\000\\000\\013" -"_OPT_TM" -"__name__" -"_get_conv" diff --git a/fuzzing/oss-fuzz-scripts/build.sh b/fuzzing/oss-fuzz-scripts/build.sh index 58c9adb5a..e0b3a50ab 100644 --- a/fuzzing/oss-fuzz-scripts/build.sh +++ b/fuzzing/oss-fuzz-scripts/build.sh @@ -7,34 +7,13 @@ set -euo pipefail python3 -m pip install . -# Directory to look in for dictionaries, options files, and seed corpora: -SEED_DATA_DIR="$SRC/seed_data" - -find "$SEED_DATA_DIR" \( -name '*_seed_corpus.zip' -o -name '*.options' -o -name '*.dict' \) \ - ! \( -name '__base.*' \) -exec printf 'Copying: %s\n' {} \; \ +find "$SRC" -maxdepth 1 \ + \( -name '*_seed_corpus.zip' -o -name '*.options' -o -name '*.dict' \) \ + -exec printf '[%s] Copying: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" {} \; \ -exec chmod a-x {} \; \ -exec cp {} "$OUT" \; # Build fuzzers in $OUT. find "$SRC/gitpython/fuzzing" -name 'fuzz_*.py' -print0 | while IFS= read -r -d '' fuzz_harness; do compile_python_fuzzer "$fuzz_harness" --add-binary="$(command -v git):." 
- - common_base_dictionary_filename="$SEED_DATA_DIR/__base.dict" - if [[ -r "$common_base_dictionary_filename" ]]; then - # Strip the `.py` extension from the filename and replace it with `.dict`. - fuzz_harness_dictionary_filename="$(basename "$fuzz_harness" .py).dict" - output_file="$OUT/$fuzz_harness_dictionary_filename" - - printf 'Appending %s to %s\n' "$common_base_dictionary_filename" "$output_file" - if [[ -s "$output_file" ]]; then - # If a dictionary file for this fuzzer already exists and is not empty, - # we append a new line to the end of it before appending any new entries. - # - # LibFuzzer will happily ignore multiple empty lines in a dictionary but fail with an error - # if any single line has incorrect syntax (e.g., if we accidentally add two entries to the same line.) - # See docs for valid syntax: https://llvm.org/docs/LibFuzzer.html#id32 - echo >>"$output_file" - fi - cat "$common_base_dictionary_filename" >>"$output_file" - fi done diff --git a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh index 76ec97c7f..bbdcf5357 100755 --- a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh +++ b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh @@ -9,23 +9,20 @@ set -euo pipefail # Prerequisites # ################# -for cmd in python3 git wget rsync; do +for cmd in python3 git wget zip; do command -v "$cmd" >/dev/null 2>&1 || { printf '[%s] Required command %s not found, exiting.\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$cmd" >&2 exit 1 } done -SEED_DATA_DIR="$SRC/seed_data" -mkdir -p "$SEED_DATA_DIR" - ############# # Functions # ############# download_and_concatenate_common_dictionaries() { # Assign the first argument as the target file where all contents will be concatenated - target_file="$1" + local target_file="$1" # Shift the arguments so the first argument (target_file path) is removed # and only URLs are left for the loop below. 
@@ -38,22 +35,61 @@ download_and_concatenate_common_dictionaries() { done } -fetch_seed_corpora() { - # Seed corpus zip files are hosted in a separate repository to avoid additional bloat in this repo. - git clone --depth 1 https://github.com/gitpython-developers/qa-assets.git qa-assets && - rsync -avc qa-assets/gitpython/corpra/ "$SEED_DATA_DIR/" && - rm -rf qa-assets # Clean up the cloned repo to keep the Docker image as slim as possible. +create_seed_corpora_zips() { + local seed_corpora_dir="$1" + local output_zip + for dir in "$seed_corpora_dir"/*; do + if [ -d "$dir" ] && [ -n "$dir" ]; then + output_zip="$SRC/$(basename "$dir")_seed_corpus.zip" + printf '[%s] Zipping the contents of %s into %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$dir" "$output_zip" + zip -jur "$output_zip" "$dir"/* + fi + done +} + +prepare_dictionaries_for_fuzz_targets() { + local dictionaries_dir="$1" + local fuzz_targets_dir="$2" + local common_base_dictionary_filename="$WORK/__base.dict" + + printf '[%s] Copying .dict files from %s to %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$dictionaries_dir" "$SRC/" + cp -v "$dictionaries_dir"/*.dict "$SRC/" + + download_and_concatenate_common_dictionaries "$common_base_dictionary_filename" \ + "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/utf8.dict" \ + "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/url.dict" + + find "$fuzz_targets_dir" -name 'fuzz_*.py' -print0 | while IFS= read -r -d '' fuzz_harness; do + if [[ -r "$common_base_dictionary_filename" ]]; then + # Strip the `.py` extension from the filename and replace it with `.dict`. 
+ fuzz_harness_dictionary_filename="$(basename "$fuzz_harness" .py).dict" + local output_file="$SRC/$fuzz_harness_dictionary_filename" + + printf '[%s] Appending %s to %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$common_base_dictionary_filename" "$output_file" + if [[ -s "$output_file" ]]; then + # If a dictionary file for this fuzzer already exists and is not empty, + # we append a new line to the end of it before appending any new entries. + # + # LibFuzzer will happily ignore multiple empty lines in a dictionary but fail with an error + # if any single line has incorrect syntax (e.g., if we accidentally add two entries to the same line.) + # See docs for valid syntax: https://llvm.org/docs/LibFuzzer.html#id32 + echo >>"$output_file" + fi + cat "$common_base_dictionary_filename" >>"$output_file" + fi + done } ######################## # Main execution logic # ######################## +# Seed corpora and dictionaries are hosted in a separate repository to avoid additional bloat in this repo. +# We clone into the $WORK directory because OSS-Fuzz cleans it up after building the image, keeping the image small. +git clone --depth 1 https://github.com/gitpython-developers/qa-assets.git "$WORK/qa-assets" -fetch_seed_corpora +create_seed_corpora_zips "$WORK/qa-assets/gitpython/corpora" -download_and_concatenate_common_dictionaries "$SEED_DATA_DIR/__base.dict" \ - "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/utf8.dict" \ - "https://raw.githubusercontent.com/google/fuzzing/master/dictionaries/url.dict" +prepare_dictionaries_for_fuzz_targets "$WORK/qa-assets/gitpython/dictionaries" "$SRC/gitpython/fuzzing" # The OSS-Fuzz base image has outdated dependencies by default so we upgrade them below. python3 -m pip install --upgrade pip