Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into IFU-2024-02-20
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Feb 20, 2024
2 parents 4b4a755 + b4afb6d commit 360e3f1
Show file tree
Hide file tree
Showing 98 changed files with 3,835 additions and 1,307 deletions.
123 changes: 123 additions & 0 deletions .github/scripts/fbgemm_build.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM Build Auxiliary Functions
################################################################################

# Populate the global `build_args` array with the CMake arguments for an FBGEMM
# library build.
#
# Globals read:
#   env_name             - Conda environment, resolved via env_name_or_prefix
#   fbgemm_library_type  - value forwarded as -DFBGEMM_LIBRARY_TYPE
# Globals written:
#   build_args           - array of CMake arguments (overwritten)
# Returns:
#   0 on success; 1 if the Python path, CONDA_PREFIX, or a compiler path cannot
#   be resolved inside the Conda environment
__configure_fbgemm_build () {
  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  # Declaration and assignment are kept separate so that a failing lookup is
  # reported to the caller instead of being masked by the exit status of
  # `local`
  local python_path
  # shellcheck disable=SC2086
  python_path=$(conda run ${env_prefix} which python) || {
    echo "[BUILD] Failed to locate python in the Conda environment"
    return 1
  }

  # shellcheck disable=SC2206
  build_args=(
    -DUSE_SANITIZER=address
    -DFBGEMM_LIBRARY_TYPE=${fbgemm_library_type}
    -DPYTHON_EXECUTABLE=${python_path}
  )

  if print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then
    echo "[BUILD] Host compiler is Clang; adding extra compiler flags ..."

    local conda_prefix
    local cc_path
    local cxx_path

    # shellcheck disable=SC2086
    conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) || {
      echo "[BUILD] Failed to resolve CONDA_PREFIX in the Conda environment"
      return 1
    }
    # shellcheck disable=SC2086
    cc_path=$(conda run ${env_prefix} which cc) || {
      echo "[BUILD] Failed to locate cc in the Conda environment"
      return 1
    }
    # shellcheck disable=SC2086
    cxx_path=$(conda run ${env_prefix} which c++) || {
      echo "[BUILD] Failed to locate c++ in the Conda environment"
      return 1
    }

    # NOTE: The escaped quotes below are stored literally and the flag strings
    # are word-split into several array elements.
    # NOTE(review): presumably print_exec re-evaluates the joined command line,
    # at which point the quotes regroup the flags - confirm in utils_base.bash
    # before changing the quoting here.
    # shellcheck disable=SC2206
    build_args+=(
      -DCMAKE_C_COMPILER="${cc_path}"
      -DCMAKE_CXX_COMPILER="${cxx_path}"
      -DCMAKE_C_FLAGS=\"-fopenmp=libomp -stdlib=libc++ -I ${conda_prefix}/include\"
      -DCMAKE_CXX_FLAGS=\"-fopenmp=libomp -stdlib=libc++ -I ${conda_prefix}/include\"
    )
  fi

  # shellcheck disable=SC2145
  echo "[BUILD] FBGEMM build arguments have been set: ${build_args[@]}"
}

################################################################################
# FBGEMM Build Functions
################################################################################

# Configure and build the FBGEMM library with CMake inside a Conda environment.
#
# Arguments:
#   $1 - ENV_NAME:     Conda environment (name or prefix) to run the build in
#   $2 - BUILD_DIR:    build directory to create; must not already exist
#   $3 - LIBRARY_TYPE: value forwarded to CMake as FBGEMM_LIBRARY_TYPE
# Globals written:
#   env_name, fbgemm_library_type, build_args
# Returns:
#   0 if both CMake and the build succeed; 1 on bad arguments, on failure to
#   create or enter BUILD_DIR, or if CMake or the build fails
build_fbgemm_library () {
  # NOTE: env_name and fbgemm_library_type are intentionally not local, because
  # __configure_fbgemm_build reads them
  env_name="$1"
  local build_dir="$2"
  fbgemm_library_type="$3"
  if [ "$fbgemm_library_type" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME BUILD_DIR LIBRARY_TYPE"
    echo "Example(s):"
    echo "    ${FUNCNAME[0]} build_env build shared    # Build shared library"
    echo "    ${FUNCNAME[0]} build_env build static    # Build static library"
    return 1
  fi

  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  echo "################################################################################"
  echo "# Build FBGEMM Library"
  echo "#"
  echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
  echo "################################################################################"
  echo ""

  # Set up and configure the build
  __configure_fbgemm_build || return 1

  mkdir "$build_dir" || return 1
  cd "$build_dir" || return 1

  # Track failures explicitly: without this, the function's exit status would
  # be that of the final `cd -`, and a broken build would be reported as a
  # success
  local build_status=0

  echo "[BUILD] Running CMake ..."
  # shellcheck disable=SC2086
  print_exec conda run --no-capture-output ${env_prefix} \
    cmake "${build_args[@]}" .. || build_status=1

  # There is nothing to build if the configuration step failed
  if [ "${build_status}" -eq 0 ]; then
    echo "[BUILD] Running the build ..."
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} \
      make -j VERBOSE=1 || build_status=1
  else
    echo "[BUILD] CMake configuration FAILED; skipping the build"
  fi

  # Always return to the previous directory, even when the build failed
  cd - || return 1
  return "${build_status}"
}

################################################################################
# FBGEMM Test Functions
################################################################################

# Run the FBGEMM library tests with ctest from inside an existing build
# directory.
#
# Arguments:
#   $1 - ENV_NAME:  Conda environment (name or prefix) to run ctest in
#   $2 - BUILD_DIR: directory containing the CMake build to test
# Returns:
#   0 if ctest succeeds; the non-zero status reported by print_exec if ctest
#   fails; 1 on bad arguments or if BUILD_DIR cannot be entered
test_fbgemm_library () {
  local env_name="$1"
  local build_dir="$2"
  if [ "$build_dir" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME BUILD_DIR"
    echo "Example(s):"
    echo " ${FUNCNAME[0]} build_env build # Run tests"
    return 1
  fi

  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  cd "$build_dir" || return 1

  echo "[BUILD] Running FBGEMM tests ..."
  # Capture the test status: without this, the function's exit status would be
  # that of the final `cd -`, and failing tests would be reported as a success
  local test_status=0
  # shellcheck disable=SC2086
  print_exec conda run --no-capture-output ${env_prefix} \
    ctest --rerun-failed --output-on-failure || test_status=$?

  # Always return to the previous directory, even when tests failed
  cd - || return 1
  return "${test_status}"
}
32 changes: 26 additions & 6 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,23 @@ prepare_fbgemm_gpu_build () {
echo "[BUILD] Successfully ran git submodules update"
}

# Append compiler-specific arguments to the global `build_args` array.
#
# When the Conda environment's c++ reports itself as Clang, `--cxxprefix` is
# added with the environment's CONDA_PREFIX as its value.
#
# Globals read:
#   env_name    - Conda environment, resolved via env_name_or_prefix
# Globals written:
#   build_args  - array of build arguments (appended to)
# Returns:
#   0 if no change was needed or the flag was appended; 1 if Clang was detected
#   but CONDA_PREFIX could not be resolved (build_args is left untouched)
__configure_compiler_flags () {
  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  if ! print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then
    # Not a Clang host compiler; nothing to configure
    return 0
  fi

  echo "[BUILD] Clang is available; configuring for Clang-based build ..."

  local conda_prefix
  # shellcheck disable=SC2086
  conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) || conda_prefix=""

  # An empty value would leave a dangling `--cxxprefix` that swallows whichever
  # argument happens to follow it, so refuse to add the flag at all
  if [ -z "${conda_prefix}" ]; then
    echo "[BUILD] Failed to resolve CONDA_PREFIX; not adding --cxxprefix"
    return 1
  fi

  # shellcheck disable=SC2206
  build_args+=(
    --cxxprefix ${conda_prefix}
  )
}

__configure_fbgemm_gpu_build_cpu () {
# Update the package name and build args depending on if CUDA is specified
echo "[BUILD] Setting CPU-only build args ..."
Expand Down Expand Up @@ -162,7 +179,7 @@ __configure_fbgemm_gpu_build () {
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
echo "Example(s):"
echo " ${FUNCNAME[0]} cpu # CPU-only variant"
echo " ${FUNCNAME[0]} cpu # CPU-only variant using Clang"
echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)"
echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)"
echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)"
Expand Down Expand Up @@ -190,6 +207,9 @@ __configure_fbgemm_gpu_build () {
__configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}"
fi

# Set other compiler flags as needed
__configure_compiler_flags

# shellcheck disable=SC2145
echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}"
}
Expand Down Expand Up @@ -375,11 +395,11 @@ build_fbgemm_gpu_package () {
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME RELEASE_TYPE VARIANT [VARIANT_TARGETS]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env nightly cpu # Nightly CPU-only variant"
echo " ${FUNCNAME[0]} build_env nightly cuda # Nightly CUDA variant for default target(s)"
echo " ${FUNCNAME[0]} build_env nightly cuda '7.0;8.0' # Nightly CUDA variant for custom target(s)"
echo " ${FUNCNAME[0]} build_env release rocm # Release ROCm variant for default target(s)"
echo " ${FUNCNAME[0]} build_env release rocm 'gfx906;gfx908;gfx90a' # Release ROCm variant for custom target(s)"
echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant"
echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)"
echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)"
echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)"
echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)"
return 1
fi

Expand Down
168 changes: 119 additions & 49 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,135 @@ run_python_test () {
local env_prefix=$(env_name_or_prefix "${env_name}")

# shellcheck disable=SC2086
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
if print_exec conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --cache-clear "${python_test_file}"; then
echo "[TEST] Python test suite PASSED: ${python_test_file}"
echo ""
echo ""
echo ""
return 0
fi

echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}"
echo ""
echo ""

# NOTE: Running large test suites may result in OOM error that will cause the
# process to be prematurely killed. To work around this, when we re-run test
# suites, we only run tests that have failed in the previous round. This is
# enabled by using the pytest cache and the --lf flag.

# shellcheck disable=SC2086
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"; then
echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
echo ""
echo ""
echo ""
else
echo "[TEST] Python test suite FAILED: ${python_test_file}"
echo "[TEST] Python test suite FAILED for some or all tests despite multiple retries: ${python_test_file}"
echo ""
echo ""
echo ""
return 1
fi
}

# Select the test files that are skipped for the CPU-only variant.
#
# Globals written:
#   ignored_tests - array of test file paths (overwritten)
__configure_fbgemm_gpu_test_cpu () {
  # Start from a clean list so that entries from a previous configuration do
  # not leak through
  ignored_tests=()

  ignored_tests+=(./ssd_split_table_batched_embeddings_test.py)

  # These tests have non-CPU operators referenced in @given
  ignored_tests+=(./uvm/copy_test.py ./uvm/uvm_test.py)
}

# Select the test files that are skipped for the CUDA variant.
#
# Globals written:
#   ignored_tests - array of test file paths (overwritten)
__configure_fbgemm_gpu_test_cuda () {
  local -r skipped_suite="./ssd_split_table_batched_embeddings_test.py"

  ignored_tests=("${skipped_suite}")
}

# Configure the Conda environment variables and the skipped-test list for
# ROCm testing.
#
# Arguments:
#   $1 - (optional) path to the ROCm version file; defaults to
#        /opt/rocm/.info/version-dev
# Globals read:
#   env_name       - Conda environment, resolved via env_name_or_prefix
# Globals written:
#   ignored_tests  - array of test file paths (overwritten)
__configure_fbgemm_gpu_test_rocm () {
  local rocm_version_file="${1:-/opt/rocm/.info/version-dev}"

  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  echo "[TEST] Set environment variables for ROCm testing ..."
  # shellcheck disable=SC2086
  print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
  # shellcheck disable=SC2086
  print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1

  # Starting from MI250 AMD GPUs support per process XNACK mode change
  #
  # The first line of the version file (MAJOR.MINOR.PATCH[-BUILD]) is folded
  # into one integer as MAJOR * 10000 + MINOR * 100 + PATCH, so the 50700
  # threshold below corresponds to version 5.7.0.
  local rocm_version=""
  if [ -r "${rocm_version_file}" ]; then
    rocm_version=$(awk -F'[.-]' 'NR == 1 {print $1 * 10000 + $2 * 100 + $3}' "${rocm_version_file}")
  fi

  case "${rocm_version}" in
    '' | *[!0-9]*)
      # A missing or unparsable version file must not reach the integer
      # comparison, which would only emit a confusing `[` error
      echo "[TEST] Unable to determine the ROCm version from ${rocm_version_file}; HSA_XNACK will not be set ..."
      ;;
    *)
      if [ "${rocm_version}" -ge 50700 ]; then
        # shellcheck disable=SC2086
        print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
      fi
      ;;
  esac

  ignored_tests=(
    ./ssd_split_table_batched_embeddings_test.py
    # https://github.com/pytorch/FBGEMM/issues/1559
    ./batched_unary_embeddings_test.py
  )
}

# Prepare the Conda environment for running the FBGEMM_GPU test suites.
#
# Steps, in order: pick the per-variant list of ignored tests, set
# KMP_DUPLICATE_LIB_OK on aarch64, install pytest and expecttest, verify that
# the fbgemm_gpu packages can be imported, and set the PyTest arguments.
#
# Globals read:
#   env_name, fbgemm_variant, MACHINE_NAME
# Globals written:
#   pytest_args - array of PyTest arguments (overwritten)
# Returns:
#   1 if installing the test dependencies or an import check fails
__setup_fbgemm_gpu_test () {
  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  # Configure the environment for ignored test suites for each FBGEMM_GPU
  # variant; anything that is neither cpu nor rocm is treated as CUDA
  case "${fbgemm_variant}" in
    cpu)
      echo "[TEST] Configuring for CPU-based testing ..."
      __configure_fbgemm_gpu_test_cpu
      ;;
    rocm)
      echo "[TEST] Configuring for ROCm-based testing ..."
      __configure_fbgemm_gpu_test_rocm
      ;;
    *)
      echo "[TEST] Configuring for CUDA-based testing ..."
      __configure_fbgemm_gpu_test_cuda
      ;;
  esac

  if [ "${MACHINE_NAME}" = "aarch64" ]; then
    # NOTE: Setting KMP_DUPLICATE_LIB_OK silences the error about multiple
    # OpenMP being linked when FBGEMM_GPU is compiled under Clang on aarch64
    # machines:
    # https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
    echo "[TEST] Platform is aarch64; will set KMP_DUPLICATE_LIB_OK ..."
    # shellcheck disable=SC2086
    print_exec conda env config vars set ${env_prefix} KMP_DUPLICATE_LIB_OK=1
  fi

  echo "[TEST] Installing PyTest ..."
  # shellcheck disable=SC2086
  if ! (exec_with_retries 3 conda install ${env_prefix} -y pytest expecttest); then
    return 1
  fi

  echo "[TEST] Checking imports ..."
  if ! (test_python_import_package "${env_name}" fbgemm_gpu); then
    return 1
  fi
  if ! (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers); then
    return 1
  fi

  # Configure the PyTest args
  pytest_args=(-v -rsx -s)
  pytest_args+=(-W ignore::pytest.PytestCollectionWarning)

  # shellcheck disable=SC2145
  echo "[TEST] PyTest args: ${pytest_args[@]}"
}


################################################################################
# FBGEMM_GPU Test Functions
################################################################################

run_fbgemm_gpu_tests () {
local env_name="$1"
local fbgemm_variant="$2"
env_name="$1"
fbgemm_variant="$2"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
echo "Example(s):"
Expand All @@ -72,61 +179,24 @@ run_fbgemm_gpu_tests () {

# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")
__setup_fbgemm_gpu_test

# Enable ROCM testing if specified
if [ "$fbgemm_variant" == "rocm" ]; then
echo "[TEST] Set environment variables for ROCm testing ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
fi

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
local files_to_skip=(
./ssd_split_table_batched_embeddings_test.py
)

if [ "$fbgemm_variant" == "cpu" ]; then
# These tests have non-CPU operators referenced in @given
local ignored_tests=(
./uvm/copy_test.py
./uvm/uvm_test.py
)
elif [ "$fbgemm_variant" == "rocm" ]; then
local ignored_tests=(
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
./tbe/backward_adagrad_test.py
./tbe/backward_dense_test.py
./tbe/backward_none_test.py
./tbe/backward_sgd_test.py
)
else
local ignored_tests=()
fi

echo "[TEST] Installing pytest ..."
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -y pytest expecttest) || return 1

echo "[TEST] Checking imports ..."
(test_python_import_package "${env_name}" fbgemm_gpu) || return 1
(test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1

echo "[TEST] Enumerating test files ..."
echo "[TEST] Enumerating ALL test files ..."
# shellcheck disable=SC2155
local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
for f in $all_test_files; do echo "$f"; done
echo ""

echo "[TEST] Enumerating IGNORED test files ..."
for f in $ignored_tests; do echo "$f"; done
echo ""

# NOTE: Tests running on single CPU core with a less powerful testing GPU in
# GHA can take up to 5 hours.
for test_file in $all_test_files; do
if echo "${files_to_skip[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file known to be broken: ${test_file}"
elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
if echo "${ignored_tests[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file: ${test_file}"
echo ""
elif run_python_test "${env_name}" "${test_file}"; then
echo ""
else
Expand Down
Loading

0 comments on commit 360e3f1

Please sign in to comment.