ROCm · liligwu · Feb 20, 2024 · Feb 1, 2024 · Feb 1, 2024 · Feb 2, 2024
diff --git a/.github/scripts/fbgemm_build.bash b/.github/scripts/fbgemm_build.bash
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# shellcheck disable=SC1091,SC2128
+. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"
+
+################################################################################
+# FBGEMM Build Auxiliary Functions
+################################################################################
+
+# Populate the global `build_args` array with the CMake arguments used to
+# build the FBGEMM library.
+#
+# Globals read:    env_name, fbgemm_library_type
+# Globals written: build_args (assigned without `local`, so it remains visible
+#                  to the caller after this function returns)
+# Outputs:         progress messages and the final argument list on stdout
+__configure_fbgemm_build () {
+  # NOTE: `local var=$(cmd)` hides the exit status of `cmd` (hence the SC2155
+  # suppressions in this function), so a failing lookup is not detected here.
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  # Path of the `python` that `which` finds inside the Conda environment
+  # shellcheck disable=SC2155,SC2086
+  local python_path=$(conda run ${env_prefix} which python)
+
+  # Baseline arguments that are added regardless of the host compiler.  The
+  # values are unquoted, so they undergo word splitting (SC2206 is suppressed).
+  # shellcheck disable=SC2206
+  build_args=(
+    -DUSE_SANITIZER=address
+    -DFBGEMM_LIBRARY_TYPE=${fbgemm_library_type}
+    -DPYTHON_EXECUTABLE=${python_path}
+  )
+
+  # The compiler check is handed to print_exec as ONE string that contains a
+  # pipe.  NOTE(review): this presumably relies on print_exec evaluating its
+  # arguments as shell text -- confirm against utils_base.bash.
+  if print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then
+    echo "[BUILD] Host compiler is Clang; adding extra compiler flags ..."
+
+    # shellcheck disable=SC2155,SC2086
+    local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
+    # shellcheck disable=SC2155,SC2086
+    local cc_path=$(conda run ${env_prefix} which cc)
+    # shellcheck disable=SC2155,SC2086
+    local cxx_path=$(conda run ${env_prefix} which c++)
+
+    # NOTE(review): the backslash-escaped quotes below are stored literally in
+    # the array, and each flag string is word-split into several elements.
+    # They only regroup into a single argument if the command line is
+    # re-parsed later (e.g. by an eval inside print_exec) -- confirm before
+    # changing the quoting.
+    # shellcheck disable=SC2206
+    build_args+=(
+      -DCMAKE_C_COMPILER="${cc_path}"
+      -DCMAKE_CXX_COMPILER="${cxx_path}"
+      -DCMAKE_C_FLAGS=\"-fopenmp=libomp -stdlib=libc++ -I ${conda_prefix}/include\"
+      -DCMAKE_CXX_FLAGS=\"-fopenmp=libomp -stdlib=libc++ -I ${conda_prefix}/include\"
+    )
+  fi
+
+  # Log the complete argument list so it is visible in the build output
+  # shellcheck disable=SC2145
+  echo "[BUILD] FBGEMM build arguments have been set:  ${build_args[@]}"
+}
+
+################################################################################
+# FBGEMM Build Functions
+################################################################################
+
+# Configure and build the FBGEMM library with CMake and make inside a Conda
+# environment.
+#
+# Arguments:
+#   $1 - ENV_NAME:     Conda environment, resolved through env_name_or_prefix
+#   $2 - BUILD_DIR:    directory to create and build in.  It is created with a
+#                      plain `mkdir`, so it must not exist yet; CMake is
+#                      pointed at its parent directory (`..`).
+#   $3 - LIBRARY_TYPE: value forwarded as -DFBGEMM_LIBRARY_TYPE
+#                      (e.g. shared or static)
+# Globals written: env_name, fbgemm_library_type, build_args
+# Returns: 0 on success; non-zero if an argument is missing, the build
+#          directory cannot be created or entered, or CMake / make fails.
+build_fbgemm_library () {
+  # env_name and fbgemm_library_type stay global: __configure_fbgemm_build
+  # reads them.
+  env_name="$1"
+  local build_dir="$2"
+  fbgemm_library_type="$3"
+  if [ "$fbgemm_library_type" == "" ]; then
+    echo "Usage: ${FUNCNAME[0]} ENV_NAME BUILD_DIR LIBRARY_TYPE"
+    echo "Example(s):"
+    echo "    ${FUNCNAME[0]} build_env build shared   # Build shared library"
+    echo "    ${FUNCNAME[0]} build_env build static   # Build static library"
+    return 1
+  fi
+
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  echo "################################################################################"
+  echo "# Build FBGEMM Library"
+  echo "#"
+  echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
+  echo "################################################################################"
+  echo ""
+
+  # Set up and configure the build
+  __configure_fbgemm_build || return 1
+
+  mkdir "$build_dir" || return 1
+  cd "$build_dir" || return 1
+
+  # Record the first failure instead of returning on the spot, so that the
+  # caller's working directory is always restored before we return.
+  local retcode=0
+
+  echo "[BUILD] Running CMake ..."
+  # shellcheck disable=SC2086
+  print_exec conda run --no-capture-output ${env_prefix} \
+    cmake "${build_args[@]}" .. || retcode=$?
+
+  # Running make is pointless when the configure step has failed
+  if [ "$retcode" -eq 0 ]; then
+    echo "[BUILD] Running the build ..."
+    # shellcheck disable=SC2086
+    print_exec conda run --no-capture-output ${env_prefix} \
+      make -j VERBOSE=1 || retcode=$?
+  fi
+
+  cd - || return 1
+  return "$retcode"
+}
+
+################################################################################
+# FBGEMM Test Functions
+################################################################################
+
+# Run the FBGEMM library tests with ctest from an existing build directory.
+#
+# Arguments:
+#   $1 - ENV_NAME:  Conda environment, resolved through env_name_or_prefix
+#   $2 - BUILD_DIR: directory to run ctest in
+# Returns: 0 if ctest succeeds; non-zero if an argument is missing, the build
+#          directory cannot be entered, or ctest reports a failure.
+test_fbgemm_library () {
+  local env_name="$1"
+  local build_dir="$2"
+  if [ "$build_dir" == "" ]; then
+    echo "Usage: ${FUNCNAME[0]} ENV_NAME BUILD_DIR"
+    echo "Example(s):"
+    echo "    ${FUNCNAME[0]} build_env build    # Run tests"
+    return 1
+  fi
+
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  cd "$build_dir" || return 1
+
+  echo "[BUILD] Running FBGEMM tests ..."
+  # Keep the ctest status so that a test failure is reported to the caller
+  # after the working directory has been restored.
+  local retcode=0
+  # shellcheck disable=SC2086
+  print_exec conda run --no-capture-output ${env_prefix} \
+    ctest --rerun-failed --output-on-failure || retcode=$?
+
+  cd - || return 1
+  return "$retcode"
+}
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -55,6 +55,23 @@ prepare_fbgemm_gpu_build () {
   echo "[BUILD] Successfully ran git submodules update"
 }
 
+# Append Clang-specific arguments to the global `build_args` array when the
+# `c++` of the Conda environment reports itself as Clang.
+#
+# Globals read:    env_name
+# Globals written: build_args (appended to only in the Clang case)
+__configure_compiler_flags () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  # Nothing to add unless the compiler version banner mentions Clang
+  if ! print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then
+    return 0
+  fi
+
+  echo "[BUILD] Clang is available; configuring for Clang-based build ..."
+
+  # shellcheck disable=SC2155,SC2086
+  local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
+
+  # The prefix is left unquoted, so it is word-split before being appended
+  # shellcheck disable=SC2206
+  build_args+=(--cxxprefix ${conda_prefix})
+}
+
 __configure_fbgemm_gpu_build_cpu () {
   # Update the package name and build args depending on if CUDA is specified
   echo "[BUILD] Setting CPU-only build args ..."
@@ -162,7 +179,7 @@ __configure_fbgemm_gpu_build () {
   if [ "$fbgemm_variant" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
     echo "Example(s):"
-    echo "    ${FUNCNAME[0]} cpu                          # CPU-only variant"
+    echo "    ${FUNCNAME[0]} cpu                          # CPU-only variant using Clang"
     echo "    ${FUNCNAME[0]} cuda                         # CUDA variant for default target(s)"
     echo "    ${FUNCNAME[0]} cuda '7.0;8.0'               # CUDA variant for custom target(s)"
     echo "    ${FUNCNAME[0]} rocm                         # ROCm variant for default target(s)"
@@ -190,6 +207,9 @@ __configure_fbgemm_gpu_build () {
     __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}"
   fi
 
+  # Set other compiler flags as needed
+  __configure_compiler_flags
+
   # shellcheck disable=SC2145
   echo "[BUILD] FBGEMM_GPU build arguments have been set:  ${build_args[@]}"
 }
@@ -375,11 +395,11 @@ build_fbgemm_gpu_package () {
   if [ "$fbgemm_variant" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} ENV_NAME RELEASE_TYPE VARIANT [VARIANT_TARGETS]"
     echo "Example(s):"
-    echo "    ${FUNCNAME[0]} build_env nightly cpu                           # Nightly CPU-only variant"
-    echo "    ${FUNCNAME[0]} build_env nightly cuda                          # Nightly CUDA variant for default target(s)"
-    echo "    ${FUNCNAME[0]} build_env nightly cuda '7.0;8.0'                # Nightly CUDA variant for custom target(s)"
-    echo "    ${FUNCNAME[0]} build_env release rocm                          # Release ROCm variant for default target(s)"
-    echo "    ${FUNCNAME[0]} build_env release rocm 'gfx906;gfx908;gfx90a'   # Release ROCm variant for custom target(s)"
+    echo "    ${FUNCNAME[0]} build_env cpu                          # CPU-only variant"
+    echo "    ${FUNCNAME[0]} build_env cuda                         # CUDA variant for default target(s)"
+    echo "    ${FUNCNAME[0]} build_env cuda '7.0;8.0'               # CUDA variant for custom target(s)"
+    echo "    ${FUNCNAME[0]} build_env rocm                         # ROCm variant for default target(s)"
+    echo "    ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a'  # ROCm variant for custom target(s)"
     return 1
   fi
 

diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
@@ -32,28 +32,135 @@ run_python_test () {
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
   # shellcheck disable=SC2086
-  if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+  if print_exec conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --cache-clear  "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
     echo ""
     echo ""
     echo ""
+    return 0
+  fi
+
+  echo "[TEST] Some tests FAILED.  Re-attempting only FAILED tests: ${python_test_file}"
+  echo ""
+  echo ""
+
+  # NOTE: Running large test suites may result in OOM error that will cause the
+  # process to be prematurely killed.  To work around this, when we re-run test
+  # suites, we only run tests that have failed in the previous round.  This is
+  # enabled by using the pytest cache and the --lf flag.
+
+  # shellcheck disable=SC2086
+  if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest "${pytest_args[@]}" --lf --last-failed-no-failures none "${python_test_file}"; then
+    echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
+    echo ""
+    echo ""
+    echo ""
   else
-    echo "[TEST] Python test suite FAILED: ${python_test_file}"
+    echo "[TEST] Python test suite FAILED for some or all tests despite multiple retries: ${python_test_file}"
     echo ""
     echo ""
     echo ""
     return 1
   fi
 }
 
+# Set the global `ignored_tests` array to the test files that are skipped for
+# the CPU-only variant.
+__configure_fbgemm_gpu_test_cpu () {
+  ignored_tests=()
+
+  ignored_tests+=(./ssd_split_table_batched_embeddings_test.py)
+
+  # These tests have non-CPU operators referenced in @given
+  ignored_tests+=(./uvm/copy_test.py ./uvm/uvm_test.py)
+}
+
+# Set the global `ignored_tests` array to the test files that are skipped for
+# the CUDA variant.
+__configure_fbgemm_gpu_test_cuda () {
+  ignored_tests=()
+  ignored_tests+=(./ssd_split_table_batched_embeddings_test.py)
+}
+
+# Configure the Conda environment and the ignored test list for ROCm testing.
+#
+# Globals read:    env_name
+# Globals written: ignored_tests
+# Side effects:    stores FBGEMM_TEST_WITH_ROCM=1 and HIP_LAUNCH_BLOCKING=1
+#                  (plus HSA_XNACK=1 when the installed ROCm is 5.7.0 or
+#                  newer) as Conda environment config vars.
+__configure_fbgemm_gpu_test_rocm () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  echo "[TEST] Set environment variables for ROCm testing ..."
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
+
+  # Starting from MI250 AMD GPUs support per process XNACK mode change
+  #
+  # The version file holds a MAJOR.MINOR.PATCH[-BUILD] string; it is folded
+  # into one integer (MAJOR * 10000 + MINOR * 100 + PATCH) so that it can be
+  # compared numerically.  A missing or empty file is treated as version 0
+  # instead of feeding an empty string to the integer comparison below.
+  local rocm_version_file=/opt/rocm/.info/version-dev
+  local rocm_version=0
+  if [ -r "$rocm_version_file" ]; then
+    rocm_version=$(awk -F'[.-]' 'NR == 1 {print $1 * 10000 + $2 * 100 + $3}' "$rocm_version_file")
+  else
+    echo "[TEST] ${rocm_version_file} is not readable; will not set HSA_XNACK ..."
+  fi
+  if [ "${rocm_version:-0}" -ge 50700 ]; then
+    # shellcheck disable=SC2086
+    print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
+  fi
+
+  ignored_tests=(
+    ./ssd_split_table_batched_embeddings_test.py
+    # https://github.com/pytorch/FBGEMM/issues/1559
+    ./batched_unary_embeddings_test.py
+  )
+}
+
+# Prepare the Conda environment and the shared settings for an FBGEMM_GPU test
+# run: choose the ignored test files, install the test dependencies, check
+# that the package imports, and define the PyTest arguments.
+#
+# Globals read:    env_name, fbgemm_variant, MACHINE_NAME
+# Globals written: pytest_args, and ignored_tests through the per-variant
+#                  configure function that is called
+# Returns: 1 if installing the test packages or importing fbgemm_gpu fails
+__setup_fbgemm_gpu_test () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  # Select the ignored test suites for the FBGEMM_GPU variant; any variant
+  # other than "cpu" and "rocm" is handled as CUDA
+  case "$fbgemm_variant" in
+    cpu)
+      echo "[TEST] Configuring for CPU-based testing ..."
+      __configure_fbgemm_gpu_test_cpu
+      ;;
+    rocm)
+      echo "[TEST] Configuring for ROCm-based testing ..."
+      __configure_fbgemm_gpu_test_rocm
+      ;;
+    *)
+      echo "[TEST] Configuring for CUDA-based testing ..."
+      __configure_fbgemm_gpu_test_cuda
+      ;;
+  esac
+
+  # NOTE: Setting KMP_DUPLICATE_LIB_OK silences the error about multiple
+  # OpenMP being linked when FBGEMM_GPU is compiled under Clang on aarch64
+  # machines:
+  #   https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
+  if [[ "${MACHINE_NAME}" == "aarch64" ]]; then
+    echo "[TEST] Platform is aarch64; will set KMP_DUPLICATE_LIB_OK ..."
+    # shellcheck disable=SC2086
+    print_exec conda env config vars set ${env_prefix} KMP_DUPLICATE_LIB_OK=1
+  fi
+
+  echo "[TEST] Installing PyTest ..."
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda install ${env_prefix} -y pytest expecttest) || return 1
+
+  echo "[TEST] Checking imports ..."
+  (test_python_import_package "${env_name}" fbgemm_gpu) || return 1
+  (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
+
+  # Arguments shared by every PyTest invocation
+  pytest_args=(-v -rsx -s -W ignore::pytest.PytestCollectionWarning)
+
+  # shellcheck disable=SC2145
+  echo "[TEST] PyTest args:  ${pytest_args[@]}"
+}
+
 
 ################################################################################
 # FBGEMM_GPU Test Functions
 ################################################################################
 
 run_fbgemm_gpu_tests () {
-  local env_name="$1"
-  local fbgemm_variant="$2"
+  env_name="$1"
+  fbgemm_variant="$2"
   if [ "$fbgemm_variant" == "" ]; then
     echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
     echo "Example(s):"
@@ -72,61 +179,24 @@ run_fbgemm_gpu_tests () {
 
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
+  __setup_fbgemm_gpu_test
 
-  # Enable ROCM testing if specified
-  if [ "$fbgemm_variant" == "rocm" ]; then
-    echo "[TEST] Set environment variables for ROCm testing ..."
-    # shellcheck disable=SC2086
-    print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
-    # shellcheck disable=SC2086
-    print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
-  fi
-
-  # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
-  local files_to_skip=(
-    ./ssd_split_table_batched_embeddings_test.py
-  )
-
-  if [ "$fbgemm_variant" == "cpu" ]; then
-    # These tests have non-CPU operators referenced in @given
-    local ignored_tests=(
-      ./uvm/copy_test.py
-      ./uvm/uvm_test.py
-    )
-  elif [ "$fbgemm_variant" == "rocm" ]; then
-    local ignored_tests=(
-      # https://github.com/pytorch/FBGEMM/issues/1559
-      ./batched_unary_embeddings_test.py
-      ./tbe/backward_adagrad_test.py
-      ./tbe/backward_dense_test.py
-      ./tbe/backward_none_test.py
-      ./tbe/backward_sgd_test.py
-    )
-  else
-    local ignored_tests=()
-  fi
-
-  echo "[TEST] Installing pytest ..."
-  # shellcheck disable=SC2086
-  (exec_with_retries 3 conda install ${env_prefix} -y pytest expecttest) || return 1
-
-  echo "[TEST] Checking imports ..."
-  (test_python_import_package "${env_name}" fbgemm_gpu) || return 1
-  (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
-
-  echo "[TEST] Enumerating test files ..."
+  echo "[TEST] Enumerating ALL test files ..."
   # shellcheck disable=SC2155
   local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
   for f in $all_test_files; do echo "$f"; done
   echo ""
 
+  echo "[TEST] Enumerating IGNORED test files ..."
+  for f in $ignored_tests; do echo "$f"; done
+  echo ""
+
   # NOTE: Tests running on single CPU core with a less powerful testing GPU in
   # GHA can take up to 5 hours.
   for test_file in $all_test_files; do
-    if echo "${files_to_skip[@]}" | grep "${test_file}"; then
-      echo "[TEST] Skipping test file known to be broken: ${test_file}"
-    elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
+    if echo "${ignored_tests[@]}" | grep "${test_file}"; then
       echo "[TEST] Skipping test file: ${test_file}"
+      echo ""
     elif run_python_test "${env_name}" "${test_file}"; then
       echo ""
     else