Merge pull request #58 from ROCm/IFU-2024-04-19
Ifu 2024 04 19
liligwu authored Apr 19, 2024
2 parents 7ec3b82 + 6ed2049 commit a821cf4
Showing 116 changed files with 2,750 additions and 2,928 deletions.
77 changes: 24 additions & 53 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -64,7 +64,7 @@ __configure_fbgemm_gpu_build_clang () {
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2206
build_args+=(
--cxxprefix ${conda_prefix}
--cxxprefix=${conda_prefix}
)
}

@@ -258,6 +258,11 @@ __configure_fbgemm_gpu_build () {
__configure_fbgemm_gpu_build_clang
fi

# Set verbosity
build_args+=(
--verbose
)

# shellcheck disable=SC2145
echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}"
}
@@ -307,7 +312,7 @@ __build_fbgemm_gpu_set_run_multicore () {
export run_multicore=""
if [[ $core =~ $re && $sockets =~ $re ]] ; then
local n_core=$((core * sockets))
export run_multicore=" -j ${n_core}"
export run_multicore="-j ${n_core}"
fi

echo "[BUILD] Set multicore run option for setup.py: ${run_multicore}"
@@ -443,15 +448,26 @@ build_fbgemm_gpu_package () {
echo "################################################################################"
echo ""

# Distribute Python extensions as wheels on Linux
# Set packaging options
build_args+=(
--package_channel="${fbgemm_release_channel}"
--python-tag="${python_tag}"
--plat-name="${python_plat_name}"
)

# Prepend build options correctly for `python -m build`
# https://build.pypa.io/en/stable/index.html
# https://gregoryszorc.com/blog/2023/10/30/my-user-experience-porting-off-setup.py/
for i in "${!build_args[@]}"; do
build_args[i]="--config-setting=--build-option=${build_args[i]}"
done

# Build the wheel. Invoke using `python -m build`
# https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html
echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" bdist_wheel \
--package_channel="${fbgemm_release_channel}" \
--python-tag="${python_tag}" \
--plat-name="${python_plat_name}" \
--verbose \
python -m build --wheel --no-isolation \
"${build_args[@]}"

# Run checks on the built libraries
@@ -503,7 +519,6 @@ build_fbgemm_gpu_install () {
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" install \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
@@ -519,47 +534,3 @@ build_fbgemm_gpu_install () {

echo "[BUILD] FBGEMM-GPU build + install completed"
}

build_fbgemm_gpu_develop () {
env_name="$1"
fbgemm_variant="$2"
fbgemm_variant_targets="$3"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant"
echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)"
echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)"
echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)"
echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)"
return 1
fi

# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Set up and configure the build
__build_fbgemm_gpu_common_pre_steps || return 1
__configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1

echo "################################################################################"
echo "# Build + Install FBGEMM-GPU Package (Develop)"
echo "#"
echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
echo "################################################################################"
echo ""

# Parallelism may need to be limited to prevent the build from being
# canceled for going over ulimits
echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" build develop \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
(run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1

echo "[BUILD] FBGEMM-GPU build + develop completed"
}
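
For readers following the packaging change above: a minimal sketch of the new wheel-build flow outside of CI, assuming placeholder values for the release channel, Python tag, and platform name (the CI scripts compute these). The point it illustrates is how each setup.py option must be wrapped in --config-setting=--build-option= before being handed to python -m build.

#!/usr/bin/env bash
# Sketch only; the option values below are placeholders, not the CI-computed ones.
set -euo pipefail

build_args=(
  --package_channel=nightly            # placeholder release channel
  --python-tag=py310                   # placeholder Python tag
  --plat-name=manylinux2014_x86_64     # placeholder platform tag
)

# `python -m build` forwards setup.py options only when each one is wrapped
# in --config-setting=--build-option=...
for i in "${!build_args[@]}"; do
  build_args[i]="--config-setting=--build-option=${build_args[i]}"
done

# --no-isolation reuses the current environment instead of a fresh build venv
python -m build --wheel --no-isolation "${build_args[@]}"

Run from the directory containing setup.py, this would drop a wheel under dist/, mirroring what build_fbgemm_gpu_package now does via conda run.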
29 changes: 22 additions & 7 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -205,6 +205,22 @@ run_fbgemm_gpu_tests () {
done
}

test_all_fbgemm_gpu_modules () {
local env_name="$1"
local fbgemm_variant="$2"

local target_directories=(
fbgemm_gpu/test
fbgemm_gpu/experimental/example/test
)

for test_dir in "${target_directories[@]}"; do
cd "${test_dir}" || return 1
run_fbgemm_gpu_tests "${env_name}" "${fbgemm_variant}" || return 1
cd - || return 1
done
}


################################################################################
# FBGEMM_GPU Test Bulk-Combination Functions
@@ -292,9 +308,8 @@ test_fbgemm_gpu_build_and_install () {
cd ~/FBGEMM/ || return 1
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd ~/FBGEMM/fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
cd - || return 1
cd ~/FBGEMM/ || return 1
test_all_fbgemm_gpu_modules "${env_name}" "${pytorch_variant_type}" || return 1
}

test_fbgemm_gpu_setup_and_pip_install () {
@@ -323,11 +338,11 @@ test_fbgemm_gpu_setup_and_pip_install () {

local env_name="test_py${py_version}_pytorch_${pytorch_channel_version}_fbgemm_${fbgemm_gpu_channel_version}_${variant_type}/${variant_version}"
local env_name="${env_name//\//_}"
test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
cd ~/FBGEMM/fbgemm_gpu/test || return 1
test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
cd ~/FBGEMM || return 1

run_fbgemm_gpu_tests "${env_name}" "${variant_type}";
test_all_fbgemm_gpu_modules "${env_name}" "${variant_type}";
local retcode=$?

echo "################################################################################"
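A brief usage sketch for the new test_all_fbgemm_gpu_modules entry point; the environment name, checkout path, and direct sourcing of the helper script are assumptions for illustration, not the exact CI invocation.

# Usage sketch; "build_env" and the checkout location are assumed.
cd ~/FBGEMM || exit 1
# shellcheck disable=SC1091
. .github/scripts/fbgemm_gpu_test.bash     # assumes its own dependencies are already sourced
test_all_fbgemm_gpu_modules build_env cpu  # walks fbgemm_gpu/test and fbgemm_gpu/experimental/example/test
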
4 changes: 2 additions & 2 deletions .github/scripts/nova_postscript.bash
@@ -42,8 +42,8 @@ else
fi

$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${fbgemm_variant}"
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; };
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" "${fbgemm_variant}"

# Workaround EACCES: permission denied error at checkout step
chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
5 changes: 3 additions & 2 deletions .github/scripts/utils_base.bash
@@ -88,15 +88,16 @@ env_name_or_prefix () {
}

test_network_connection () {
wget -q --timeout 1 pypi.org -O /dev/null
exec_with_retries 3 wget -q --timeout 1 pypi.org -O /dev/null
local exit_status=$?

# https://man7.org/linux/man-pages/man1/wget.1.html
if [ $exit_status == 0 ]; then
echo "[CHECK] Network does not appear to be blocked."
else
echo "[CHECK] Network check exit status: ${exit_status}"
echo "[CHECK] Network appears to be blocked; please proxy the network connetions, i.e. re-run the command prefixed with 'with-proxy'."
echo "[CHECK] Network appears to be blocked or suffering from poor connection."
echo "[CHECK] Please remember to proxy the network connetions if needed, i.e. re-run the command prefixed with 'with-proxy'."
return 1
fi
}
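exec_with_retries is defined elsewhere in these scripts and its body is not part of this diff. Purely as an illustration of the calling convention used above (maximum attempt count first, then the command), a generic retry wrapper could look like the following sketch.

# Illustrative sketch only; not the repository's actual exec_with_retries implementation.
retry_sketch () {
  local max_attempts="$1"; shift
  local attempt=1
  until "$@"; do
    if (( attempt >= max_attempts )); then
      echo "[EXEC] Command failed after ${max_attempts} attempt(s): $*"
      return 1
    fi
    echo "[EXEC] Attempt ${attempt} failed; retrying ..."
    attempt=$(( attempt + 1 ))
    sleep 2
  done
}

# Example: the same network probe used in test_network_connection, with up to 3 attempts
retry_sketch 3 wget -q --timeout 1 pypi.org -O /dev/null
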
1 change: 1 addition & 0 deletions .github/scripts/utils_build.bash
@@ -246,6 +246,7 @@ install_build_tools () {
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
bazel \
build \
click \
cmake \
hypothesis \
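The build entry added to the conda install list above is assumed to be the PyPA build frontend that the new python -m build invocation depends on. A quick, hypothetical sanity check after installation:

# Sanity-check sketch; the environment name "build_env" is an assumption.
conda run -n build_env python -c "import build; print(build.__version__)"
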
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -182,7 +182,7 @@ jobs:
- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true')) && matrix.compiler == 'gcc' }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -202,7 +202,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -191,4 +191,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_pip.yml
@@ -99,7 +99,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu


test_pypi_install_cuda:
@@ -159,7 +159,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda


test_pypi_install_rocm:
@@ -225,4 +225,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cpu.yml
@@ -174,7 +174,7 @@ jobs:
- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu

- name: Push FBGEMM_GPU (CPU version) Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -184,7 +184,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
8 changes: 2 additions & 6 deletions bench/ConvUnifiedBenchmark.cc
@@ -281,12 +281,8 @@ void performance_test(
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
cout << "WARNING: the timer may be inaccurate when used by multiple threads."
<< endl;
cout << header << "Im2Col (ms), "
<< "Packing (ms), "
<< "Kernel (ms), "
<< "Postprocessing (ms), "
<< "fbgemmPacked (ms), "
<< "Total (ms), "
cout << header << "Im2Col (ms), " << "Packing (ms), " << "Kernel (ms), "
<< "Postprocessing (ms), " << "fbgemmPacked (ms), " << "Total (ms), "
<< "GOPS" << endl;
#else
cout << setw(6) << header << setw(5) << "GOPS" << endl;
5 changes: 2 additions & 3 deletions bench/ConvertBenchmark.cc
@@ -28,9 +28,8 @@ void performance_test() {
normal_distribution<float> dist;
default_random_engine engine;

cout << setw(4) << "M"
<< " elements_per_sec_ref"
<< " elements_per_sec_simd" << endl;
cout << setw(4) << "M" << " elements_per_sec_ref" << " elements_per_sec_simd"
<< endl;

array<int, 8> dims{1, 10, 32, 40, 129, 256, 1024, 8000};

8 changes: 3 additions & 5 deletions bench/EmbeddingQuantizeBenchmark.cc
@@ -34,11 +34,9 @@ void performance_test() {
} else {
cout << "With scale and bias as float" << endl;
}
cout << setw(8) << "bit_rate"
<< ", " << setw(6) << "rows"
<< "," << setw(6) << "cols"
<< "," << setw(16) << "elems_per_usec"
<< "," << setw(10) << "GB/Sec" << endl;
cout << setw(8) << "bit_rate" << ", " << setw(6) << "rows" << "," << setw(6)
<< "cols" << "," << setw(16) << "elems_per_usec" << "," << setw(10)
<< "GB/Sec" << endl;
std::vector<int> bit_rates;
if (is_same<T, float16>::value) {
bit_rates = {2, 4, 8};
18 changes: 8 additions & 10 deletions bench/EmbeddingSpMDMNBitBenchmark.cc
@@ -352,17 +352,15 @@ int run_benchmark(
cout << "prefetch off, ";
}

cout << "b/w, " << bytes / 1e9 / t << ", GB/s, "
<< "effective b/w, " << bytes_padded / 1e9 / t << ", GB/s, "
<< "time, " << t << ", autovec b/w, " << bytes / 1e9 / t_autovec
<< ", GB/s, "
cout << "b/w, " << bytes / 1e9 / t << ", GB/s, " << "effective b/w, "
<< bytes_padded / 1e9 / t << ", GB/s, " << "time, " << t
<< ", autovec b/w, " << bytes / 1e9 / t_autovec << ", GB/s, "
<< "autovec eff. b/w, " << bytes_padded / 1e9 / t_autovec
<< ", GB/s, "
<< "autovec time, " << t_autovec << ", ref b/w, "
<< bytes / 1e9 / t_ref << ", GB/s, "
<< "ref eff. b/w, " << bytes_padded / 1e9 / t_ref << ", GB/s, "
<< "ref time, " << t_ref << ", autovec speedup, "
<< t_ref / t_autovec << ", asmjit speedup, " << t_ref / t << endl;
<< ", GB/s, " << "autovec time, " << t_autovec << ", ref b/w, "
<< bytes / 1e9 / t_ref << ", GB/s, " << "ref eff. b/w, "
<< bytes_padded / 1e9 / t_ref << ", GB/s, " << "ref time, " << t_ref
<< ", autovec speedup, " << t_ref / t_autovec << ", asmjit speedup, "
<< t_ref / t << endl;
} // flush_cache
} // has_weight
return 0;