Skip to content

Commit

Permalink
Merge pull request #52 from ROCmSoftwarePlatform/IFU-2023-12-14
Browse files Browse the repository at this point in the history
Ifu 2023 12 14
  • Loading branch information
liligwu authored Dec 14, 2023
2 parents 61a7e50 + 10ace05 commit 03b582b
Show file tree
Hide file tree
Showing 42 changed files with 658 additions and 858 deletions.
16 changes: 13 additions & 3 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"

if [ "$arch_list" == "" ]; then
# By default, build for MI250 only to save time
local arch_list=gfx90a
echo "[BUILD] rocminfo did not return anything valid!"

# By default, we build just for MI100 and MI250 to save time. This list
# needs to be updated if the CI ROCm machines have different hardware.
# Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
local arch_list="gfx908,gfx90a"
fi
else
echo "[BUILD] rocminfo not found in PATH!"
Expand All @@ -92,7 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Setting ROCm build args ..."
build_args=(
--package_variant=rocm
-DTORCH_USE_HIP_DSA=1
# HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
-DHIP_ROOT_DIR=/opt/rocm
# Enable device-side assertions in HIP
# https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
)
}

Expand Down Expand Up @@ -140,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
}
Expand Down
14 changes: 8 additions & 6 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ run_python_test () {
run_fbgemm_gpu_tests () {
local env_name="$1"
local fbgemm_variant="$2"
if [ "$env_name" == "" ]; then
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU"
echo " ${FUNCNAME[0]} build_env cuda # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm"
return 1
else
Expand All @@ -71,9 +71,11 @@ run_fbgemm_gpu_tests () {

# Enable ROCM testing if specified
if [ "$fbgemm_variant" == "rocm" ]; then
echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm tests ..."
echo "[TEST] Set environment variables for ROCm testing ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
fi

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
Expand Down Expand Up @@ -138,7 +140,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 3.10 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.10 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env 3.12 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
Expand Down Expand Up @@ -210,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
cd -
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" || return 1
cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
# shellcheck disable=SC2164
cd -
}
7 changes: 7 additions & 0 deletions .github/scripts/nova_postscript.bash
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ echo "[NOVA] Current working directory: $(pwd)"
# shellcheck source=.github/scripts/setup_env.bash
. "${PRELUDE}";

# Collect PyTorch environment information
collect_pytorch_env_info "${BUILD_ENV_NAME}"

# Install the wheel
install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl

# Test with PyTest
Expand All @@ -31,3 +35,6 @@ fi
$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}"

# Workaround EACCES: permission denied error at checkout step
chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
3 changes: 3 additions & 0 deletions .github/scripts/nova_prescript.bash
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ install_cxx_compiler "${BUILD_ENV_NAME}"
# Install Build Tools
install_build_tools "${BUILD_ENV_NAME}"

# Collect PyTorch environment information
collect_pytorch_env_info "${BUILD_ENV_NAME}"

if [[ $CU_VERSION = cu* ]]; then
# Extract the CUDA version number from CU_VERSION
cuda_version=$(echo "[NOVA] ${CU_VERSION}" | cut -c 3-)
Expand Down
8 changes: 8 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ install_cuda () {
# Print nvcc version
# shellcheck disable=SC2086
print_exec conda run ${env_prefix} nvcc --version

if which nvidia-smi; then
# If nvidia-smi is installed on a machine without GPUs, this will return an error
(print_exec nvidia-smi) || true
else
echo "[CHECK] nvidia-smi not found"
fi

echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/utils_pytorch.bash
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,38 @@ install_pytorch_pip () {

echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP"
}


################################################################################
# PyTorch Diagnose Functions
################################################################################

collect_pytorch_env_info () {
  local env_name="$1"
  if [ "$env_name" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME"
    echo "Example(s):"
    echo "    ${FUNCNAME[0]} build_env    # Collect PyTorch environment information from Conda environment build_env"
    return 1
  else
    echo "################################################################################"
    echo "# Collect PyTorch Environment Information (for Reporting Issues)"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""
  fi

  test_network_connection || return 1

  # Split declaration from assignment so the exit status of env_name_or_prefix
  # is not masked by `local` (fixes the issue ShellCheck SC2155 warns about,
  # instead of suppressing it)
  local env_prefix
  env_prefix=$(env_name_or_prefix "${env_name}")

  # This is the script required for collecting info and reporting to https://github.com/pytorch/pytorch/issues/new
  echo "[INFO] Downloading the PyTorch environment info collection script ..."
  # Fail fast if the download did not succeed; otherwise the `python collect_env.py`
  # invocation below would fail later with a confusing "file not found" error
  print_exec wget -q "https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py" || return 1

  echo "[INFO] Collecting PyTorch environment info (will be needed for reporting issues to PyTorch) ..."
  # shellcheck disable=SC2086
  (exec_with_retries 3 conda run ${env_prefix} python collect_env.py) || return 1
}
2 changes: 2 additions & 0 deletions .github/scripts/utils_rocm.bash
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ install_rocm_ubuntu () {
print_exec rm -f "${package_name}"

echo "[INFO] Check ROCM GPU info ..."
# If rocm-smi is installed on a machine without GPUs, this will return an error
(print_exec rocminfo) || true
print_exec rocm-smi

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
Expand Down
6 changes: 6 additions & 0 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ print_gpu_info () {
return 1
fi
else
if which rocminfo; then
# If rocminfo is installed on a machine without GPUs, this will return an error
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
fi
if which rocm-smi; then
# If rocm-smi is installed on a machine without GPUs, this will return an error
(print_exec rocm-smi) || true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

name: FBGEMM_GPU-CPU Nightly Build
# This workflow is used for FBGEMM_GPU-CPU CI as well as nightly builds of
# FBGEMM_GPU-CPU against PyTorch-CPU Nightly.
name: FBGEMM_GPU-CPU CI

on:
# PR Trigger (enabled for regression checks and debugging)
Expand Down Expand Up @@ -64,7 +66,7 @@ jobs:
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]

steps:
- name: Setup Build Container
Expand Down Expand Up @@ -96,10 +98,14 @@ jobs:
- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Nightly (CPU version)
- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu

- name: Upload Built Wheel as GHA Artifact
Expand Down Expand Up @@ -128,7 +134,7 @@ jobs:
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
needs: build_artifact

steps:
Expand Down Expand Up @@ -164,10 +170,14 @@ jobs:
- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Nightly (CPU version)
- name: Install FBGEMM_GPU Wheel
run: |
. $PRELUDE
pwd; ls -la .
Expand All @@ -177,8 +187,74 @@ jobs:
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu

- name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI
- name: Push Wheel to PyPI
if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN"


build_and_test_ubuntu:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
options: --user root
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
python-version: [ "3.11" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils build-essential git pciutils sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build + Install FBGEMM_GPU (CPU version)
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu

- name: Test FBGEMM_GPU-CPU Nightly Installation
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

name: FBGEMM_GPU-CUDA Nightly Build
# This workflow is used for FBGEMM_GPU-CUDA CI as well as nightly builds of
# FBGEMM_GPU-CUDA against PyTorch-CUDA Nightly.
name: FBGEMM_GPU-CUDA CI

on:
# PR Trigger (enabled for regression checks and debugging)
Expand Down Expand Up @@ -62,7 +64,7 @@ jobs:
host-machine: [
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]

steps:
Expand Down Expand Up @@ -99,13 +101,17 @@ jobs:
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Install cuDNN
run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Nightly
- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda

- name: Upload Built Wheel as GHA Artifact
Expand Down Expand Up @@ -133,7 +139,7 @@ jobs:
host-machine: [
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
Expand Down Expand Up @@ -174,17 +180,21 @@ jobs:
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Nightly
- name: Install FBGEMM_GPU Wheel
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push FBGEMM_GPU Nightly Binary to PYPI
- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
Expand Down
Loading

0 comments on commit 03b582b

Please sign in to comment.