Skip to content

Commit

Permalink
Merge pull request #52 from ROCmSoftwarePlatform/IFU-2023-12-14
Browse files Browse the repository at this point in the history
Ifu 2023 12 14
  • Loading branch information
liligwu authored Dec 14, 2023
2 parents 61a7e50 + 10ace05 commit 03b582b
Show file tree
Hide file tree
Showing 42 changed files with 658 additions and 858 deletions.
16 changes: 13 additions & 3 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"

if [ "$arch_list" == "" ]; then
# By default, build for MI250 only to save time
local arch_list=gfx90a
echo "[BUILD] rocminfo did not return anything valid!"

# By default, we build just for MI100 and MI250 to save time. This list
# needs to be updated if the CI ROCm machines have different hardware.
# Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm
local arch_list="gfx908,gfx90a"
fi
else
echo "[BUILD] rocminfo not found in PATH!"
Expand All @@ -92,7 +96,12 @@ __configure_fbgemm_gpu_build_rocm () {
echo "[BUILD] Setting ROCm build args ..."
build_args=(
--package_variant=rocm
-DTORCH_USE_HIP_DSA=1
# HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
-DHIP_ROOT_DIR=/opt/rocm
# Enable device-side assertions in HIP
# https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA"
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
)
}

Expand Down Expand Up @@ -140,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () {
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
}
Expand Down
14 changes: 8 additions & 6 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ run_python_test () {
run_fbgemm_gpu_tests () {
local env_name="$1"
local fbgemm_variant="$2"
if [ "$env_name" == "" ]; then
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU"
echo " ${FUNCNAME[0]} build_env cuda # Run all tests applicable to CUDA"
echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm"
return 1
else
Expand All @@ -71,9 +71,11 @@ run_fbgemm_gpu_tests () {

# Enable ROCM testing if specified
if [ "$fbgemm_variant" == "rocm" ]; then
echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm tests ..."
echo "[TEST] Set environment variables for ROCm testing ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
fi

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
Expand Down Expand Up @@ -138,7 +140,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 3.10 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.10 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env 3.12 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
Expand Down Expand Up @@ -210,8 +212,8 @@ test_fbgemm_gpu_build_and_install () {
cd -
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" || return 1
cd fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
# shellcheck disable=SC2164
cd -
}
7 changes: 7 additions & 0 deletions .github/scripts/nova_postscript.bash
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ echo "[NOVA] Current working directory: $(pwd)"
# shellcheck source=.github/scripts/setup_env.bash
. "${PRELUDE}";

# Collect PyTorch environment information
collect_pytorch_env_info "${BUILD_ENV_NAME}"

# Install the wheel
install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl

# Test with PyTest
Expand All @@ -31,3 +35,6 @@ fi
$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}"

# Workaround EACCES: permission denied error at checkout step
chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
3 changes: 3 additions & 0 deletions .github/scripts/nova_prescript.bash
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ install_cxx_compiler "${BUILD_ENV_NAME}"
# Install Build Tools
install_build_tools "${BUILD_ENV_NAME}"

# Collect PyTorch environment information
collect_pytorch_env_info "${BUILD_ENV_NAME}"

if [[ $CU_VERSION = cu* ]]; then
# Extract the CUDA version number from CU_VERSION
cuda_version=$(echo "[NOVA] ${CU_VERSION}" | cut -c 3-)
Expand Down
8 changes: 8 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ install_cuda () {
# Print nvcc version
# shellcheck disable=SC2086
print_exec conda run ${env_prefix} nvcc --version

if which nvidia-smi; then
# If nvidia-smi is installed on a machine without GPUs, this will return an error
(print_exec nvidia-smi) || true
else
echo "[CHECK] nvidia-smi not found"
fi

echo "[INSTALL] Successfully installed CUDA ${cuda_version}"
}

Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/utils_pytorch.bash
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,38 @@ install_pytorch_pip () {

echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP"
}


################################################################################
# PyTorch Diagnose Functions
################################################################################

collect_pytorch_env_info () {
  local env_name="$1"
  if [ "$env_name" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME"
    echo "Example(s):"
    echo "    ${FUNCNAME[0]} build_env    # Collect PyTorch environment information from Conda environment build_env"
    return 1
  else
    echo "################################################################################"
    echo "# Collect PyTorch Environment Information (for Reporting Issues)"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""
  fi

  test_network_connection || return 1

  # Split declaration from assignment so the exit status of env_name_or_prefix
  # is not masked by `local` (fixes the issue ShellCheck SC2155 warns about,
  # instead of suppressing it)
  local env_prefix
  env_prefix=$(env_name_or_prefix "${env_name}")

  # This is the script required for collecting info and reporting to https://github.com/pytorch/pytorch/issues/new
  echo "[INFO] Downloading the PyTorch environment info collection script ..."
  # Fail fast if the download did not succeed; otherwise the `python collect_env.py`
  # invocation below would fail later with a confusing "file not found" error
  print_exec wget -q "https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py" || return 1

  echo "[INFO] Collecting PyTorch environment info (will be needed for reporting issues to PyTorch) ..."
  # shellcheck disable=SC2086
  (exec_with_retries 3 conda run ${env_prefix} python collect_env.py) || return 1
}
2 changes: 2 additions & 0 deletions .github/scripts/utils_rocm.bash
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ install_rocm_ubuntu () {
print_exec rm -f "${package_name}"

echo "[INFO] Check ROCM GPU info ..."
# If rocm-smi is installed on a machine without GPUs, this will return an error
(print_exec rocminfo) || true
print_exec rocm-smi

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
Expand Down
6 changes: 6 additions & 0 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ print_gpu_info () {
return 1
fi
else
if which rocminfo; then
# If rocminfo is installed on a machine without GPUs, this will return an error
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
fi
if which rocm-smi; then
# If rocm-smi is installed on a machine without GPUs, this will return an error
(print_exec rocm-smi) || true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

name: FBGEMM_GPU-CPU Nightly Build
# This workflow is used for FBGEMM_GPU-CPU CI as well as nightly builds of
# FBGEMM_GPU-CPU against PyTorch-CPU Nightly.
name: FBGEMM_GPU-CPU CI

on:
# PR Trigger (enabled for regression checks and debugging)
Expand Down Expand Up @@ -64,7 +66,7 @@ jobs:
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]

steps:
- name: Setup Build Container
Expand Down Expand Up @@ -96,10 +98,14 @@ jobs:
- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Nightly (CPU version)
- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu

- name: Upload Built Wheel as GHA Artifact
Expand Down Expand Up @@ -128,7 +134,7 @@ jobs:
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
needs: build_artifact

steps:
Expand Down Expand Up @@ -164,10 +170,14 @@ jobs:
- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Nightly (CPU version)
- name: Install FBGEMM_GPU Wheel
run: |
. $PRELUDE
pwd; ls -la .
Expand All @@ -177,8 +187,74 @@ jobs:
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu

- name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI
- name: Push Wheel to PyPI
if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN"


build_and_test_ubuntu:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
options: --user root
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.arm64.2xlarge" },
]
container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
python-version: [ "3.11" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils build-essential git pciutils sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build + Install FBGEMM_GPU (CPU version)
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu

- name: Test FBGEMM_GPU-CPU Nightly Installation
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

name: FBGEMM_GPU-CUDA Nightly Build
# This workflow is used for FBGEMM_GPU-CUDA CI as well as nightly builds of
# FBGEMM_GPU-CUDA against PyTorch-CUDA Nightly.
name: FBGEMM_GPU-CUDA CI

on:
# PR Trigger (enabled for regression checks and debugging)
Expand Down Expand Up @@ -62,7 +64,7 @@ jobs:
host-machine: [
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]

steps:
Expand Down Expand Up @@ -99,13 +101,17 @@ jobs:
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Install cuDNN
run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Nightly
- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda

- name: Upload Built Wheel as GHA Artifact
Expand Down Expand Up @@ -133,7 +139,7 @@ jobs:
host-machine: [
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
Expand Down Expand Up @@ -174,17 +180,21 @@ jobs:
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Nightly
- name: Install FBGEMM_GPU Wheel
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda

- name: Push FBGEMM_GPU Nightly Binary to PYPI
- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
Expand Down
Loading

0 comments on commit 03b582b

Please sign in to comment.