diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 58c5e705b2..945c9b0318 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -77,8 +77,12 @@ __configure_fbgemm_gpu_build_rocm () { echo "[BUILD] Architectures list from rocminfo: ${arch_list}" if [ "$arch_list" == "" ]; then - # By default, build for MI250 only to save time - local arch_list=gfx90a + echo "[BUILD] rocminfo did not return anything valid!" + + # By default, we build just for MI100 and MI250 to save time. This list + # needs to be updated if the CI ROCm machines have different hardware. + # Architecture mapping can be found at: https://wiki.gentoo.org/wiki/ROCm + local arch_list="gfx908,gfx90a" fi else echo "[BUILD] rocminfo not found in PATH!" @@ -92,7 +96,12 @@ __configure_fbgemm_gpu_build_rocm () { echo "[BUILD] Setting ROCm build args ..." build_args=( --package_variant=rocm - -DTORCH_USE_HIP_DSA=1 + # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake + -DHIP_ROOT_DIR=/opt/rocm + # Enable device-side assertions in HIP + # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line + -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" + -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" ) } @@ -140,6 +149,7 @@ __configure_fbgemm_gpu_build_cuda () { build_args=( --package_variant=cuda --nvml_lib_path="${nvml_lib_path}" + # Pass to PyTorch CMake -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" ) } diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 36bb802eed..0325a257dd 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -50,11 +50,11 @@ run_python_test () { run_fbgemm_gpu_tests () { local env_name="$1" local fbgemm_variant="$2" - if [ "$env_name" == "" ]; then + if [ "$fbgemm_variant" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU" + echo " ${FUNCNAME[0]} build_env cuda # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm" return 1 else @@ -71,9 +71,11 @@ run_fbgemm_gpu_tests () { # Enable ROCM testing if specified if [ "$fbgemm_variant" == "rocm" ]; then - echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm tests ..." + echo "[TEST] Set environment variables for ROCm testing ..." # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1 + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1 fi # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU @@ -138,7 +140,7 @@ test_setup_conda_environment () { if [ "$pytorch_variant_type" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_VERSION PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 3.10 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.10 + CUDA 12.1.0" + echo " ${FUNCNAME[0]} build_env 3.12 pip test cuda 12.1.0 # Setup environment with pytorch-test for Python 3.12 + CUDA 12.1.0" return 1 else echo "################################################################################" @@ -210,8 +212,8 @@ test_fbgemm_gpu_build_and_install () { cd - install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1 - cd fbgemm_gpu/test || return 1 - run_fbgemm_gpu_tests "${env_name}" || return 1 + cd fbgemm_gpu/test || return 1 + run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1 # shellcheck disable=SC2164 cd - } diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash index 4602d6bd21..19d2d4ec8d 100644 --- a/.github/scripts/nova_postscript.bash +++ b/.github/scripts/nova_postscript.bash @@ -20,6 +20,10 @@ echo "[NOVA] Current working directory: $(pwd)" # shellcheck source=.github/scripts/setup_env.bash . "${PRELUDE}"; +# Collect PyTorch environment information +collect_pytorch_env_info "${BUILD_ENV_NAME}" + +# Install the wheel install_fbgemm_gpu_wheel "${BUILD_ENV_NAME}" fbgemm_gpu/dist/*.whl # Test with PyTest @@ -31,3 +35,6 @@ fi $CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());" cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; }; run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${CPU_GPU}" + +# Workaround EACCES: permission denied error at checkout step +chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)" diff --git a/.github/scripts/nova_prescript.bash b/.github/scripts/nova_prescript.bash index f52e3b163a..9fda9d847e 100644 --- a/.github/scripts/nova_prescript.bash +++ b/.github/scripts/nova_prescript.bash @@ -33,6 +33,9 @@ install_cxx_compiler "${BUILD_ENV_NAME}" # Install Build Tools install_build_tools "${BUILD_ENV_NAME}" +# Collect PyTorch environment information +collect_pytorch_env_info "${BUILD_ENV_NAME}" + if [[ $CU_VERSION = cu* ]]; then # Extract the CUDA version number from CU_VERSION cuda_version=$(echo "[NOVA] ${CU_VERSION}" | cut -c 3-) diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 09b0b543d5..1bef3cd5ca 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -77,6 +77,14 @@ install_cuda () { # Print nvcc version # shellcheck disable=SC2086 print_exec conda run ${env_prefix} nvcc --version + + if which nvidia-smi; then + # If nvidia-smi is installed on a machine without GPUs, this will return error + (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" + fi + echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index 77e88a8130..a25d78c2d1 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -146,3 +146,38 @@ install_pytorch_pip () { echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP" } + + +################################################################################ +# PyTorch Diagnose Functions +################################################################################ + +collect_pytorch_env_info () { + local env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env # Collect PyTorch environment information from Conda environment build_env" + return 1 + else + echo "################################################################################" + echo "# Collect PyTorch Environment Information (for Reporting Issues)" + echo "#" + echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}" + echo "################################################################################" + echo "" + fi + + test_network_connection || return 1 + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + # This is the script required for collecting info and reporting to https://github.com/pytorch/pytorch/issues/new + echo "[INFO] Downloading the PyTorch environment info collection script ..." + print_exec wget -q "https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py" + + echo "[INFO] Collecting PyTorch environment info (will be needed for reporting issues to PyTorch) ..." + # shellcheck disable=SC2086 + (exec_with_retries 3 conda run ${env_prefix} python collect_env.py) || return 1 +} diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index beac98d303..ff57758047 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -75,6 +75,8 @@ install_rocm_ubuntu () { print_exec rm -f "${package_name}" echo "[INFO] Check ROCM GPU info ..." + # If rocm-smi is installed on a machine without GPUs, this will return error + (print_exec rocminfo) || true print_exec rocm-smi echo "[INSTALL] Successfully installed ROCm ${rocm_version}" diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index d6be9707ff..b01441eccc 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -118,6 +118,12 @@ print_gpu_info () { return 1 fi else + if which rocminfo; then + # If rocminfo is installed on a machine without GPUs, this will return error + (print_exec rocminfo) || true + else + echo "[CHECK] rocminfo not found" + fi if which rocm-smi; then # If rocm-smi is installed on a machine without GPUs, this will return error (print_exec rocm-smi) || true diff --git a/.github/workflows/fbgemm_gpu_cpu_nightly.yml b/.github/workflows/fbgemm_gpu_ci_cpu.yml similarity index 67% rename from .github/workflows/fbgemm_gpu_cpu_nightly.yml rename to .github/workflows/fbgemm_gpu_ci_cpu.yml index 8c5efd66fe..d331e626b8 100644 --- a/.github/workflows/fbgemm_gpu_cpu_nightly.yml +++ b/.github/workflows/fbgemm_gpu_ci_cpu.yml @@ -3,7 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU-CPU Nightly Build +# This workflow is used for FBGEMM_GPU-CPU CI as well as nightly builds of +# FBGEMM_GPU-CPU against PyTorch-CPU Nightly. +name: FBGEMM_GPU-CPU CI on: # PR Trigger (enabled for regression checks and debugging) @@ -64,7 +66,7 @@ jobs: { arch: x86, instance: "linux.4xlarge" }, { arch: arm, instance: "linux.arm64.2xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - name: Setup Build Container @@ -96,10 +98,14 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build FBGEMM_GPU Nightly (CPU version) + - name: Build FBGEMM_GPU Wheel run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cpu - name: Upload Built Wheel as GHA Artifact @@ -128,7 +134,7 @@ jobs: { arch: x86, instance: "linux.4xlarge" }, { arch: arm, instance: "linux.arm64.2xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] needs: build_artifact steps: @@ -164,10 +170,14 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Install FBGEMM_GPU Nightly (CPU version) + - name: Install FBGEMM_GPU Wheel run: | . $PRELUDE pwd; ls -la . @@ -177,8 +187,74 @@ jobs: timeout-minutes: 15 run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu - - name: Push FBGEMM_GPU Nightly (CPU version) Binary to PYPI + - name: Push Wheel to PyPI if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu_nightly_cpu-*.whl "$PYPI_TOKEN" + + + build_and_test_ubuntu: + runs-on: ${{ matrix.host-machine.instance }} + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + strategy: + fail-fast: false + matrix: + host-machine: [ + { arch: x86, instance: "linux.4xlarge" }, + { arch: arm, instance: "linux.arm64.2xlarge" }, + ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] + python-version: [ "3.11" ] + + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git pciutils sudo wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v4 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build + Install FBGEMM_GPU (CPU version) + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + + - name: Test FBGEMM_GPU-CPU Nightly Installation + timeout-minutes: 15 + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml similarity index 90% rename from .github/workflows/fbgemm_gpu_cuda_nightly.yml rename to .github/workflows/fbgemm_gpu_ci_cuda.yml index f5ed26aec3..8eb94da4ac 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_ci_cuda.yml @@ -3,7 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU-CUDA Nightly Build +# This workflow is used for FBGEMM_GPU-CUDA CI as well as nightly builds of +# FBGEMM_GPU-CUDA against PyTorch-CUDA Nightly. +name: FBGEMM_GPU-CUDA CI on: # PR Trigger (enabled for regression checks and debugging) @@ -62,7 +64,7 @@ jobs: host-machine: [ { arch: x86, instance: "linux.24xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] cuda-version: [ "11.8.0", "12.1.1" ] steps: @@ -99,13 +101,17 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build FBGEMM_GPU Nightly + - name: Build FBGEMM_GPU Wheel run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda - name: Upload Built Wheel as GHA Artifact @@ -133,7 +139,7 @@ jobs: host-machine: [ { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] cuda-version: [ "11.8.0", "12.1.1" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "12.1.1" ] @@ -174,17 +180,21 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Install FBGEMM_GPU Nightly + - name: Install FBGEMM_GPU Wheel run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - name: Test with PyTest timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda - - name: Push FBGEMM_GPU Nightly Binary to PYPI + - name: Push Wheel to PyPI if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }} env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml similarity index 59% rename from .github/workflows/fbgemm_gpu_ci.yml rename to .github/workflows/fbgemm_gpu_ci_rocm.yml index 43006e5a3e..ebf9c7f532 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci_rocm.yml @@ -3,10 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU CI +# This workflow is used for FBGEMM_GPU-ROCm CI as well as nightly builds of +# FBGEMM_GPU-ROCm against PyTorch-ROCm Nightly. +name: FBGEMM_GPU-ROCm CI on: - # PR Trigger + # PR Trigger (enabled for regression checks and debugging) # pull_request: branches: @@ -18,9 +20,23 @@ on: branches: - main - # Manual Trigger (for testing only) + # Cron Trigger (UTC) + # + # Based on the Conda page for PyTorch-nightly, the GPU nightly releases appear + # around 02:30 PST every day (roughly 2 hours after the CPU releases) + # + schedule: + - cron: '45 12 * * *' + + # Manual Trigger # workflow_dispatch: + inputs: + publish_to_pypi: + description: Publish Artifact to PyPI + type: boolean + required: false + default: false concurrency: # Cancel previous runs in the PR if a new commit is pushed @@ -28,7 +44,8 @@ concurrency: cancel-in-progress: true jobs: - build_and_test_amd: + # Build on CPU hosts and upload to GHA + build_artifact: runs-on: ${{ matrix.host-machine.instance }} container: image: ${{ matrix.container-image }} @@ -43,10 +60,10 @@ jobs: fail-fast: false matrix: host-machine: [ - { arch: x86, instance: "linux.12xlarge" }, + { arch: x86, instance: "linux.24xlarge" }, ] container-image: [ "ubuntu:20.04" ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] rocm-version: [ "5.7" ] steps: @@ -79,24 +96,34 @@ jobs: - name: Install ROCm run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }} + - name: Install C/C++ Compilers + run: . $PRELUDE; install_cxx_compiler $BUILD_ENV + - name: Install Build Tools run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build FBGEMM_GPU-ROCm Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a + - name: Build FBGEMM_GPU Wheel + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly rocm - - name: Test FBGEMM_GPU-ROCm Nightly Installation - timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm + - name: Upload Built Wheel as GHA Artifact + uses: actions/upload-artifact@v3 + with: + name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl + path: fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-*.whl - test_amd_gpu: + # Download the built artifact from GHA, test on GPU, and push to PyPI + test_and_publish_artifact: runs-on: ${{ matrix.host-machine.instance }} container: image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" @@ -114,9 +141,10 @@ jobs: host-machine: [ { arch: x86, instance: "rocm" }, ] - # ROCm machines are limited, so we only test against Python 3.10 - python-version: [ "3.10" ] + # ROCm machines are limited, so we only test a subset of Python versions + python-version: [ "3.11", "3.12" ] rocm-version: [ "5.7" ] + needs: build_artifact steps: - name: Setup Build Container @@ -126,9 +154,12 @@ jobs: git config --global --add safe.directory '*' - name: Checkout the Repository - uses: actions/checkout@v4 + uses: actions/checkout@v3 + + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 with: - submodules: true + name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl - name: Display System Info run: . $PRELUDE; print_system_info @@ -151,74 +182,16 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM_GPU Build - run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - - name: Build FBGEMM_GPU-ROCm Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm - - - name: Test FBGEMM_GPU-ROCm Nightly Installation - timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm - - - build_and_test_cpu: - runs-on: ${{ matrix.host-machine.instance }} - container: - image: ${{ matrix.container-image }} - options: --user root - defaults: - run: - shell: bash - env: - PRELUDE: .github/scripts/setup_env.bash - BUILD_ENV: build_binary - strategy: - fail-fast: false - matrix: - host-machine: [ - { arch: x86, instance: "linux.4xlarge" }, - { arch: arm, instance: "linux.arm64.2xlarge" }, - ] - container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] - - steps: - - name: Setup Build Container - run: | - apt update -y - apt install -y binutils build-essential git pciutils sudo wget - git config --global --add safe.directory '*' - - - name: Checkout the Repository - uses: actions/checkout@v4 - with: - submodules: true - - - name: Display System Info - run: . $PRELUDE; print_system_info - - - name: Display GPU Info - run: . $PRELUDE; print_gpu_info - - - name: Setup Miniconda - run: . $PRELUDE; setup_miniconda $HOME/miniconda - - - name: Create Conda Environment - run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - - - name: Install Build Tools - run: . $PRELUDE; install_build_tools $BUILD_ENV - - - name: Install PyTorch - run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build + Install FBGEMM_GPU (CPU version) - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + - name: Install FBGEMM_GPU Wheel + run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl - - name: Test FBGEMM_GPU-CPU Nightly Installation + - name: Test with PyTest timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml index d3a69bca5b..0dc3d6e890 100644 --- a/.github/workflows/fbgemm_gpu_docs.yml +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -72,6 +72,10 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index d9f6dc2ff6..f1e433cc3c 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# This workflow is used for testing the download and installation of FBGEMM_GPU +# nightly releases published to PyTorch PyPI. name: FBGEMM_GPU PIP Install + Test on: @@ -59,7 +61,7 @@ jobs: { instance: "linux.4xlarge" }, { instance: "linux.arm64.2xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - name: Setup Build Container @@ -86,6 +88,10 @@ jobs: - name: Install PyTorch-CPU run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Install FBGEMM_GPU-CPU run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cpu @@ -110,7 +116,7 @@ jobs: host-machine: [ { instance: "linux.g5.4xlarge.nvidia.gpu" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] cuda-version: [ "11.8.0", "12.1.1" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.8.0" ] @@ -144,12 +150,16 @@ jobs: - name: Install PyTorch-CUDA run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} cuda ${{ matrix.cuda-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Install FBGEMM_GPU-CUDA run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} cuda ${{ matrix.cuda-version }} - name: Test with PyTest timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda test_pypi_install_rocm: @@ -169,10 +179,10 @@ jobs: fail-fast: false matrix: host-machine: [ - { instance: "rocm" }, + { arch: x86, instance: "rocm" }, ] - # ROCm machines are limited, so we only test against Python 3.10 - python-version: [ "3.10" ] + # ROCm machines are limited, so we only test a subset of Python versions + python-version: [ "3.11", "3.12" ] rocm-version: [ "5.7" ] steps: @@ -206,6 +216,10 @@ jobs: - name: Install PyTorch-ROCm run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_version || 'nightly' }} rocm ${{ matrix.rocm-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Install FBGEMM_GPU-ROCm run: . $PRELUDE; install_fbgemm_gpu_pip $BUILD_ENV ${{ github.event.inputs.fbgemm_gpu_version || 'nightly' }} rocm ${{ matrix.rocm-version }} diff --git a/.github/workflows/fbgemm_gpu_cpu_release.yml b/.github/workflows/fbgemm_gpu_release_cpu.yml similarity index 89% rename from .github/workflows/fbgemm_gpu_cpu_release.yml rename to .github/workflows/fbgemm_gpu_release_cpu.yml index aba87df783..213164cc59 100644 --- a/.github/workflows/fbgemm_gpu_cpu_release.yml +++ b/.github/workflows/fbgemm_gpu_release_cpu.yml @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# This workflow is used for building and publishing FBGEMM_GPU-CPU release +# builds against PyTorch-CPU Release to public PyPI. name: FBGEMM_GPU-CPU Release Build on: @@ -10,13 +12,13 @@ on: # pull_request: branches: - - main + - ^v([0-9]+)\.([0-9]+)\.([0-9]+)-release # Push Trigger (enable to catch errors coming out of multiple merges) # push: branches: - - main + - ^v([0-9]+)\.([0-9]+)\.([0-9]+)-release # Manual Trigger # @@ -61,7 +63,7 @@ jobs: { arch: x86, instance: "linux.4xlarge" }, { arch: arm, instance: "linux.arm64.2xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - name: Setup Build Container @@ -93,6 +95,10 @@ jobs: - name: Install PyTorch-CPU Test run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -125,7 +131,7 @@ jobs: { arch: x86, instance: "linux.4xlarge" }, { arch: arm, instance: "linux.arm64.2xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] needs: build_artifact steps: @@ -157,6 +163,10 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cpu + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml similarity index 89% rename from .github/workflows/fbgemm_gpu_cuda_release.yml rename to .github/workflows/fbgemm_gpu_release_cuda.yml index 74b79a88dc..72f42db605 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_release_cuda.yml @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# This workflow is used for building and publishing FBGEMM_GPU-CUDA release +# builds against PyTorch-CUDA Release to public PyPI. name: FBGEMM_GPU-CUDA Release Build on: @@ -10,13 +12,13 @@ on: # pull_request: branches: - - main + - ^v([0-9]+)\.([0-9]+)\.([0-9]+)-release # Push Trigger (enable to catch errors coming out of multiple merges) # push: branches: - - main + - ^v([0-9]+)\.([0-9]+)\.([0-9]+)-release # Manual Trigger # @@ -66,7 +68,7 @@ jobs: host-machine: [ { arch: x86, instance: "linux.24xlarge" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] cuda-version: [ "11.8.0", "12.1.1" ] steps: @@ -102,6 +104,10 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} @@ -134,7 +140,7 @@ jobs: host-machine: [ { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" }, ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] cuda-version: [ "11.8.0", "12.1.1" ] needs: build_artifact @@ -170,6 +176,10 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ github.event.inputs.pytorch_channel || 'test' }} cuda ${{ matrix.cuda-version }} + - name: Collect PyTorch Environment Info + if: ${{ success() || failure() }} + run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -178,7 +188,7 @@ jobs: - name: Test with PyTest timeout-minutes: 15 - run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda - name: Push FBGEMM_GPU Binary to PYPI if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }} diff --git a/fbgemm_gpu/bench/merge_embeddings_benchmark.py b/fbgemm_gpu/bench/merge_embeddings_benchmark.py index d7f574d6f7..8059c85541 100644 --- a/fbgemm_gpu/bench/merge_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/merge_embeddings_benchmark.py @@ -499,7 +499,6 @@ def main( "output size (MB), all-to-one BW (GB/s), link BW (GB/s), t (ms)" ) if sweep: - # pyre-fixme[3]: Return type must be annotated. # pyre-fixme[2]: Parameter must be annotated. def handler(signum, frame): diff --git a/fbgemm_gpu/bench/sparse_ops_benchmark.py b/fbgemm_gpu/bench/sparse_ops_benchmark.py index a578f3f40a..0602d0ae82 100644 --- a/fbgemm_gpu/bench/sparse_ops_benchmark.py +++ b/fbgemm_gpu/bench/sparse_ops_benchmark.py @@ -878,7 +878,6 @@ def ben(fn, name, ad_indices, ad_lengths, batch_offsets, num_ads_in_batch): def block_bucketize_sparse_features_bench( row_size: int, batch_size: int, bucket_num: int, input_precision: str, device: str ) -> None: - dtype = torch.int if input_precision == "int": dtype = torch.int diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index 143b8a0e3d..cb7d30a817 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -785,6 +785,7 @@ def benchmark_cpu_requests( @click.option("--output-dtype", type=SparseType, default=SparseType.FP16) @click.option("--fp8-exponent-bits", type=int, default=None) @click.option("--fp8-exponent-bias", type=int, default=None) +@click.option("--pooling", type=str, default="sum") def nbit_cpu( # noqa C901 alpha: float, bag_size: int, @@ -807,6 +808,7 @@ def nbit_cpu( # noqa C901 output_dtype: SparseType, fp8_exponent_bits: Optional[int], fp8_exponent_bias: Optional[int], + pooling: str, ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -825,11 +827,23 @@ def nbit_cpu( # noqa C901 else: Ds = [D] * T + if pooling is None or pooling == "sum": + pooling = "sum" + pooling_mode = PoolingMode.SUM + do_pooling = True + elif pooling == "mean": + pooling_mode = PoolingMode.MEAN + do_pooling = True + else: # "none" + pooling_mode = PoolingMode.NONE + do_pooling = False + emb = IntNBitTableBatchedEmbeddingBagsCodegen( [("", E, d, weights_precision, EmbeddingLocation.HOST) for d in Ds], device="cpu", index_remapping=[torch.arange(E) for _ in Ds] if index_remapping else None, output_dtype=output_dtype, + pooling_mode=pooling_mode, fp8_exponent_bits=fp8_exponent_bits, fp8_exponent_bias=fp8_exponent_bias, ).cpu() @@ -839,9 +853,16 @@ def nbit_cpu( # noqa C901 nparams_byte = sum(w.numel() for (w, _) in emb.split_embedding_weights()) param_size_multiplier = weights_precision.bit_rate() / 8.0 output_size_multiplier = output_dtype.bit_rate() / 8.0 - read_write_bytes = ( - output_size_multiplier * B * T * D + param_size_multiplier * B * T * L * D - ) + if do_pooling: + read_write_bytes = ( + output_size_multiplier * B * T * D + param_size_multiplier * B * T * L * D + ) + else: + read_write_bytes = ( + output_size_multiplier * B * T * L * D + + param_size_multiplier * B * T * L * D + ) + logging.info( f"{weights_precision} Embedding tables: {E * T} rows, {nparams_byte / param_size_multiplier / 1.0e9: .2f} GParam, " f"{nparams_byte / 1.0e9: .2f} GB" # IntN TBE use byte for storage diff --git a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp index fbdbcf45b4..5f0f445f2f 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp @@ -403,9 +403,9 @@ for (const auto d : c10::irange(D)) { TORCH_LIBRARY_FRAGMENT(fbgemm, m) { {% if not dense %} - m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int")}}, int output_dtype = 0) -> ()"); + m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}, int output_dtype = 0) -> ()"); {% else %} - m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int")}}) -> Tensor"); + m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}) -> Tensor"); {% endif %} DISPATCH_TO_CPU("split_embedding_backward_codegen_{{ optimizer }}_cpu", split_embedding_backward_codegen_{{ optimizer }}_cpu); } diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index 2791dd3c12..2efef10f82 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "fbgemm_gpu/dispatch_macros.h" #include "fbgemm_gpu/embedding_common.h" #include "fbgemm_gpu/sparse_ops_utils.h" @@ -169,7 +170,8 @@ TORCH_LIBRARY_FRAGMENT(fb, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, int max_B=-1) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? B_offsets=None, int max_B=-1) -> ()", + {PT2_COMPLIANT_TAG}); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 652fc894cf..6540dfc6ca 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -200,6 +200,13 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ const float* indice_weights_acc = indice_weights.data_ptr(); {% endif %} + using float16 = uint16_t; + using bfloat16 = uint16_t; + using fbgemm_out_t = typename std::conditional< + std::is_same::value, + float16, + std::conditional::value, bfloat16, float>::type >::type; + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_", [&] { const auto* indices_acc = indices.data_ptr(); const auto* offsets_acc = offsets.data_ptr(); @@ -208,10 +215,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ auto* output_acc = output.data_ptr(); int32_t num_indices_m_1 = indices.numel() - 1; - int32_t D_start_ = 0; -for (const auto t : c10::irange(T)) { + for (const auto t : c10::irange(T)) { {% if not nobag %} const auto* D_offsets_acc = D_offsets.data_ptr(); const int32_t D_start = D_offsets_acc[t]; @@ -226,164 +232,100 @@ for (const auto t : c10::irange(T)) { const auto& weight_tensor = (placement == PlacementType::HOST) ? dev_weights : uvm_weights; weights_acc = weight_tensor.data_ptr(); const uint8_t* weights = &weights_acc[weights_offsets_acc[t]]; - auto weight_ty = static_cast(weights_tys_acc[t]); + const auto weight_ty = static_cast(weights_tys_acc[t]); // default to 1 byte alignment for CPU TBE const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment); int tt; for (tt = t + 1; tt < T && weights_offsets_acc[tt] == weights_offsets_acc[t]; ++tt); - size_t num_rows = ((tt == T ? weight_tensor.numel() : weights_offsets_acc[tt]) - weights_offsets_acc[t]) / D_bytes; + const size_t num_rows = ((tt == T ? weight_tensor.numel() : weights_offsets_acc[tt]) - weights_offsets_acc[t]) / D_bytes; const index_t* offsets_begin_ptr = offsets_acc + t * B; - using float16 = uint16_t; - using bfloat16 = uint16_t; - using fbgemm_out_t = typename std::conditional< - std::is_same::value, - float16, - std::conditional::value, bfloat16, float>::type >::type; - bool success = true; - bool has_weight = {{ "true" if weighted else "false" }}; - bool normalize_by_lengths = static_cast(pooling_mode) == PoolingMode::MEAN; + const bool has_weight = {{ "true" if weighted else "false" }}; + const bool normalize_by_lengths = static_cast(pooling_mode) == PoolingMode::MEAN; + + const index_t index_size = offsets_acc[(t + 1) * B] - *offsets_begin_ptr; + + {% if nobag %} + // Create virtual offsets for the nobag case. Lengths are all ones. + const auto offsets_nobag = at::arange(*offsets_begin_ptr, offsets_acc[(t + 1) * B] + 1, offsets.options()); + const index_t* offsets_nobag_ptr = offsets_nobag.data_ptr(); + TORCH_CHECK(offsets_nobag.numel() == index_size + 1); + TORCH_CHECK(offsets_nobag_ptr[index_size] - offsets_nobag_ptr[0] == index_size); + {% endif %} - index_t index_size = offsets_acc[(t + 1) * B] - *offsets_begin_ptr; const float* indice_weights_ptr = nullptr; {% if weighted %} indice_weights_ptr = indice_weights_acc + *offsets_begin_ptr; {% endif %} + + {% macro generate_and_exec_kernel(weight_type, use_base, use_nbit, use_fp8) %} + {% set has_asmjit = use_base or use_nbit %} + {% set kernel_name = "GenerateEmbeddingSpMDMWithStrides" + if use_base else ("GenerateEmbeddingSpMDMNBitWithStrides" + if use_nbit else "GenerateEmbeddingSpMDMFP8WithStrides") + %} + const auto kernel = fbgemm::{{ kernel_name }}< + {% if use_base %} + {{ weight_type }}, + {% endif %} + index_t, + index_t, + {% if has_asmjit %} + fbgemm_out_t, + /*THREAD_LOCAL=*/true + {% else %} + fbgemm_out_t + {% endif %} + >( + {% if use_nbit %} + /*bit_rate=*/bit_rate, + {% endif %} + D, + {% if has_asmjit %} + has_weight, + {% endif %} + normalize_by_lengths, + {% if has_asmjit %} + /*prefetch=*/16, + {% endif %} + /*is_weight_positional=*/false, + /*use_offsets=*/true, + /*output_stride=*/{{ "total_D" if not nobag else "D" }}, + /*input_stride=*/D_bytes / sizeof({{ weight_type }}), + {% if use_fp8 %} + /*exponent_bits=*/fp8_exponent_bits, + /*exponent_bias=*/fp8_exponent_bias, + {% endif %} + {% if has_asmjit %} + /*scale_bias_last=*/false, + {% endif %} + {% if use_base %} + /*no_bag=*/false, + {% endif %} + /*is_bf16_out=*/output_is_bf16 + ); + success = kernel( + {{ "B" if not nobag else "index_size"}}, + index_size, + num_rows, + reinterpret_cast(weights), + indices_acc + *offsets_begin_ptr, + {{ "offsets_begin_ptr" if not nobag else "offsets_nobag_ptr" }}, + indice_weights_ptr, + reinterpret_cast(output_acc + D_start)); + {% endmacro %} + if (weight_ty == SparseType::FP32) { - auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides( - D, - has_weight, - normalize_by_lengths, - /*prefetch=*/16, - /*is_weight_positional=*/false, - /*use_offsets=*/true, - {% if not nobag %} - /*output_stride=*/total_D, - {% else %} - /*output_stride=*/D, - {% endif %} - /*input_stride=*/D_bytes / sizeof(float), - {% if not nobag %} - /*scale_bias_last=*/false, - /*no_bag=*/false, - /*is_bf16_out=*/output_is_bf16); - {% else %} - /*scale_bias_last=*/false, - /*no_bag=*/true, - /*is_bf16_out=*/output_is_bf16); - {% endif %} - success = kernel( - {% if not nobag %} - B, - {% else %} - index_size, - {% endif %} - index_size, - num_rows, - reinterpret_cast(weights), - indices_acc + *offsets_begin_ptr, - offsets_begin_ptr, - indice_weights_ptr, - reinterpret_cast(output_acc + D_start)); + {{ generate_and_exec_kernel("float", True, False, False) }} } else if (weight_ty == SparseType::FP16) { - auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides( - D, - has_weight, - normalize_by_lengths, - /*prefetch=*/16, - /*is_weight_positional=*/false, - /*use_offsets=*/true, - {% if not nobag %} - /*output_stride=*/total_D, - {% else %} - /*output_stride=*/D, - {% endif %} - /*input_stride=*/D_bytes / sizeof(float16), - {% if not nobag %} - /*scale_bias_last=*/false, - /*no_bag=*/false, - /*is_bf16_out=*/output_is_bf16); - {% else %} - /*scale_bias_last=*/false, - /*no_bag=*/true, - /*is_bf16_out=*/output_is_bf16); - {% endif %} - success = kernel( - {% if not nobag %} - B, - {% else %} - index_size, - {% endif %} - index_size, - num_rows, - reinterpret_cast(weights), - indices_acc + *offsets_begin_ptr, - offsets_begin_ptr, - indice_weights_ptr, - reinterpret_cast(output_acc + D_start)); + {{ generate_and_exec_kernel("float16", True, False, False) }} + } else if (weight_ty == SparseType::INT8) { + {{ generate_and_exec_kernel("uint8_t", True, False, False) }} } else if (weight_ty == SparseType::FP8) { assert(fp8_exponent_bits > 0 && fp8_exponent_bias > 0); - auto kernel = fbgemm::GenerateEmbeddingSpMDMFP8WithStrides( - D, - normalize_by_lengths, - /*is_weight_positional=*/false, - /*use_offsets=*/true, - {% if not nobag %} - /*output_stride=*/total_D, - {% else %} - /*output_stride=*/D, - {% endif %} - /*input_stride=*/D_bytes / sizeof(uint8_t), - /*exponent_bits=*/fp8_exponent_bits, - /*exponent_bias=*/fp8_exponent_bias, - /*is_bf16_out=*/output_is_bf16); - success = kernel( - B, - index_size, - num_rows, - weights, - indices_acc + *offsets_begin_ptr, - offsets_begin_ptr, - indice_weights_ptr, - reinterpret_cast(output_acc + D_start)); - } else if (weight_ty == SparseType::INT8) { - auto kernel = fbgemm::GenerateEmbeddingSpMDMWithStrides( - D, - has_weight, - normalize_by_lengths, - /*prefetch=*/16, - /*is_weight_positional=*/false, - /*use_offsets=*/true, - {% if not nobag %} - /*output_stride=*/total_D, - {% else %} - /*output_stride=*/D, - {% endif %} - /*input_stride=*/D_bytes / sizeof(uint8_t), - {% if not nobag %} - /*scale_bias_last=*/false, - /*no_bag=*/false, - /*is_bf16_out=*/output_is_bf16); - {% else %} - /*scale_bias_last=*/false, - /*no_bag=*/true, - /*is_bf16_out=*/output_is_bf16); - {% endif %} - success = kernel( - {% if not nobag %} - B, - {% else %} - index_size, - {% endif %} - index_size, - num_rows, - weights, - indices_acc + *offsets_begin_ptr, - offsets_begin_ptr, - indice_weights_ptr, - reinterpret_cast(output_acc + D_start)); + {{ generate_and_exec_kernel("uint8_t", False, False, True) }} } else if (weight_ty == SparseType::INT4 || weight_ty == SparseType::INT2) { int bit_rate; switch (weight_ty) { @@ -394,35 +336,13 @@ for (const auto t : c10::irange(T)) { bit_rate = 2; break; default: - throw std::logic_error("Unsupported SparseType: " + std::to_string(static_cast(weight_ty))); + throw std::logic_error( + "Unsupported SparseType: " + std::to_string(static_cast(weight_ty))); } - auto kernel = fbgemm::GenerateEmbeddingSpMDMNBitWithStrides( - /*bit_rate=*/bit_rate, - D, - has_weight, - normalize_by_lengths, - /*prefetch=*/16, - /*is_weight_positional=*/false, - /*use_offsets=*/true, - {% if not nobag %} - /*output_stride=*/total_D, - {% else %} - /*output_stride=*/D, - {% endif %} - /*input_stride=*/D_bytes / sizeof(uint8_t), - /*scale_bias_last=*/false, - /*is_bf16_out=*/output_is_bf16); - success = kernel( - B, - index_size, - num_rows, - weights, - indices_acc + *offsets_begin_ptr, - offsets_begin_ptr, - indice_weights_ptr, - reinterpret_cast(output_acc + D_start)); + {{ generate_and_exec_kernel("uint8_t", False, True, False) }} } else { - throw std::logic_error("Unsupported SparseType: " + std::to_string(static_cast(weight_ty))); + throw std::logic_error( + "Unsupported SparseType: " + std::to_string(static_cast(weight_ty))); } if (!success) { fbgemm_gpu::report_embedding_error( diff --git a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp index b440652b28..7d14b52664 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp @@ -234,6 +234,42 @@ Tensor split_embedding_codegen_forward_cpu( return output; } +Tensor split_embedding_codegen_forward_cpu_meta( + Tensor weights, + Tensor weights_offsets, + Tensor D_offsets, + int64_t total_D, + Tensor hash_size_cumsum, + Tensor indices, + Tensor offsets, + int64_t pooling_mode, + Tensor indice_weights, + int64_t output_dtype) { + c10::SymInt T = D_offsets.sym_numel() - 1; + TORCH_CHECK_GT(T, 0); + // offsets = [T x B + 1] + c10::SymInt B = (offsets.sym_size(0) - 1) / T; + TORCH_CHECK_GE(B, 0); + + Tensor output; + if (output_dtype == static_cast(SparseType::FP32)) { + output = + at::empty_symint({B, total_D}, weights.options().dtype(at::kFloat)); + } else if (output_dtype == static_cast(SparseType::FP16)) { + output = at::empty_symint({B, total_D}, weights.options().dtype(at::kHalf)); + } else if (output_dtype == static_cast(SparseType::BF16)) { + output = + at::empty_symint({B, total_D}, weights.options().dtype(at::kBFloat16)); + } else { + output = at::empty_symint({B, total_D}, weights.options()); + } + + // It is assumed that the indice_weights will always be float + TORCH_CHECK( + !indice_weights.defined() || indice_weights.scalar_type() != at::kHalf); + return output; +} + template void split_embedding_grad_indice_weights_cpu_kernel( Tensor grad_output, @@ -632,4 +668,10 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { split_embedding_codegen_forward_cpu); } +TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { + m.impl( + "split_embedding_codegen_forward_cpu", + &split_embedding_codegen_forward_cpu_meta); +} + } // namespace diff --git a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp index 0c8c930ecd..f9067f0a88 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp @@ -179,6 +179,10 @@ Tensor return output; } + {%- if not nobag and vbe %} + output = output.reshape({-1}); + {%- endif %} + return output; } diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py index 0979959a53..c6bfa60e0a 100644 --- a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py @@ -326,20 +326,6 @@ def merge_pooled_embeddings( ) -@impl_abstract("fbgemm::bounds_check_indices") -def bounds_check_indices( - rows_per_table: torch.Tensor, - indices: torch.Tensor, - offsets: torch.Tensor, - bounds_check_mode: int, - warning: torch.Tensor, - weights: Optional[torch.Tensor] = None, - B_offsets: Optional[torch.Tensor] = None, - max_B: int = -1, -) -> None: - pass - - @impl_abstract("fbgemm::permute_sparse_features") def permute_sparse_features_abstract( permute: Tensor, lengths: Tensor, indices: Tensor, weights: Optional[Tensor] = None @@ -371,3 +357,31 @@ def segment_sum_csr_abstract( output_size = csr_seg.numel() - 1 output = values.new_empty(output_size) return output + + +@impl_abstract("fbgemm::dense_to_jagged_forward") +def dense_to_jagged_forward( + dense: torch.Tensor, + offsets: List[torch.Tensor], + total_L: Optional[torch.SymInt] = None, +) -> torch.Tensor: + if not total_L: + total_L = torch.library.get_ctx().new_dynamic_size() + return dense.new_zeros( + total_L, + dense.size()[-1], + dtype=dense.dtype, + device=dense.device, + layout=dense.layout, + ) + + +@impl_abstract("fbgemm::dense_to_jagged") +def dense_to_jagged( + dense: torch.Tensor, + offsets: List[torch.Tensor], + total_L: Optional[torch.SymInt] = None, +) -> Tuple[torch.Tensor, List[torch.Tensor]]: + if not total_L: + total_L = torch.library.get_ctx().new_dynamic_size() + return (dense_to_jagged_forward(dense, offsets, total_L), offsets) diff --git a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h index 834a226ce4..05f391e597 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h +++ b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h @@ -203,8 +203,10 @@ TYPE, NAME, FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE(__VA_ARGS__)) // We can cleanup the following once fbgemm uses PyTorch 2.2 in January 2024. +#ifndef PT2_COMPLIANT_TAG #ifdef HAS_PT2_COMPLIANT_TAG #define PT2_COMPLIANT_TAG at::Tag::pt2_compliant_tag #else #define PT2_COMPLIANT_TAG #endif +#endif diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index cb7ab36475..deee823190 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -393,18 +393,17 @@ def main(argv: List[str]) -> None: cmdclass={ "install": FbgemmGpuInstaller, }, - # PyPI package information. + # PyPI package information classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Artificial Intelligence", + ] + + [ + f"Programming Language :: Python :: {x}" + for x in ["3", "3.8", "3.9", "3.10", "3.11", "3.12"] ], ) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 5a1753b239..fb5ba53798 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -1635,6 +1635,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // SymInt is a new PyTorch 2.0 feature to support dynamic shape. See more // details at https://pytorch.org/get-started/pytorch-2.0/#dynamic-shapes. If // you find it doesn't compile, please pull the new PyTorch 2.0 code + m.impl_abstract_pystub( + "fbgemm_gpu.sparse_ops", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_py"); m.def( "dense_to_jagged(Tensor dense, Tensor[] x_offsets, SymInt? total_L=None) -> (Tensor, Tensor[])", {PT2_COMPLIANT_TAG}); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp index b9e249cb90..fabcd6455b 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp @@ -92,28 +92,6 @@ Tensor jagged_dense_elementwise_add_meta( return at::empty_like(y); } -Tensor dense_to_jagged_forward_meta( - const Tensor& dense, - const std::vector& offsets, - c10::optional total_L) { - auto dense_values = dense; - at::SymInt D = dense_values.sym_size(-1); - TORCH_CHECK_NOT_IMPLEMENTED( - total_L.has_value(), "total_L is required for meta backend"); - auto& total_L_computed = total_L.value(); - auto values = at::zeros_symint({total_L_computed, D}, dense_values.options()); - - TORCH_CHECK(values.is_meta()); - return values; -} - -std::tuple> dense_to_jagged_meta( - const Tensor& dense, - const std::vector& offsets, - c10::optional total_L) { - return {dense_to_jagged_forward_meta(dense, offsets, total_L), offsets}; -} - std::tuple> jagged_dense_elementwise_mul_meta( const Tensor& x_values, const std::vector& x_offsets, @@ -241,10 +219,6 @@ TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { m.impl( "jagged_to_padded_dense_backward", TORCH_FN(fbgemm_gpu::jagged_to_padded_dense_backward_meta)); - m.impl( - "dense_to_jagged_forward", - TORCH_FN(fbgemm_gpu::dense_to_jagged_forward_meta)); - m.impl("dense_to_jagged", TORCH_FN(fbgemm_gpu::dense_to_jagged_meta)); m.impl( "jagged_dense_dense_elementwise_add_jagged_output_forward", TORCH_FN( diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp index a6ff0d5dce..09f5c011ad 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -148,7 +148,8 @@ at::Tensor permute_pooled_embs_auto_grad_meta( TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( - "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor"); + "permute_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor", {PT2_COMPLIANT_TAG}); diff --git a/fbgemm_gpu/src/sparse_ops/common.cuh b/fbgemm_gpu/src/sparse_ops/common.cuh index 5cfca60e23..021736a675 100644 --- a/fbgemm_gpu/src/sparse_ops/common.cuh +++ b/fbgemm_gpu/src/sparse_ops/common.cuh @@ -32,7 +32,7 @@ #include "fbgemm_gpu/split_embeddings_utils.cuh" #ifdef USE_ROCM -#include +#include #endif #ifdef USE_ROCM diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index ae17a393c5..7f3922417c 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -2769,7 +2769,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "generic_histogram_binning_calibration_by_feature(Tensor logit, Tensor segment_value, Tensor segment_lengths, SymInt num_segments, Tensor bin_num_examples, Tensor bin_num_positives, Tensor bin_boundaries, float positive_weight, SymInt bin_ctr_in_use_after, float bin_ctr_weight_value) -> (Tensor, Tensor)"); m.def( - "segment_sum_csr(SymInt batch_size, Tensor csr_seg, Tensor values) -> Tensor"); + "segment_sum_csr(SymInt batch_size, Tensor csr_seg, Tensor values) -> Tensor", + {PT2_COMPLIANT_TAG}); m.def( "embedding_bag_rowwise_prune(Tensor weight, Tensor indicator, float threshold, ScalarType compressed_indices_dtype, bool abs=True, SymInt min_num_rows=0, float? min_save_ratio=1.0) -> (Tensor, Tensor)"); m.def("lengths_range(Tensor t_in, SymInt[]? shape=None) -> Tensor"); diff --git a/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp b/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp index e464c879d3..38195ece18 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp +++ b/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp @@ -12,6 +12,30 @@ #include #include +using Tensor = at::Tensor; +using namespace fbgemm_gpu; + +namespace { + +std::tuple +generate_vbe_metadata_meta( + const Tensor& B_offsets, + const Tensor& B_offsets_rank_per_feature, + const Tensor& output_offsets_feature_rank, + const Tensor& D_offsets, + const int64_t D, + const bool nobag, + const int64_t max_B_feature_rank, + const int64_t info_B_num_bits, + const c10::SymInt total_B) { + Tensor row_output_offsets = + at::empty_symint({total_B}, output_offsets_feature_rank.options()); + Tensor b_t_map = at::empty_symint({total_B}, B_offsets.options()); + return {row_output_offsets, b_t_map}; +} + +} // namespace + TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "transpose_embedding_input(" @@ -40,9 +64,13 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { " bool nobag, " " int max_B_feature_rank, " " int info_B_num_bits, " - " int total_B" + " SymInt total_B" ") -> (Tensor, Tensor)"); DISPATCH_TO_CUDA("transpose_embedding_input", transpose_embedding_input); DISPATCH_TO_CUDA("get_infos_metadata", get_infos_metadata); DISPATCH_TO_CUDA("generate_vbe_metadata", generate_vbe_metadata); } + +TORCH_LIBRARY_IMPL(fbgemm, Meta, m) { + m.impl("generate_vbe_metadata", &generate_vbe_metadata_meta); +} diff --git a/fbgemm_gpu/test/batched_unary_embeddings_test.py b/fbgemm_gpu/test/batched_unary_embeddings_test.py index 3d63aff90f..1577a11f3b 100644 --- a/fbgemm_gpu/test/batched_unary_embeddings_test.py +++ b/fbgemm_gpu/test/batched_unary_embeddings_test.py @@ -7,9 +7,10 @@ import random +import sys import unittest from math import sqrt -from typing import List, Tuple +from typing import Callable, List, Tuple import fbgemm_gpu.batched_unary_embeddings_ops as batched_unary_embeddings_ops import numpy as np @@ -45,6 +46,15 @@ } +# pyre-fixme[2] +# pyre-fixme[24] +def torch_compiled(model: Callable, **kwargs) -> Callable: + if sys.version_info < (3, 12, 0): + return torch.compile(model, **kwargs) + else: + return model + + class TableBatchedEmbeddingsTest(unittest.TestCase): class RefEmb(torch.nn.Module): def __init__(self, num_tasks: int, hash_sizes: List[int]) -> None: @@ -147,7 +157,7 @@ def _test_main( param.detach().copy_(ref_emb.emb_modules[i].weight) output_ref = ref_emb(offsets, indices) if torch_compile: - unary_emb = torch.compile(unary_emb, dynamic=True, fullgraph=True) + unary_emb = torch_compiled(unary_emb, dynamic=True, fullgraph=True) output = unary_emb(offsets_tensor, indices_tensor) torch.testing.assert_close( output_ref, @@ -169,7 +179,7 @@ def _test_main( param.detach().copy_(ref_emb.emb_modules[i].weight) output_ref = ref_emb(offsets, indices) if torch_compile: - unary_emb = torch.compile(unary_emb, dynamic=True, fullgraph=True) + unary_emb = torch_compiled(unary_emb, dynamic=True, fullgraph=True) output = unary_emb(offsets_tensor.long(), indices_tensor.long()) torch.testing.assert_close( output_ref, diff --git a/fbgemm_gpu/test/failures_dict.json b/fbgemm_gpu/test/failures_dict.json index 43efa968c9..eaf0ba2389 100644 --- a/fbgemm_gpu/test/failures_dict.json +++ b/fbgemm_gpu/test/failures_dict.json @@ -115,40 +115,7 @@ "status": "xfail" } }, - "fbgemm::dense_to_jagged": { - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_aot_dispatch_dynamic__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_meta_backend": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt": { - "comment": "", - "status": "xfail" - }, - "JaggedTensorOpsTest.test_faketensor__test_dense_to_jagged_opt_large_batch": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::dense_to_jagged": {}, "fbgemm::expand_into_jagged_permute": {}, "fbgemm::generic_histogram_binning_calibration_by_feature": { "SparseOpsTest.test_aot_dispatch_dynamic__test_generic_histogram_binning_calibration_by_feature": { @@ -438,6 +405,18 @@ }, "fbgemm::permute_1D_sparse_data": {}, "fbgemm::permute_2D_sparse_data": {}, + "fbgemm::permute_duplicate_pooled_embs_auto_grad": { + "PooledEmbeddingModulesTest.test_aot_dispatch_dynamic__test_duplicate_permutations": { + "comment": "", + "status": "xfail" + }, + "PooledEmbeddingModulesTest.test_faketensor__test_duplicate_permutations": { + "comment": "", + "status": "xfail" + } + }, + "fbgemm::permute_pooled_embs": {}, + "fbgemm::permute_pooled_embs_auto_grad": {}, "fbgemm::permute_sequence_embeddings": { "SparseOpsTest.test_aot_dispatch_dynamic__test_permute_embeddings": { "comment": "", diff --git a/fbgemm_gpu/test/failures_dict_fast.json b/fbgemm_gpu/test/failures_dict_fast.json index cec2a1adc7..5676072e0b 100644 --- a/fbgemm_gpu/test/failures_dict_fast.json +++ b/fbgemm_gpu/test/failures_dict_fast.json @@ -35,10 +35,6 @@ "comment": "", "status": "xfail" }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": { - "comment": "", - "status": "xfail" - }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { "comment": "", "status": "xfail" @@ -75,10 +71,6 @@ "comment": "", "status": "xfail" }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_cpu_bf16_out": { - "comment": "", - "status": "xfail" - }, "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { "comment": "", "status": "xfail" @@ -88,120 +80,7 @@ "status": "xfail" } }, - "fbgemm::bounds_check_indices": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_none_with_rowwise_adagrad": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lamb": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_lars": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_bounds_check": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_miss_counter": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_forward_fused_pooled_emb_quant": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_stb_uvm_cache_stats": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::bounds_check_indices": {}, "fbgemm::dense_embedding_codegen_lookup_function": { "SplitTableBatchedEmbeddingsTest.test_autograd_registration__test_backward_dense": { "comment": "", @@ -212,16 +91,7 @@ "status": "xfail" } }, - "fbgemm::direct_mapped_lru_cache_populate_byte": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::direct_mapped_lru_cache_populate_byte": {}, "fbgemm::direct_mapped_lxu_cache_lookup": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_direct_mapped_uvm_cache_stats": { "comment": "", @@ -232,12 +102,7 @@ "status": "xfail" } }, - "fbgemm::emb_inplace_update": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_embedding_inplace_update": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::emb_inplace_update": {}, "fbgemm::get_unique_indices": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_unique_lxu_cache_lookup": { "comment": "", @@ -470,80 +335,9 @@ "status": "skip" } }, - "fbgemm::lru_cache_populate_byte": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_int_nbit_split_embedding_uvm_caching_codegen_lookup_function": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_cache_miss_counter": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_uvm_cache_stats": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::lxu_cache_flush": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::lxu_cache_locking_counter_decrement": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_lxu_cache_locking_counter_decrement": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::lru_cache_populate_byte": {}, + "fbgemm::lxu_cache_flush": {}, + "fbgemm::lxu_cache_locking_counter_decrement": {}, "fbgemm::lxu_cache_lookup": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { "comment": "", @@ -671,20 +465,7 @@ "status": "xfail" } }, - "fbgemm::pruned_hashmap_insert": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_gpu_no_cache": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { - "comment": "", - "status": "xfail" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_pruning": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::pruned_hashmap_insert": {}, "fbgemm::pruned_hashmap_lookup": { "SplitTableBatchedEmbeddingsTest.test_faketensor__test_nbit_forward_uvm_cache": { "comment": "", @@ -706,115 +487,17 @@ } }, "fbgemm::split_embedding_codegen_lookup_adagrad_function": {}, - "fbgemm::split_embedding_codegen_lookup_adagrad_function_cpu": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { - "comment": "", - "status": "xfail" - } - }, - "fbgemm::split_embedding_codegen_lookup_adam_function": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adam": { - "comment": "", - "status": "xfail" - } - }, + "fbgemm::split_embedding_codegen_lookup_adagrad_function_cpu": {}, + "fbgemm::split_embedding_codegen_lookup_adam_function": {}, "fbgemm::split_embedding_codegen_lookup_lamb_function": {}, "fbgemm::split_embedding_codegen_lookup_lars_sgd_function": {}, "fbgemm::split_embedding_codegen_lookup_none_function": {}, "fbgemm::split_embedding_codegen_lookup_partial_rowwise_adam_function": {}, "fbgemm::split_embedding_codegen_lookup_partial_rowwise_lamb_function": {}, - "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { - "comment": "", - "status": "skip" - } - }, - "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmMEAN": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmNONE": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp16_pmSUM": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmNONE": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_adagrad_fp32_pmSUM": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_optimizers_adagrad": { - "comment": "", - "status": "skip" - } - }, + "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function": {}, + "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {}, "fbgemm::split_embedding_codegen_lookup_rowwise_weighted_adagrad_function": {}, - "fbgemm::split_embedding_codegen_lookup_sgd_function": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd_really_long_segments": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_pipeline": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_1": { - "comment": "", - "status": "skip" - }, - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_cache_prefetch_pipeline_stream_2": { - "comment": "", - "status": "skip" - } - }, - "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": { - "SplitTableBatchedEmbeddingsTest.test_faketensor__test_backward_sgd": { - "comment": "", - "status": "xfail" - } - } + "fbgemm::split_embedding_codegen_lookup_sgd_function": {}, + "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": {} } } diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index db1f0e8928..32e1dd2465 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -132,10 +132,6 @@ def forward( # noqa C901 # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. additional_decorators: Dict[str, List[Callable]] = { - "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ - # This operator has been grandfathered in. We need to fix this test failure. - unittest.expectedFailure, - ], "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ # This operator has been grandfathered in. We need to fix this test failure. unittest.expectedFailure, diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index 8465490282..01e4333db4 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -9,6 +9,7 @@ import itertools import random +import sys import unittest from typing import Callable, Dict, List, Tuple @@ -27,7 +28,6 @@ gpu_available, gpu_unavailable, gradcheck, - on_arm_platform, optests, symint_vector_unsupported, TEST_WITH_ROCM, @@ -40,7 +40,6 @@ gpu_available, gpu_unavailable, gradcheck, - on_arm_platform, optests, symint_vector_unsupported, TEST_WITH_ROCM, @@ -127,15 +126,20 @@ def hash_size_cumsum_to_offsets(hash_size_cum_sum_list: List[int]) -> List[int]: return hash_size_offsets_list +# pyre-fixme[2] +# pyre-fixme[24] +def torch_compiled(model: Callable, **kwargs) -> Callable: + if sys.version_info < (3, 12, 0): + return torch.compile(model, **kwargs) + else: + return model + + # e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure] # Please avoid putting tests here, you should put operator-specific # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. additional_decorators: Dict[str, List[Callable]] = { - "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ - # This operator has been grandfathered in. We need to fix this test failure. - unittest.expectedFailure, - ], "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ # This operator has been grandfathered in. We need to fix this test failure. unittest.expectedFailure, @@ -381,7 +385,7 @@ def test_jagged_2d_to_dense_dynamic_shape( values = ref_values.clone().to(dtype).detach().requires_grad_(True) offsets = offsets.to(device_type) ref_output_values = ref_output_values.to(device_type) - output_values = torch.compile( + output_values = torch_compiled( torch.ops.fbgemm.jagged_2d_to_dense, dynamic=True, fullgraph=True )( values=values, @@ -597,7 +601,7 @@ def lengths_to_segment_ids(lengths: torch.Tensor) -> torch.Tensor: values = ref_values.clone().detach().requires_grad_(False) offsets = offsets.to(device_type) ref_output_values = ref_output_values.to(device_type) - output_values = torch.compile( + output_values = torch_compiled( torch.ops.fbgemm.jagged_1d_to_dense, dynamic=True, fullgraph=True )( values=values, @@ -977,9 +981,10 @@ def test_dense_to_jagged_dynamic_shape( ) values_2d = values_2d.clone().detach().requires_grad_(True) - @torch.compile(fullgraph=True, dynamic=True) def jagged_to_dense( - values: torch.Tensor, offsets: torch.Tensor, max_lengths: List[int] + values: torch.Tensor, + offsets: List[torch.LongTensor], + max_lengths: List[int], ) -> torch.Tensor: return torch.ops.fbgemm.jagged_to_padded_dense(values, offsets, max_lengths) @@ -993,15 +998,13 @@ def jagged_to_dense( torch._dynamo.mark_dynamic(dense, 0) torch._dynamo.mark_dynamic(dense, -1) - @torch.compile(fullgraph=True, dynamic=True) def dense_to_jagged_withL( - dense: torch.Tensor, offsets: torch.Tensor, total_L: List[int] + dense: torch.Tensor, offsets: List[torch.LongTensor], total_L: List[int] ) -> Tuple[torch.Tensor, torch.Tensor]: return torch.ops.fbgemm.dense_to_jagged(dense, offsets, total_L) - @torch.compile(fullgraph=False, dynamic=True) def dense_to_jagged_noL( - dense: torch.Tensor, offsets: torch.Tensor + dense: torch.Tensor, offsets: List[torch.LongTensor] ) -> Tuple[torch.Tensor, torch.Tensor]: return torch.ops.fbgemm.dense_to_jagged(dense, offsets) @@ -1325,24 +1328,21 @@ def test_jagged_elementwise_binary_dynamic_shape( x_padded = self._to_padded_dense(x_values, x_offsets, max_lengths) - @torch.compile(fullgraph=True, dynamic=True) def jagged_dense_elementwise_add( - x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor + x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor ) -> torch.Tensor: return torch.ops.fbgemm.jagged_dense_elementwise_add(x_values, x_offsets, y) - @torch.compile(fullgraph=True, dynamic=True) def jagged_dense_elementwise_add_jagged_output( - x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor + ) -> Tuple[torch.Tensor, List[torch.LongTensor]]: return torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output( x_values, x_offsets, y ) - @torch.compile(fullgraph=True, dynamic=True) def jagged_dense_elementwise_mul( - x_values: torch.Tensor, x_offsets: torch.Tensor, y: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + x_values: torch.Tensor, x_offsets: List[torch.LongTensor], y: torch.Tensor + ) -> Tuple[torch.Tensor, List[torch.LongTensor]]: return torch.ops.fbgemm.jagged_dense_elementwise_mul(x_values, x_offsets, y) if operation == "add": @@ -1614,7 +1614,7 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape( ) output_ref = x_padded + y_0 + y_1 x_values.to(device_type) - (output, output_offsets) = torch.compile( + (output, output_offsets) = torch_compiled( torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output, fullgraph=True, dynamic=True, @@ -1825,7 +1825,7 @@ def test_batched_dense_vec_jagged_2d_mul_dynamic_shape( torch._dynamo.mark_dynamic(values, 1) torch._dynamo.mark_dynamic(offsets, 0) - output = torch.compile( + output = torch_compiled( torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul, fullgraph=True, dynamic=True, @@ -2363,7 +2363,6 @@ def test_jagged_softmax( if gpu_available else st.just("cpu"), ) - @unittest.skipIf(*on_arm_platform) @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None) def test_jagged_jagged_bmm( self, @@ -2429,7 +2428,6 @@ def test_jagged_jagged_bmm( if gpu_available else st.just("cpu"), ) - @unittest.skipIf(*on_arm_platform) @settings(verbosity=Verbosity.verbose, max_examples=2, deadline=None) def test_jagged_dense_bmm( self, @@ -2492,7 +2490,6 @@ def test_jagged_dense_bmm( dtype=st.sampled_from([torch.float, torch.double]), device_type=st.just("cpu"), ) - @unittest.skipIf(*on_arm_platform) @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None) def test_jagged_dense_bmm_dynamic_shape( self, @@ -2521,7 +2518,7 @@ def test_jagged_dense_bmm_dynamic_shape( torch._dynamo.mark_dynamic(x_values, 1) torch._dynamo.mark_dynamic(lengths, 0) # offsets = lengths + 1 - output, _ = torch.compile( + output, _ = torch_compiled( torch.ops.fbgemm.jagged_dense_bmm, fullgraph=True, dynamic=True )( x_values, diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index 3723ee76c4..ff7575477b 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -8,9 +8,10 @@ import sys import unittest from itertools import accumulate -from typing import List, Tuple +from typing import Any, Callable, Dict, List, Tuple import fbgemm_gpu +import hypothesis.strategies as st import torch import torch._dynamo from fbgemm_gpu.permute_pooled_embedding_modules import PermutePooledEmbeddings @@ -20,12 +21,13 @@ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`. if getattr(fbgemm_gpu, "open_source", False): # pyre-ignore[21] - from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform + from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests else: from fbgemm_gpu.test.test_utils import ( cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, + optests, ) typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable @@ -66,12 +68,29 @@ ) +class PermutePooledEmbeddingsFwdOnly(PermutePooledEmbeddings): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def __call__(self, pooled_embs: torch.Tensor) -> torch.Tensor: + result = torch.ops.fbgemm.permute_pooled_embs( + pooled_embs, + self._offset_dim_list.to(device=pooled_embs.device), + self._permute.to(device=pooled_embs.device), + self._inv_offset_dim_list.to(device=pooled_embs.device), + self._inv_permute.to(device=pooled_embs.device), + ) + return result + + class Net(torch.nn.Module): - def __init__(self) -> None: + def __init__(self, fwd_only: bool = False) -> None: super(Net, self).__init__() self.fc1 = torch.nn.Linear(1, 10, bias=False) - self.permute_pooled_embeddings = PermutePooledEmbeddings( - [2, 3, 1, 4], [3, 0, 2, 1] + op_cls = PermutePooledEmbeddingsFwdOnly if fwd_only else PermutePooledEmbeddings + self.permute_pooled_embeddings: PermutePooledEmbeddings = op_cls( + [2, 3, 1, 4], + [3, 0, 2, 1], ) self.fc2 = torch.nn.Linear(10, 1, bias=False) @@ -82,7 +101,24 @@ def forward(self, x: Tensor) -> Tensor: return x +# e.g. "test_faketensor__test_cumsum": [unittest.expectedFailure] +# Please avoid putting tests here, you should put operator-specific +# skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json +# pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. +additional_decorators: Dict[str, List[Callable]] = { + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], + "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add_jagged_output": [ + # This operator has been grandfathered in. We need to fix this test failure. + unittest.expectedFailure, + ], +} + + # @parameterized_class([{"device_type": "cpu"}, {"device_type": "cuda"}]) +@optests.generate_opcheck_tests(additional_decorators=additional_decorators) class PooledEmbeddingModulesTest(unittest.TestCase): @settings(deadline=10000, suppress_health_check=suppressed_list) # pyre-fixme[56]: Pyre was not able to infer the type of argument @@ -90,8 +126,11 @@ class PooledEmbeddingModulesTest(unittest.TestCase): def setUp(self, device_type: torch.device) -> None: self.device = device_type - def test_permutation(self) -> None: - net = Net().to(self.device) + @settings(deadline=500) + # pyre-fixme[56]: Pyre was not able to infer the type of argument + @given(fwd_only=st.booleans()) + def test_permutation(self, fwd_only: bool) -> None: + net = Net(fwd_only=fwd_only).to(self.device) input = torch.Tensor([range(10)]).to(self.device) self.assertEqual( diff --git a/fbgemm_gpu/test/quantize_ops_test.py b/fbgemm_gpu/test/quantize_ops_test.py index 493f35f9f8..2b1d6dcde5 100644 --- a/fbgemm_gpu/test/quantize_ops_test.py +++ b/fbgemm_gpu/test/quantize_ops_test.py @@ -7,6 +7,7 @@ import logging import os import random +import sys import unittest from ctypes import c_float, c_int32, cast, POINTER, pointer from typing import Callable, Dict, List, Tuple @@ -1006,10 +1007,6 @@ def test_quantize_and_dequantize_op_cuda_large_nrows_bf16( # skips and failures in deeplearning/fbgemm/fbgemm_gpu/test/failures_dict.json # pyre-ignore[24]: Generic type `Callable` expects 2 type parameters. additional_decorators: Dict[str, List[Callable]] = { - "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ - # This operator has been grandfathered in. We need to fix this test failure. - unittest.expectedFailure, - ], "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ # This operator has been grandfathered in. We need to fix this test failure. unittest.expectedFailure, @@ -1084,7 +1081,7 @@ def test_quantize_and_dequantize_op_fp8_rowwise( dynamic=True, fullgraph=True, ) - if test_compile + if test_compile and sys.version_info < (3, 12, 0) else torch.ops.fbgemm.FP8RowwiseQuantizedToFloat ) diff --git a/fbgemm_gpu/test/sparse_ops_test.py b/fbgemm_gpu/test/sparse_ops_test.py index d06b7988d0..8f64837016 100644 --- a/fbgemm_gpu/test/sparse_ops_test.py +++ b/fbgemm_gpu/test/sparse_ops_test.py @@ -13,6 +13,7 @@ import logging import os import random +import sys import unittest from itertools import accumulate from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union @@ -103,6 +104,15 @@ def permute_scripted( ) +# pyre-fixme[2] +# pyre-fixme[24] +def torch_compiled(model: Callable, **kwargs) -> Callable: + if sys.version_info < (3, 12, 0): + return torch.compile(model, **kwargs) + else: + return model + + class SparseOpsTest(unittest.TestCase): @staticmethod @settings(suppress_health_check=suppressed_list) @@ -2018,7 +2028,7 @@ def test_pack_segments( pack_segments_fun = torch.ops.fbgemm.pack_segments if torch_compile: - pack_segments_fun = torch.compile(pack_segments_fun, dynamic=True) + pack_segments_fun = torch_compiled(pack_segments_fun, dynamic=True) packed_cuda = pack_segments_fun( t_in=input_data.cuda(), @@ -2114,7 +2124,7 @@ def test_pack_segments_smaller_max_len( if gpu_available: pack_segments_fun = torch.ops.fbgemm.pack_segments if torch_compile: - pack_segments_fun = torch.compile(pack_segments_fun) + pack_segments_fun = torch_compiled(pack_segments_fun) packed_cuda = pack_segments_fun( t_in=input_data.cuda(), @@ -2721,10 +2731,6 @@ def test_permute_sparse_features_with_repeats( "test_faketensor__test_index_select_dim0": [unittest.skip("hangs")], "test_autograd_registration__test_index_select_dim0": [unittest.skip("hangs")], "test_schema__test_index_select_dim0": [unittest.skip("hangs")], - "test_pt2_compliant_tag_fbgemm_dense_to_jagged": [ - # This operator has been grandfathered in. We need to fix this test failure. - unittest.expectedFailure, - ], "test_pt2_compliant_tag_fbgemm_jagged_dense_elementwise_add": [ # This operator has been grandfathered in. We need to fix this test failure. unittest.expectedFailure, diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index ce8e41e630..037de61815 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -4526,6 +4526,12 @@ def execute_nbit_forward_( # noqa C901 nbit_weights_ty=get_nbit_weights_ty(), use_array_for_index_remapping=st.booleans(), do_pruning=st.booleans(), + pooling_mode=st.sampled_from( + [PoolingMode.SUM, PoolingMode.NONE, PoolingMode.MEAN] + ), + output_dtype=st.sampled_from( + [SparseType.FP32, SparseType.FP16, SparseType.BF16] + ), ) @settings( verbosity=VERBOSITY, @@ -4537,6 +4543,8 @@ def test_nbit_forward_cpu( nbit_weights_ty: Optional[SparseType], use_array_for_index_remapping: bool, do_pruning: bool, + pooling_mode: PoolingMode, + output_dtype: SparseType, ) -> None: use_cpu = True T = random.randint(1, 50) @@ -4549,27 +4557,7 @@ def test_nbit_forward_cpu( # cache_algorithm is don't care as we don't use cache. cache_algorithm = CacheAlgorithm.LRU - pooling_mode = random.choice( - [ - PoolingMode.SUM, - PoolingMode.MEAN, - PoolingMode.NONE, - ] - ) mixed = random.choice([True, False]) - if pooling_mode == PoolingMode.NONE: - nbit_weights_ty = random.choice( - [ - SparseType.FP32, - SparseType.FP16, - # CPU sequence embedding does not support FP8/INT4/INT2 yet - # SparseType.FP8, - SparseType.INT8, - # SparseType.INT4, - # SparseType.INT2, - ] - ) - if pooling_mode == PoolingMode.SUM: weighted = random.choice([True, False]) else: @@ -4582,81 +4570,7 @@ def test_nbit_forward_cpu( else: weights_ty: SparseType = nbit_weights_ty mixed_weights_ty = False - output_dtype = random.choice( - ( - [SparseType.BF16] - if weights_ty in [SparseType.INT4, SparseType.INT2] - else [] - ) - + [SparseType.FP32, SparseType.FP16] - ) - self.execute_nbit_forward_( - T, - D, - B, - log_E, - L, - weighted, - mixed, - pooling_mode, - weights_ty, - use_cache, - cache_algorithm, - use_cpu, - use_array_for_index_remapping, - do_pruning, - mixed_weights_ty, - output_dtype, - ) - - @given( - nbit_weights_ty=get_nbit_weights_ty(), - use_array_for_index_remapping=st.booleans(), - do_pruning=st.booleans(), - ) - @settings( - verbosity=VERBOSITY, - max_examples=MAX_EXAMPLES_LONG_RUNNING, - deadline=None, - ) - def test_nbit_forward_cpu_bf16_out( - self, - nbit_weights_ty: Optional[SparseType], - use_array_for_index_remapping: bool, - do_pruning: bool, - ) -> None: - use_cpu = True - T = random.randint(1, 50) - B = random.randint(0, 128) - L = random.randint(0, 32) - D = random.randint(2, 2048) - log_E = random.randint(2, 4) - - use_cache = False - # cache_algorithm is don't care as we don't use cache. - cache_algorithm = CacheAlgorithm.LRU - - pooling_mode = random.choice( - [ - PoolingMode.SUM, - PoolingMode.MEAN, - ] - ) - mixed = random.choice([True, False]) - - if pooling_mode == PoolingMode.SUM: - weighted = random.choice([True, False]) - else: - weighted = False - if nbit_weights_ty is None: - # don't care when mixed type is used. - weights_ty: SparseType = SparseType.INT8 - mixed_weights_ty = True - else: - weights_ty: SparseType = nbit_weights_ty - mixed_weights_ty = False - output_dtype = SparseType.BF16 self.execute_nbit_forward_( T, D, diff --git a/src/DirectConv.h b/src/DirectConv.h index ded8c2c62d..e10597e759 100644 --- a/src/DirectConv.h +++ b/src/DirectConv.h @@ -224,4 +224,4 @@ CodeCache< typename DirectConvCodeGenBase::jit_micro_kernel_fp_convT> DirectConvCodeGenBase::codeCacheT_; -}; // namespace fbgemm +} // namespace fbgemm