From c4d332826a41dd87ac85af585662870cf505b210 Mon Sep 17 00:00:00 2001
From: Li Li
Date: Wed, 25 Sep 2024 20:00:24 +0000
Subject: [PATCH] enable performance branch on
rocm/pytorch-private:20240715_exec_dashboard_vLLM_nightly
---
fbgemm_gpu/cmake/Hip.cmake | 233 +++++++-----------
.../codegen/embedding_backward_dense_host.cpp | 4 +-
...embedding_backward_split_host_template.cpp | 4 +-
.../embedding_backward_split_template.cu | 8 +-
...edding_forward_quantized_split_template.cu | 4 +-
.../embedding_forward_split_template.cu | 4 +-
fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp | 2 +-
.../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 28 +--
fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh | 2 +-
fbgemm_gpu/src/jagged_tensor_ops.cu | 16 +-
.../src/merge_pooled_embeddings_gpu.cpp | 2 +-
fbgemm_gpu/src/metric_ops.cu | 4 +-
fbgemm_gpu/src/quantize_ops.cu | 6 +-
fbgemm_gpu/src/sparse_ops.cu | 6 +-
fbgemm_gpu/src/sparse_ops_cpu.cpp | 2 +-
fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 8 +-
src/EmbeddingSpMDM.cc | 2 +-
17 files changed, 142 insertions(+), 193 deletions(-)
diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake
index 7db59ea9fe..b031ce7ce2 100644
--- a/fbgemm_gpu/cmake/Hip.cmake
+++ b/fbgemm_gpu/cmake/Hip.cmake
@@ -1,123 +1,84 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
set(FBGEMM_HAVE_HIP FALSE)
-IF(NOT DEFINED ENV{ROCM_PATH})
- SET(ROCM_PATH /opt/rocm)
-ELSE()
- SET(ROCM_PATH $ENV{ROCM_PATH})
-ENDIF()
-if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS})
- set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include)
+if(NOT DEFINED ENV{ROCM_PATH})
+ set(ROCM_PATH /opt/rocm)
else()
- set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS})
+ set(ROCM_PATH $ENV{ROCM_PATH})
endif()
-# HIP_PATH
-IF(NOT DEFINED ENV{HIP_PATH})
- SET(HIP_PATH ${ROCM_PATH}/hip)
-ELSE()
- SET(HIP_PATH $ENV{HIP_PATH})
-ENDIF()
-
-IF(NOT EXISTS ${HIP_PATH})
- return()
-ENDIF()
-
-# HCC_PATH
-IF(NOT DEFINED ENV{HCC_PATH})
- SET(HCC_PATH ${ROCM_PATH}/hcc)
-ELSE()
- SET(HCC_PATH $ENV{HCC_PATH})
-ENDIF()
-
-# HSA_PATH
-IF(NOT DEFINED ENV{HSA_PATH})
- SET(HSA_PATH ${ROCM_PATH}/hsa)
-ELSE()
- SET(HSA_PATH $ENV{HSA_PATH})
-ENDIF()
-
-# ROCBLAS_PATH
-IF(NOT DEFINED ENV{ROCBLAS_PATH})
- SET(ROCBLAS_PATH ${ROCM_PATH}/rocblas)
-ELSE()
- SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH})
-ENDIF()
-
-# ROCSPARSE_PATH
-IF(NOT DEFINED ENV{ROCSPARSE_PATH})
- SET(ROCSPARSE_PATH ${ROCM_PATH}/rocsparse)
-ELSE()
- SET(ROCSPARSE_PATH $ENV{ROCSPARSE_PATH})
-ENDIF()
-
-# ROCFFT_PATH
-IF(NOT DEFINED ENV{ROCFFT_PATH})
- SET(ROCFFT_PATH ${ROCM_PATH}/rocfft)
-ELSE()
- SET(ROCFFT_PATH $ENV{ROCFFT_PATH})
-ENDIF()
-
-# HIPSPARSE_PATH
-IF(NOT DEFINED ENV{HIPSPARSE_PATH})
- SET(HIPSPARSE_PATH ${ROCM_PATH}/hipsparse)
-ELSE()
- SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
-ENDIF()
-
-# THRUST_PATH
-IF(NOT DEFINED ENV{THRUST_PATH})
- SET(THRUST_PATH ${ROCM_PATH}/include)
-ELSE()
- SET(THRUST_PATH $ENV{THRUST_PATH})
-ENDIF()
-
-# HIPRAND_PATH
-IF(NOT DEFINED ENV{HIPRAND_PATH})
- SET(HIPRAND_PATH ${ROCM_PATH}/hiprand)
-ELSE()
- SET(HIPRAND_PATH $ENV{HIPRAND_PATH})
-ENDIF()
-
-# ROCRAND_PATH
-IF(NOT DEFINED ENV{ROCRAND_PATH})
- SET(ROCRAND_PATH ${ROCM_PATH}/rocrand)
-ELSE()
- SET(ROCRAND_PATH $ENV{ROCRAND_PATH})
-ENDIF()
-
-# MIOPEN_PATH
-IF(NOT DEFINED ENV{MIOPEN_PATH})
- SET(MIOPEN_PATH ${ROCM_PATH}/miopen)
-ELSE()
- SET(MIOPEN_PATH $ENV{MIOPEN_PATH})
-ENDIF()
-# Add HIP to the CMAKE Module Path
-set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
+macro(torch_hip_get_arch_list store_var)
+ if(DEFINED ENV{PYTORCH_ROCM_ARCH})
+ set(_TMP $ENV{PYTORCH_ROCM_ARCH})
+ else()
+ # Use arch of installed GPUs as default
+ execute_process(COMMAND "rocm_agent_enumerator" COMMAND bash "-c" "grep -v gfx000 | sort -u | xargs | tr -d '\n'"
+ RESULT_VARIABLE ROCM_AGENT_ENUMERATOR_RESULT
+ OUTPUT_VARIABLE ROCM_ARCH_INSTALLED)
+ if(NOT ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0)
+ message(FATAL_ERROR " Could not detect ROCm arch for GPUs on machine. Result: '${ROCM_AGENT_ENUMERATOR_RESULT}'")
+ endif()
+ set(_TMP ${ROCM_ARCH_INSTALLED})
+ endif()
+ string(REPLACE " " ";" ${store_var} "${_TMP}")
+endmacro()
+
+torch_hip_get_arch_list(PYTORCH_ROCM_ARCH)
+if(PYTORCH_ROCM_ARCH STREQUAL "")
+ message(FATAL_ERROR "No GPU arch specified for ROCm build. Please use PYTORCH_ROCM_ARCH environment variable to specify GPU archs to build for.")
+endif()
+message("Building FBGEMM for GPU arch: ${PYTORCH_ROCM_ARCH}")
-# Disable Asserts In Code (Can't use asserts on HIP stack.)
ADD_DEFINITIONS(-DNDEBUG)
ADD_DEFINITIONS(-DUSE_ROCM)
-IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH})
- SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a)
-ELSE()
- SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH})
-ENDIF()
+# Add HIP to the CMAKE Module Path
+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH})
+
+macro(find_package_and_print_version PACKAGE_NAME)
+ find_package("${PACKAGE_NAME}" ${ARGN})
+ message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
+endmacro()
# Find the HIP Package
-find_package(HIP)
+find_package_and_print_version(HIP 1.0)
-IF(HIP_FOUND)
+if(HIP_FOUND)
set(FBGEMM_HAVE_HIP TRUE)
+ set(FOUND_ROCM_VERSION_H FALSE)
+
+ if(EXISTS ${ROCM_PATH}/.info/version-dev)
+ # ROCM < 4.5, we don't have the header api file, use flat file
+ file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)
+ message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n")
+ endif()
+
+ set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}")
+ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc")
# Find ROCM version for checks
# ROCM 5.0 and later will have header api for version management
if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h)
-
- set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}")
- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc")
+ set(FOUND_ROCM_VERSION_H TRUE)
file(WRITE ${file} ""
"#include \n"
+ )
+ elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h)
+ set(FOUND_ROCM_VERSION_H TRUE)
+ file(WRITE ${file} ""
+ "#include \n"
+ )
+ else()
+ message("********************* rocm_version.h couldnt be found ******************\n")
+ endif()
+
+ if(FOUND_ROCM_VERSION_H)
+ file(APPEND ${file} ""
"#include \n"
"#ifndef ROCM_VERSION_PATCH\n"
@@ -177,37 +138,27 @@ IF(HIP_FOUND)
message("\n***** Library versions from cmake find_package *****\n")
- # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/
- if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0")
- set(hip_DIR ${HIP_PATH}/lib/cmake/hip)
- set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64)
- set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs)
- set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr)
- set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand)
- set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand)
- set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas)
- set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen)
- set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft)
- set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft)
- set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse)
- set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl)
- set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim)
- set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub)
- set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust)
- set(ROCclr_DIR ${ROCM_PATH}/rocclr/lib/cmake/rocclr)
- set(ROCRAND_INCLUDE ${ROCM_PATH}/include)
- set(ROCM_SMI_INCLUDE ${ROCM_PATH}/rocm_smi/include)
- else()
- message(FATAL_ERROR "\n***** The minimal ROCm version is 5.1.0 but have ${ROCM_VERSION_DEV} installed *****\n")
- endif()
+ set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+ set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- find_package(hip REQUIRED)
- find_package(rocblas REQUIRED)
- find_package(hipfft REQUIRED)
- find_package(hiprand REQUIRED)
- find_package(rocrand REQUIRED)
- find_package(hipsparse REQUIRED)
- find_package(rocprim REQUIRED)
+ find_package_and_print_version(hip REQUIRED)
+ find_package_and_print_version(hsa-runtime64 REQUIRED)
+ find_package_and_print_version(amd_comgr REQUIRED)
+ find_package_and_print_version(rocrand REQUIRED)
+ find_package_and_print_version(hiprand REQUIRED)
+ find_package_and_print_version(hipblas REQUIRED)
+ find_package_and_print_version(rocblas REQUIRED)
+ find_package_and_print_version(miopen REQUIRED)
+ find_package_and_print_version(rocfft REQUIRED)
+ find_package_and_print_version(hipsparse REQUIRED)
+ find_package_and_print_version(rccl)
+ find_package_and_print_version(rocprim REQUIRED)
+ find_package_and_print_version(hipcub REQUIRED)
+ find_package_and_print_version(rocthrust REQUIRED)
+ find_package_and_print_version(hipsolver REQUIRED)
+
+ # Enabling HIP language support
+ enable_language(HIP)
if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
@@ -216,9 +167,11 @@ IF(HIP_FOUND)
endif()
message("HIP library name: ${hip_library_name}")
- set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
- set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
- FIND_LIBRARY(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${HIP_PATH}/lib)
+ # TODO: hip_hcc has an interface include flag "-hc" which is only
+ # recognizable by hcc, but not gcc and clang. Right now in our
+ # setup, hcc is only used for linking, but it should be used to
+ # compile the *_hip.cc files as well.
+ find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib)
list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
# list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
@@ -228,25 +181,21 @@ IF(HIP_FOUND)
list(APPEND HIP_CXX_FLAGS -mavx2)
list(APPEND HIP_CXX_FLAGS -mf16c)
list(APPEND HIP_CXX_FLAGS -mfma)
- list(APPEND HIP_CXX_FLAGS -std=c++17)
+ list(APPEND HIP_CXX_FLAGS -std=c++20)
set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
list(APPEND HIP_HCC_FLAGS -Wno-defaulted-function-deleted)
- foreach(fbgemm_rocm_arch ${FBGEMM_ROCM_ARCH})
+ foreach(fbgemm_rocm_arch ${PYTORCH_ROCM_ARCH})
list(APPEND HIP_HCC_FLAGS --offload-arch=${fbgemm_rocm_arch})
endforeach()
set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE})
set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $ $ ${FBGEMM_HIP_INCLUDE})
- hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
+ hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE} /opt/rocm/include/hipblas)
list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
- set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
-
-ELSE()
- message("Not able to find HIP installation.")
-ENDIF()
+endif()
\ No newline at end of file
diff --git a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
index cbb3d15bce..b33fab15d6 100644
--- a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
+++ b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -112,7 +112,7 @@ class SplitLookupFunction_Dense_Op
ctx->saved_data["total_hash_size_bits"] = total_hash_size_bits;
ctx->saved_data["pooling_mode"] = pooling_mode;
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
#else
constexpr int32_t BT_block_size = 32;
@@ -164,7 +164,7 @@ class SplitLookupFunction_Dense_Op
TORCH_CHECK(grad_outputs.size() == 1);
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
constexpr int32_t max_segment_length_per_warp = 64;
#else
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
index 8cc4dcdd41..aa026f2d09 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
+++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -191,7 +191,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
{% endfor %}
{% if not nobag %}
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
#else
constexpr int32_t BT_block_size = 32;
@@ -264,7 +264,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
TORCH_CHECK(grad_outputs.size() == 1);
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
constexpr int32_t max_segment_length_per_warp = 64;
#else
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu
index 343bcefff3..5b68400798 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu
@@ -865,7 +865,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
// V100: 96 KB; A100: 160 KB.
int max_shared_bytes = 0;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_weights.get_device());
#else
// MI100 has 64 KB local memory (shared memory) per workgroup
@@ -874,7 +874,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
C10_CUDA_KERNEL_LAUNCH_CHECK();
int shared_kb = max_shared_bytes >> 10;
// V100: 64 KB; A100: 96 KB.
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
// Use 2/3 of the available GPU shared mem; leave rooms for L1$.
int used_shared_kb = round_down(shared_kb * 2 / 3, 16);
TORCH_CHECK(used_shared_kb > 0);
@@ -1112,7 +1112,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
// must use dynamic shared memory (rather than statically sized
// arrays) and require an explicit opt-in using cudaFuncSetAttribute()".
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaFuncSetAttribute(
split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_cta_per_row_1<
{% if not dense %}
@@ -1218,7 +1218,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
{% endif %},
true>) * 4 * kWarpSize *
kMaxVecsPerThread;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaFuncSetAttribute(
split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_warp_per_row_1<
{% if not dense %}
diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu
index e6f7f8c466..3cdf5fb371 100644
--- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu
+++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu
@@ -297,7 +297,7 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
}
// equivalent to fence + wait.
cp_async_wait<0>();
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
// Performance - replace a block level __syncthreads with per CU __threadfence_block
// __threadfence_block is fine replacement for __syncwarp on AMD GPUs, it is because
// a. memory fencing: __threadfence_block ops. at CU level, same as __syncwarp at SM
@@ -491,7 +491,7 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_
const uint32_t subwarp_id = threadIdx.x / 4;
const uint32_t subwarp_tid = threadIdx.x % 4;
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
const uint64_t subwarp_mask = static_cast<uint64_t>(0xF) << (4 * subwarp_id);
#else
const uint32_t subwarp_mask = static_cast<uint32_t>(0xF) << (4 * subwarp_id);
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_template.cu
index 3017b66dbb..59a6713782 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/embedding_forward_split_template.cu
@@ -21,7 +21,7 @@
#define SHFL_SYNC(val, srcLane) shfl_sync(val, srcLane, kThreadGroupSize, shfl_sync_mask)
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#include "hip_kernel/split_tbe_fwd.hip.hpp"
#endif
@@ -522,7 +522,7 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else ""
return output;
}
-#ifdef __HIP_PLATFORM_HCC__ // HIP Optimal Kernel
+#ifdef USE_ROCM // HIP Optimal Kernel
/*
* current limitations
1. sparse, and bag
diff --git a/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp b/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp
index 03d0471e19..c0a468d396 100644
--- a/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp
+++ b/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp
@@ -20,7 +20,7 @@
* THE SOFTWARE.
*
******************************************************************************/
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#include
#include
diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
index 4437902d02..0366ccf2ea 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
@@ -13,7 +13,7 @@
#include
#include
#include
-#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && \
+#if !defined(USE_ROCM) && defined(CUDA_VERSION) && \
CUDA_VERSION >= 9000
#define FBGEMM_USE_SUBWARP_SHUFFLE
#endif
@@ -32,7 +32,7 @@ enum class PrimitiveType : uint8_t { FP = 0, INT = 1, BF = 2 };
#define DEVICE_INLINE __device__ inline __attribute__((always_inline))
// Warp size
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
static constexpr int32_t kWarpSize = 64;
#else
static constexpr int32_t kWarpSize = 32;
@@ -53,7 +53,7 @@ struct Half4 {
half2 b;
__device__ inline void store(at::Half* p) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
p[0] = __low2half(a);
p[1] = __high2half(a);
p[2] = __low2half(b);
@@ -101,7 +101,7 @@ struct Vec4T {
}
DEVICE_INLINE Vec4T(const at::Half* p) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
union U {
half2 h[2];
uint2 ui;
@@ -235,7 +235,7 @@ struct Vec4T {
}
DEVICE_INLINE Vec4T(const at::Half* p) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
union U {
half2 h[2];
uint2 ui;
@@ -329,7 +329,7 @@ struct Vec4T {
}
DEVICE_INLINE static void copy(const at::Half* src, at::Half* dst) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
@@ -429,7 +429,7 @@ struct Vec4T {
}
DEVICE_INLINE Vec4T(const at::Half* p) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
union U {
half2 h[2];
uint2 ui;
@@ -589,7 +589,7 @@ struct Vec4T {
}
DEVICE_INLINE Vec4T(const at::Half* p) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
union U {
half2 h[2];
uint2 ui;
@@ -734,7 +734,7 @@ DEVICE_INLINE T shfl_xor(
int laneMask,
int width = kWarpSize,
unsigned shfl_sync_mask = 0xffffffffu) {
-#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000
+#if defined(USE_ROCM) || CUDA_VERSION < 9000
return __shfl_xor(val, laneMask, width);
#else
return __shfl_xor_sync(shfl_sync_mask, val, laneMask, width);
@@ -747,7 +747,7 @@ DEVICE_INLINE T shfl_sync(
int srcLane = 0,
int width = kWarpSize,
unsigned shfl_sync_mask = 0xffffffffu) {
-#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000
+#if defined(USE_ROCM) || CUDA_VERSION < 9000
return __shfl(val, srcLane, width);
#else
return __shfl_sync(shfl_sync_mask, val, srcLane, width);
@@ -841,7 +841,7 @@ inline __device__ void warpBitonicMergeLE16(K& k, V& v) {
template
struct BitonicSort {
static inline __device__ void sort(K k[1], V v[1]) {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
static_assert(fbgemm_gpu::kWarpSize == 64, "unexpected warp size");
#else
static_assert(fbgemm_gpu::kWarpSize == 32, "unexpected warp size");
@@ -1432,7 +1432,7 @@ DEVICE_INLINE float_16 make_zero_float_16() {
__forceinline__ __device__ __half2
hfma2(const __half2 a, const __half2 b, const __half2 c) {
#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \
- defined(__HIP_PLATFORM_HCC__)
+ defined(USE_ROCM)
return __hfma2(a, b, c);
#else
float2 fa, fb, fc;
@@ -1447,7 +1447,7 @@ hfma2(const __half2 a, const __half2 b, const __half2 c) {
__forceinline__ __device__ half hmul(half a, half b) {
#if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \
- defined(__HIP_PLATFORM_HCC__)
+ defined(USE_ROCM)
return __hmul(a, b);
#else
return __float2half(__half2float(a) * __half2float(b));
@@ -2620,7 +2620,7 @@ DEVICE_INLINE float float16_min(float_16 val) {
// ROCm does not natively support __any_sync(). Using __ballot()
// (https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html)
// to implement __any_sync(). Note: the "warp-size" of AMD GPU is 64.
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
__device__ int __any_sync(uint64_t mask, int predicate) {
uint64_t predicate_bit_pattern = __ballot(predicate);
return (predicate_bit_pattern & mask) > 0;
diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh
index 5ca01d3439..5fe67ac463 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh
@@ -6,7 +6,7 @@
*/
#pragma once
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#define HIPCUB_ARCH 1
#endif
diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu
index c26578750c..6a726367a6 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops.cu
@@ -643,7 +643,7 @@ bool jagged_dense_dense_elementwise_jagged_output_matches_opt(
matches &= (y_0_reshaped.size(1) < INT_MAX);
int max_shared_bytes;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaDeviceGetAttribute(
&max_shared_bytes,
cudaDevAttrMaxSharedMemoryPerBlockOptin,
@@ -653,7 +653,7 @@ bool jagged_dense_dense_elementwise_jagged_output_matches_opt(
max_shared_bytes = 64 << 10;
#endif
int shared_kb = max_shared_bytes >> 10;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
// Use 2/3 of the available GPU shared mem; leave rooms for L1$.
int used_shared_kb = round_down(shared_kb * 2 / 3, 16);
TORCH_CHECK(used_shared_kb > 0);
@@ -761,7 +761,7 @@ void jagged_dense_elementwise_jagged_output_opt_(
at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock;
if (dynamic_smem_size > cur_max_shared_bytes) {
int max_shared_bytes;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaDeviceGetAttribute(
&max_shared_bytes,
cudaDevAttrMaxSharedMemoryPerBlockOptin,
@@ -771,7 +771,7 @@ void jagged_dense_elementwise_jagged_output_opt_(
max_shared_bytes = 64 << 10;
#endif
int shared_kb = max_shared_bytes >> 10;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
// Use 2/3 of the available GPU shared mem; leave rooms for L1$.
int used_shared_kb = round_down(shared_kb * 2 / 3, 16);
TORCH_CHECK(used_shared_kb > 0);
@@ -780,7 +780,7 @@ void jagged_dense_elementwise_jagged_output_opt_(
int used_shared_kb = shared_kb;
#endif
int used_shared_bytes = used_shared_kb << 10;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaFuncSetAttribute(
jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
index_t>,
@@ -965,7 +965,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock;
if (dynamic_smem_size > cur_max_shared_bytes) {
int max_shared_bytes;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaDeviceGetAttribute(
&max_shared_bytes,
cudaDevAttrMaxSharedMemoryPerBlockOptin,
@@ -975,7 +975,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
max_shared_bytes = 64 << 10;
#endif
int shared_kb = max_shared_bytes >> 10;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
// Use 2/3 of the available GPU shared mem; leave rooms for L1$.
int used_shared_kb = round_down(shared_kb * 2 / 3, 16);
TORCH_CHECK(used_shared_kb > 0);
@@ -984,7 +984,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
int used_shared_kb = shared_kb;
#endif
int used_shared_bytes = used_shared_kb << 10;
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
cudaFuncSetAttribute(
jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
index_t>,
diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
index 1acb91f54e..9b11349e59 100644
--- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
+++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
@@ -15,7 +15,7 @@
#include
#include
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#include "hip/hip_runtime.h"
#include "rocm_smi/rocm_smi.h"
diff --git a/fbgemm_gpu/src/metric_ops.cu b/fbgemm_gpu/src/metric_ops.cu
index 2115f949d1..13236870df 100644
--- a/fbgemm_gpu/src/metric_ops.cu
+++ b/fbgemm_gpu/src/metric_ops.cu
@@ -6,7 +6,7 @@
#include
// clang-format off
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#define HIPCUB_ARCH 1
#include
#else
@@ -24,7 +24,7 @@ constexpr int NUM_THREADS_PER_BLOCK = 256;
namespace fbgemm_gpu {
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
namespace cub = hipcub;
#endif
diff --git a/fbgemm_gpu/src/quantize_ops.cu b/fbgemm_gpu/src/quantize_ops.cu
index 932ca684da..b3246b458e 100644
--- a/fbgemm_gpu/src/quantize_ops.cu
+++ b/fbgemm_gpu/src/quantize_ops.cu
@@ -7,7 +7,7 @@
#include
#include
#include
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
#include
#endif
@@ -113,7 +113,7 @@ __global__ inline void _float_to_fused8bitrowwise_cuda_kernel(
template <typename T>
__device__ inline __attribute__((always_inline)) T
quantize_ops_shfl_xor(const T val, int laneMask, int width) {
-#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000
+#if defined(USE_ROCM) || CUDA_VERSION < 9000
return __shfl_xor(val, laneMask, width);
#else
return __shfl_xor_sync(0xffffffff, val, laneMask, width);
@@ -133,7 +133,7 @@ __global__ inline void _get_8bit_qparam_cuda_kernel(
const int output_columns = ncols_aligned + 2 * sizeof(float);
// starting values for future reductions
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#define HIPRT_INF_F __int_as_float(0x7f800000)
float minimum_element = HIPRT_INF_F;
float maximum_element = -HIPRT_INF_F;
diff --git a/fbgemm_gpu/src/sparse_ops.cu b/fbgemm_gpu/src/sparse_ops.cu
index a6e0266694..99ef18cb45 100644
--- a/fbgemm_gpu/src/sparse_ops.cu
+++ b/fbgemm_gpu/src/sparse_ops.cu
@@ -26,11 +26,11 @@
#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
#include "fbgemm_gpu/split_embeddings_utils.cuh"
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#include
#endif
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
#define LDG(ptr) (*(ptr))
#else
#define LDG(ptr) (__ldg(ptr))
@@ -2138,7 +2138,7 @@ Tensor permute102_baddbmm_permute102_cuda(
// C (m, b, n) = A (m, b, k) * B (b, k, n) ---> row major
// C (m, b, n) = (B^T (b, k, n) * A^T (m, b, k))^T ---> column major
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
float alpha = 1.0f;
float beta = 1.0f;
diff --git a/fbgemm_gpu/src/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops_cpu.cpp
index 032dcff5fa..b75986ffae 100644
--- a/fbgemm_gpu/src/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops_cpu.cpp
@@ -57,7 +57,7 @@ namespace fbgemm_gpu {
Tensor native_empty_like(const Tensor& self) {
return at::native::empty_like(
self,
- optTypeMetaToScalarType(self.options().dtype_opt()),
+ at::optTypeMetaToScalarType(self.options().dtype_opt()),
self.options().layout_opt(),
self.options().device_opt(),
self.options().pinned_memory_opt(),
diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu
index b359e26d4e..88f449bfaa 100644
--- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu
@@ -425,7 +425,7 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel(
lru_state[cache_set][slot] = time_stamp;
}
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) {
#else
if (!__any_sync(0xFFFFFFFF, found)) {
@@ -474,7 +474,7 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel
cache_sets[n] = -1; // default value
} else {
// There is no atomicMax for int64_t...
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
auto addr = reinterpret_cast(
&lxu_cache_miss_timestamp[cache_set][0]);
auto val = static_cast(time_stamp);
@@ -1504,7 +1504,7 @@ __global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel(
const auto slot = threadIdx.x;
const bool found = __ldg((&lxu_cache_state[cache_set][0]) + slot) == idx;
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) {
#else
if (!__any_sync(0xFFFFFFFF, found)) {
@@ -2208,7 +2208,7 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel(
}
const int32_t cache_set = cache_slot(idx, C);
const bool found = (__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx);
-#ifdef __HIP_PLATFORM_HCC__
+#ifdef USE_ROCM
// FIXME: __ballot_sync with mask isn't supported by HIP yet.
// See https://fburl.com/fvy7j0lq for the similar context.
// assert false here with https://fburl.com/pfm7enw2
diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc
index cdc80e4d51..d7ed7afa67 100644
--- a/src/EmbeddingSpMDM.cc
+++ b/src/EmbeddingSpMDM.cc
@@ -1479,7 +1479,7 @@ void compressed_indices_remap(
const inst_set_t isa = fbgemmInstructionSet();
if (isZmm(isa)) {
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef USE_ROCM
if (weights == nullptr) {
internal::compressed_indices_remap_avx512(
offsets_len,