From c4d332826a41dd87ac85af585662870cf505b210 Mon Sep 17 00:00:00 2001 From: Li Li Date: Wed, 25 Sep 2024 20:00:24 +0000 Subject: [PATCH] enable performance branch on rocm/pytorch-private:20240715_exec_dashboard_vLLM_nightly --- fbgemm_gpu/cmake/Hip.cmake | 233 +++++++----------- .../codegen/embedding_backward_dense_host.cpp | 4 +- ...embedding_backward_split_host_template.cpp | 4 +- .../embedding_backward_split_template.cu | 8 +- ...edding_forward_quantized_split_template.cu | 4 +- .../embedding_forward_split_template.cu | 4 +- fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp | 2 +- .../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 28 +-- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh | 2 +- fbgemm_gpu/src/jagged_tensor_ops.cu | 16 +- .../src/merge_pooled_embeddings_gpu.cpp | 2 +- fbgemm_gpu/src/metric_ops.cu | 4 +- fbgemm_gpu/src/quantize_ops.cu | 6 +- fbgemm_gpu/src/sparse_ops.cu | 6 +- fbgemm_gpu/src/sparse_ops_cpu.cpp | 2 +- fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 8 +- src/EmbeddingSpMDM.cc | 2 +- 17 files changed, 142 insertions(+), 193 deletions(-) diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake index 7db59ea9fe..b031ce7ce2 100644 --- a/fbgemm_gpu/cmake/Hip.cmake +++ b/fbgemm_gpu/cmake/Hip.cmake @@ -1,123 +1,84 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + set(FBGEMM_HAVE_HIP FALSE) -IF(NOT DEFINED ENV{ROCM_PATH}) - SET(ROCM_PATH /opt/rocm) -ELSE() - SET(ROCM_PATH $ENV{ROCM_PATH}) -ENDIF() -if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS}) - set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) +if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH /opt/rocm) else() - set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS}) + set(ROCM_PATH $ENV{ROCM_PATH}) endif() -# HIP_PATH -IF(NOT DEFINED ENV{HIP_PATH}) - SET(HIP_PATH ${ROCM_PATH}/hip) -ELSE() - SET(HIP_PATH $ENV{HIP_PATH}) -ENDIF() - -IF(NOT EXISTS ${HIP_PATH}) - return() -ENDIF() - -# HCC_PATH -IF(NOT DEFINED ENV{HCC_PATH}) - SET(HCC_PATH ${ROCM_PATH}/hcc) -ELSE() - SET(HCC_PATH $ENV{HCC_PATH}) -ENDIF() - -# HSA_PATH -IF(NOT DEFINED ENV{HSA_PATH}) - SET(HSA_PATH ${ROCM_PATH}/hsa) -ELSE() - SET(HSA_PATH $ENV{HSA_PATH}) -ENDIF() - -# ROCBLAS_PATH -IF(NOT DEFINED ENV{ROCBLAS_PATH}) - SET(ROCBLAS_PATH ${ROCM_PATH}/rocblas) -ELSE() - SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH}) -ENDIF() - -# ROCSPARSE_PATH -IF(NOT DEFINED ENV{ROCSPARSE_PATH}) - SET(ROCSPARSE_PATH ${ROCM_PATH}/rocsparse) -ELSE() - SET(ROCSPARSE_PATH $ENV{ROCSPARSE_PATH}) -ENDIF() - -# ROCFFT_PATH -IF(NOT DEFINED ENV{ROCFFT_PATH}) - SET(ROCFFT_PATH ${ROCM_PATH}/rocfft) -ELSE() - SET(ROCFFT_PATH $ENV{ROCFFT_PATH}) -ENDIF() - -# HIPSPARSE_PATH -IF(NOT DEFINED ENV{HIPSPARSE_PATH}) - SET(HIPSPARSE_PATH ${ROCM_PATH}/hipsparse) -ELSE() - SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH}) -ENDIF() - -# THRUST_PATH -IF(NOT DEFINED ENV{THRUST_PATH}) - SET(THRUST_PATH ${ROCM_PATH}/include) -ELSE() - SET(THRUST_PATH $ENV{THRUST_PATH}) -ENDIF() - -# HIPRAND_PATH -IF(NOT DEFINED ENV{HIPRAND_PATH}) - SET(HIPRAND_PATH ${ROCM_PATH}/hiprand) -ELSE() - SET(HIPRAND_PATH $ENV{HIPRAND_PATH}) -ENDIF() - -# ROCRAND_PATH -IF(NOT DEFINED ENV{ROCRAND_PATH}) - SET(ROCRAND_PATH ${ROCM_PATH}/rocrand) -ELSE() - SET(ROCRAND_PATH $ENV{ROCRAND_PATH}) -ENDIF() - -# MIOPEN_PATH -IF(NOT DEFINED ENV{MIOPEN_PATH}) - SET(MIOPEN_PATH ${ROCM_PATH}/miopen) -ELSE() - SET(MIOPEN_PATH $ENV{MIOPEN_PATH}) -ENDIF() -# Add HIP to the CMAKE Module Path -set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) +macro(torch_hip_get_arch_list store_var) + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(_TMP $ENV{PYTORCH_ROCM_ARCH}) + else() + # Use arch of installed GPUs as default + execute_process(COMMAND "rocm_agent_enumerator" COMMAND bash "-c" "grep -v gfx000 | sort -u | xargs | tr -d '\n'" + RESULT_VARIABLE ROCM_AGENT_ENUMERATOR_RESULT + OUTPUT_VARIABLE ROCM_ARCH_INSTALLED) + if(NOT ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0) + message(FATAL_ERROR " Could not detect ROCm arch for GPUs on machine. Result: '${ROCM_AGENT_ENUMERATOR_RESULT}'") + endif() + set(_TMP ${ROCM_ARCH_INSTALLED}) + endif() + string(REPLACE " " ";" ${store_var} "${_TMP}") +endmacro() + +torch_hip_get_arch_list(PYTORCH_ROCM_ARCH) +if(PYTORCH_ROCM_ARCH STREQUAL "") + message(FATAL_ERROR "No GPU arch specified for ROCm build. Please use PYTORCH_ROCM_ARCH environment variable to specify GPU archs to build for.") +endif() +message("Building FBGEMM for GPU arch: ${PYTORCH_ROCM_ARCH}") -# Disable Asserts In Code (Can't use asserts on HIP stack.) ADD_DEFINITIONS(-DNDEBUG) ADD_DEFINITIONS(-DUSE_ROCM) -IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) - SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a) -ELSE() - SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) -ENDIF() +# Add HIP to the CMAKE Module Path +set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) + +macro(find_package_and_print_version PACKAGE_NAME) + find_package("${PACKAGE_NAME}" ${ARGN}) + message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}") +endmacro() # Find the HIP Package -find_package(HIP) +find_package_and_print_version(HIP 1.0) -IF(HIP_FOUND) +if(HIP_FOUND) set(FBGEMM_HAVE_HIP TRUE) + set(FOUND_ROCM_VERSION_H FALSE) + + if(EXISTS ${ROCM_PATH}/.info/version-dev) + # ROCM < 4.5, we don't have the header api file, use flat file + file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) + message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") + endif() + + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") + set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") # Find ROCM version for checks # ROCM 5.0 and later will have header api for version management if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) - - set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") - set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + set(FOUND_ROCM_VERSION_H TRUE) file(WRITE ${file} "" "#include \n" + ) + elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) + set(FOUND_ROCM_VERSION_H TRUE) + file(WRITE ${file} "" + "#include \n" + ) + else() + message("********************* rocm_version.h couldnt be found ******************\n") + endif() + + if(FOUND_ROCM_VERSION_H) + file(APPEND ${file} "" "#include \n" "#ifndef ROCM_VERSION_PATCH\n" @@ -177,37 +138,27 @@ IF(HIP_FOUND) message("\n***** Library versions from cmake find_package *****\n") - # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) - set(ROCclr_DIR ${ROCM_PATH}/rocclr/lib/cmake/rocclr) - set(ROCRAND_INCLUDE ${ROCM_PATH}/include) - set(ROCM_SMI_INCLUDE ${ROCM_PATH}/rocm_smi/include) - else() - message(FATAL_ERROR "\n***** The minimal ROCm version is 5.1.0 but have ${ROCM_VERSION_DEV} installed *****\n") - endif() + set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) - find_package(hip REQUIRED) - find_package(rocblas REQUIRED) - find_package(hipfft REQUIRED) - find_package(hiprand REQUIRED) - find_package(rocrand REQUIRED) - find_package(hipsparse REQUIRED) - find_package(rocprim REQUIRED) + find_package_and_print_version(hip REQUIRED) + find_package_and_print_version(hsa-runtime64 REQUIRED) + find_package_and_print_version(amd_comgr REQUIRED) + find_package_and_print_version(rocrand REQUIRED) + find_package_and_print_version(hiprand REQUIRED) + find_package_and_print_version(hipblas REQUIRED) + find_package_and_print_version(rocblas REQUIRED) + find_package_and_print_version(miopen REQUIRED) + find_package_and_print_version(rocfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) + find_package_and_print_version(rccl) + find_package_and_print_version(rocprim REQUIRED) + find_package_and_print_version(hipcub REQUIRED) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) + + # Enabling HIP language support + enable_language(HIP) if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) @@ -216,9 +167,11 @@ IF(HIP_FOUND) endif() message("HIP library name: ${hip_library_name}") - set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) - set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) - FIND_LIBRARY(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${HIP_PATH}/lib) + # TODO: hip_hcc has an interface include flag "-hc" which is only + # recognizable by hcc, but not gcc and clang. Right now in our + # setup, hcc is only used for linking, but it should be used to + # compile the *_hip.cc files as well. + find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib) list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1) # list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) @@ -228,25 +181,21 @@ IF(HIP_FOUND) list(APPEND HIP_CXX_FLAGS -mavx2) list(APPEND HIP_CXX_FLAGS -mf16c) list(APPEND HIP_CXX_FLAGS -mfma) - list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -std=c++20) set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) # Ask hcc to generate device code during compilation so we can use # host linker to link. list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) list(APPEND HIP_HCC_FLAGS -Wno-defaulted-function-deleted) - foreach(fbgemm_rocm_arch ${FBGEMM_ROCM_ARCH}) + foreach(fbgemm_rocm_arch ${PYTORCH_ROCM_ARCH}) list(APPEND HIP_HCC_FLAGS --offload-arch=${fbgemm_rocm_arch}) endforeach() set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE}) set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $ $ ${FBGEMM_HIP_INCLUDE}) - hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) + hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE} /opt/rocm/include/hipblas) list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) - set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH}) - -ELSE() - message("Not able to find HIP installation.") -ENDIF() +endif() \ No newline at end of file diff --git a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp index cbb3d15bce..b33fab15d6 100644 --- a/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp @@ -112,7 +112,7 @@ class SplitLookupFunction_Dense_Op ctx->saved_data["total_hash_size_bits"] = total_hash_size_bits; ctx->saved_data["pooling_mode"] = pooling_mode; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; #else constexpr int32_t BT_block_size = 32; @@ -164,7 +164,7 @@ class SplitLookupFunction_Dense_Op TORCH_CHECK(grad_outputs.size() == 1); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; constexpr int32_t max_segment_length_per_warp = 64; #else diff --git a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp index 8cc4dcdd41..aa026f2d09 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp @@ -191,7 +191,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op : {% endfor %} {% if not nobag %} -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; #else constexpr int32_t BT_block_size = 32; @@ -264,7 +264,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op : TORCH_CHECK(grad_outputs.size() == 1); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM constexpr int32_t BT_block_size = 64; constexpr int32_t max_segment_length_per_warp = 64; #else diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu index 343bcefff3..5b68400798 100644 --- a/fbgemm_gpu/codegen/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_backward_split_template.cu @@ -865,7 +865,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ // V100: 96 KB; A100: 160 KB. int max_shared_bytes = 0; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_weights.get_device()); #else // MI100 has 64 KB local memory (shared memory) per workgroup @@ -874,7 +874,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ C10_CUDA_KERNEL_LAUNCH_CHECK(); int shared_kb = max_shared_bytes >> 10; // V100: 64 KB; A100: 96 KB. -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -1112,7 +1112,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ // must use dynamic shared memory (rather than statically sized // arrays) and require an explicit opt-in using cudaFuncSetAttribute()". -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_cta_per_row_1< {% if not dense %} @@ -1218,7 +1218,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_ {% endif %}, true>) * 4 * kWarpSize * kMaxVecsPerThread; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_warp_per_row_1< {% if not dense %} diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index e6f7f8c466..3cdf5fb371 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -297,7 +297,7 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no } // equivalent to fence + wait. cp_async_wait<0>(); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM // Performance - replace a block level __syncthreads with per CU __threadfence_block // __threadfence_block is fine replacement for __syncwarp on AMD GPUs, it is because // a. memory fencing: __threadfence_block ops. at CU level, same as __syncwarp at SM @@ -491,7 +491,7 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ const uint32_t subwarp_id = threadIdx.x / 4; const uint32_t subwarp_tid = threadIdx.x % 4; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM const uint64_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); #else const uint32_t subwarp_mask = static_cast(0xF) << (4 * subwarp_id); diff --git a/fbgemm_gpu/codegen/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_template.cu index 3017b66dbb..59a6713782 100644 --- a/fbgemm_gpu/codegen/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_split_template.cu @@ -21,7 +21,7 @@ #define SHFL_SYNC(val, srcLane) shfl_sync(val, srcLane, kThreadGroupSize, shfl_sync_mask) -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include "hip_kernel/split_tbe_fwd.hip.hpp" #endif @@ -522,7 +522,7 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else "" return output; } -#ifdef __HIP_PLATFORM_HCC__ // HIP Optimal Kernel +#ifdef USE_ROCM // HIP Optimal Kernel /* * current limitations 1. sparse, and bag diff --git a/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp b/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp index 03d0471e19..c0a468d396 100644 --- a/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp +++ b/fbgemm_gpu/hip_kernel/split_tbe_fwd_hip.cpp @@ -20,7 +20,7 @@ * THE SOFTWARE. * ******************************************************************************/ -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include #include diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 4437902d02..0366ccf2ea 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -13,7 +13,7 @@ #include #include #include -#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && \ +#if !defined(USE_ROCM) && defined(CUDA_VERSION) && \ CUDA_VERSION >= 9000 #define FBGEMM_USE_SUBWARP_SHUFFLE #endif @@ -32,7 +32,7 @@ enum class PrimitiveType : uint8_t { FP = 0, INT = 1, BF = 2 }; #define DEVICE_INLINE __device__ inline __attribute__((always_inline)) // Warp size -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM static constexpr int32_t kWarpSize = 64; #else static constexpr int32_t kWarpSize = 32; @@ -53,7 +53,7 @@ struct Half4 { half2 b; __device__ inline void store(at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM p[0] = __low2half(a); p[1] = __high2half(a); p[2] = __low2half(b); @@ -101,7 +101,7 @@ struct Vec4T { } DEVICE_INLINE Vec4T(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -235,7 +235,7 @@ struct Vec4T { } DEVICE_INLINE Vec4T(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -329,7 +329,7 @@ struct Vec4T { } DEVICE_INLINE static void copy(const at::Half* src, at::Half* dst) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; @@ -429,7 +429,7 @@ struct Vec4T { } DEVICE_INLINE Vec4T(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -589,7 +589,7 @@ struct Vec4T { } DEVICE_INLINE Vec4T(const at::Half* p) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM union U { half2 h[2]; uint2 ui; @@ -734,7 +734,7 @@ DEVICE_INLINE T shfl_xor( int laneMask, int width = kWarpSize, unsigned shfl_sync_mask = 0xffffffffu) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl_xor(val, laneMask, width); #else return __shfl_xor_sync(shfl_sync_mask, val, laneMask, width); @@ -747,7 +747,7 @@ DEVICE_INLINE T shfl_sync( int srcLane = 0, int width = kWarpSize, unsigned shfl_sync_mask = 0xffffffffu) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl(val, srcLane, width); #else return __shfl_sync(shfl_sync_mask, val, srcLane, width); @@ -841,7 +841,7 @@ inline __device__ void warpBitonicMergeLE16(K& k, V& v) { template struct BitonicSort { static inline __device__ void sort(K k[1], V v[1]) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM static_assert(fbgemm_gpu::kWarpSize == 64, "unexpected warp size"); #else static_assert(fbgemm_gpu::kWarpSize == 32, "unexpected warp size"); @@ -1432,7 +1432,7 @@ DEVICE_INLINE float_16 make_zero_float_16() { __forceinline__ __device__ __half2 hfma2(const __half2 a, const __half2 b, const __half2 c) { #if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \ - defined(__HIP_PLATFORM_HCC__) + defined(USE_ROCM) return __hfma2(a, b, c); #else float2 fa, fb, fc; @@ -1447,7 +1447,7 @@ hfma2(const __half2 a, const __half2 b, const __half2 c) { __forceinline__ __device__ half hmul(half a, half b) { #if (__CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610) || \ - defined(__HIP_PLATFORM_HCC__) + defined(USE_ROCM) return __hmul(a, b); #else return __float2half(__half2float(a) * __half2float(b)); @@ -2620,7 +2620,7 @@ DEVICE_INLINE float float16_min(float_16 val) { // ROCm does not natively support __any_sync(). Using __ballot() // (https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html) // to implement __any_sync(). Note: the "warp-size" of AMD GPU is 64. -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM __device__ int __any_sync(uint64_t mask, int predicate) { uint64_t predicate_bit_pattern = __ballot(predicate); return (predicate_bit_pattern & mask) > 0; diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh index 5ca01d3439..5fe67ac463 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh @@ -6,7 +6,7 @@ */ #pragma once -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPCUB_ARCH 1 #endif diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index c26578750c..6a726367a6 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -643,7 +643,7 @@ bool jagged_dense_dense_elementwise_jagged_output_matches_opt( matches &= (y_0_reshaped.size(1) < INT_MAX); int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -653,7 +653,7 @@ bool jagged_dense_dense_elementwise_jagged_output_matches_opt( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -761,7 +761,7 @@ void jagged_dense_elementwise_jagged_output_opt_( at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; if (dynamic_smem_size > cur_max_shared_bytes) { int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -771,7 +771,7 @@ void jagged_dense_elementwise_jagged_output_opt_( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -780,7 +780,7 @@ void jagged_dense_elementwise_jagged_output_opt_( int used_shared_kb = shared_kb; #endif int used_shared_bytes = used_shared_kb << 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_< index_t>, @@ -965,7 +965,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock; if (dynamic_smem_size > cur_max_shared_bytes) { int max_shared_bytes; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaDeviceGetAttribute( &max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, @@ -975,7 +975,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( max_shared_bytes = 64 << 10; #endif int shared_kb = max_shared_bytes >> 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM // Use 2/3 of the available GPU shared mem; leave rooms for L1$. int used_shared_kb = round_down(shared_kb * 2 / 3, 16); TORCH_CHECK(used_shared_kb > 0); @@ -984,7 +984,7 @@ void jagged_dense_dense_elementwise_jagged_output_opt_( int used_shared_kb = shared_kb; #endif int used_shared_bytes = used_shared_kb << 10; -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM cudaFuncSetAttribute( jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_< index_t>, diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 1acb91f54e..9b11349e59 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -15,7 +15,7 @@ #include #include -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include "hip/hip_runtime.h" #include "rocm_smi/rocm_smi.h" diff --git a/fbgemm_gpu/src/metric_ops.cu b/fbgemm_gpu/src/metric_ops.cu index 2115f949d1..13236870df 100644 --- a/fbgemm_gpu/src/metric_ops.cu +++ b/fbgemm_gpu/src/metric_ops.cu @@ -6,7 +6,7 @@ #include // clang-format off -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPCUB_ARCH 1 #include #else @@ -24,7 +24,7 @@ constexpr int NUM_THREADS_PER_BLOCK = 256; namespace fbgemm_gpu { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM namespace cub = hipcub; #endif diff --git a/fbgemm_gpu/src/quantize_ops.cu b/fbgemm_gpu/src/quantize_ops.cu index 932ca684da..b3246b458e 100644 --- a/fbgemm_gpu/src/quantize_ops.cu +++ b/fbgemm_gpu/src/quantize_ops.cu @@ -7,7 +7,7 @@ #include #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM #include #endif @@ -113,7 +113,7 @@ __global__ inline void _float_to_fused8bitrowwise_cuda_kernel( template __device__ inline __attribute__((always_inline)) T quantize_ops_shfl_xor(const T val, int laneMask, int width) { -#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) || CUDA_VERSION < 9000 return __shfl_xor(val, laneMask, width); #else return __shfl_xor_sync(0xffffffff, val, laneMask, width); @@ -133,7 +133,7 @@ __global__ inline void _get_8bit_qparam_cuda_kernel( const int output_columns = ncols_aligned + 2 * sizeof(float); // starting values for future reductions -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define HIPRT_INF_F __int_as_float(0x7f800000) float minimum_element = HIPRT_INF_F; float maximum_element = -HIPRT_INF_F; diff --git a/fbgemm_gpu/src/sparse_ops.cu b/fbgemm_gpu/src/sparse_ops.cu index a6e0266694..99ef18cb45 100644 --- a/fbgemm_gpu/src/sparse_ops.cu +++ b/fbgemm_gpu/src/sparse_ops.cu @@ -26,11 +26,11 @@ #include "fbgemm_gpu/fbgemm_cuda_utils.cuh" #include "fbgemm_gpu/split_embeddings_utils.cuh" -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #include #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM #define LDG(ptr) (*(ptr)) #else #define LDG(ptr) (__ldg(ptr)) @@ -2138,7 +2138,7 @@ Tensor permute102_baddbmm_permute102_cuda( // C (m, b, n) = A (m, b, k) * B (b, k, n) ---> row major // C (m, b, n) = (B^T (b, k, n) * A^T (m, b, k))^T ---> column major -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM float alpha = 1.0f; float beta = 1.0f; diff --git a/fbgemm_gpu/src/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops_cpu.cpp index 032dcff5fa..b75986ffae 100644 --- a/fbgemm_gpu/src/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops_cpu.cpp @@ -57,7 +57,7 @@ namespace fbgemm_gpu { Tensor native_empty_like(const Tensor& self) { return at::native::empty_like( self, - optTypeMetaToScalarType(self.options().dtype_opt()), + at::optTypeMetaToScalarType(self.options().dtype_opt()), self.options().layout_opt(), self.options().device_opt(), self.options().pinned_memory_opt(), diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index b359e26d4e..88f449bfaa 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -425,7 +425,7 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( lru_state[cache_set][slot] = time_stamp; } -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { #else if (!__any_sync(0xFFFFFFFF, found)) { @@ -474,7 +474,7 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel cache_sets[n] = -1; // default value } else { // There is no atomicMax for int64_t... -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM auto addr = reinterpret_cast( &lxu_cache_miss_timestamp[cache_set][0]); auto val = static_cast(time_stamp); @@ -1504,7 +1504,7 @@ __global__ __launch_bounds__(kMaxThreads) void lfu_cache_find_uncached_kernel( const auto slot = threadIdx.x; const bool found = __ldg((&lxu_cache_state[cache_set][0]) + slot) == idx; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM if (!__any_sync(0xFFFFFFFFFFFFFFFF, found)) { #else if (!__any_sync(0xFFFFFFFF, found)) { @@ -2208,7 +2208,7 @@ __global__ __launch_bounds__(kMaxThreads) void lxu_cache_lookup_kernel( } const int32_t cache_set = cache_slot(idx, C); const bool found = (__ldg((&lxu_cache_state[cache_set][0]) + slot) == idx); -#ifdef __HIP_PLATFORM_HCC__ +#ifdef USE_ROCM // FIXME: __ballot_sync with mask isn't supported by HIP yet. // See https://fburl.com/fvy7j0lq for the similar context. // assert false here with https://fburl.com/pfm7enw2 diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc index cdc80e4d51..d7ed7afa67 100644 --- a/src/EmbeddingSpMDM.cc +++ b/src/EmbeddingSpMDM.cc @@ -1479,7 +1479,7 @@ void compressed_indices_remap( const inst_set_t isa = fbgemmInstructionSet(); if (isZmm(isa)) { -#ifndef __HIP_PLATFORM_HCC__ +#ifndef USE_ROCM if (weights == nullptr) { internal::compressed_indices_remap_avx512( offsets_len,