Skip to content

Commit

Permalink
enable performance branch on rocm/pytorch-private:20240715_exec_dashb…
Browse files Browse the repository at this point in the history
…oard_vLLM_nightly
  • Loading branch information
liligwu committed Sep 25, 2024
1 parent 9d181e5 commit c4d3328
Show file tree
Hide file tree
Showing 17 changed files with 142 additions and 193 deletions.
233 changes: 91 additions & 142 deletions fbgemm_gpu/cmake/Hip.cmake
Original file line number Diff line number Diff line change
@@ -1,123 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set(FBGEMM_HAVE_HIP FALSE)

IF(NOT DEFINED ENV{ROCM_PATH})
SET(ROCM_PATH /opt/rocm)
ELSE()
SET(ROCM_PATH $ENV{ROCM_PATH})
ENDIF()
if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS})
set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include)
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH /opt/rocm)
else()
set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS})
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
# HIP_PATH
IF(NOT DEFINED ENV{HIP_PATH})
SET(HIP_PATH ${ROCM_PATH}/hip)
ELSE()
SET(HIP_PATH $ENV{HIP_PATH})
ENDIF()

IF(NOT EXISTS ${HIP_PATH})
return()
ENDIF()

# HCC_PATH
IF(NOT DEFINED ENV{HCC_PATH})
SET(HCC_PATH ${ROCM_PATH}/hcc)
ELSE()
SET(HCC_PATH $ENV{HCC_PATH})
ENDIF()

# HSA_PATH
IF(NOT DEFINED ENV{HSA_PATH})
SET(HSA_PATH ${ROCM_PATH}/hsa)
ELSE()
SET(HSA_PATH $ENV{HSA_PATH})
ENDIF()

# ROCBLAS_PATH
IF(NOT DEFINED ENV{ROCBLAS_PATH})
SET(ROCBLAS_PATH ${ROCM_PATH}/rocblas)
ELSE()
SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH})
ENDIF()

# ROCSPARSE_PATH
IF(NOT DEFINED ENV{ROCSPARSE_PATH})
SET(ROCSPARSE_PATH ${ROCM_PATH}/rocsparse)
ELSE()
SET(ROCSPARSE_PATH $ENV{ROCSPARSE_PATH})
ENDIF()

# ROCFFT_PATH
IF(NOT DEFINED ENV{ROCFFT_PATH})
SET(ROCFFT_PATH ${ROCM_PATH}/rocfft)
ELSE()
SET(ROCFFT_PATH $ENV{ROCFFT_PATH})
ENDIF()

# HIPSPARSE_PATH
IF(NOT DEFINED ENV{HIPSPARSE_PATH})
SET(HIPSPARSE_PATH ${ROCM_PATH}/hipsparse)
ELSE()
SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
ENDIF()

# THRUST_PATH
IF(NOT DEFINED ENV{THRUST_PATH})
SET(THRUST_PATH ${ROCM_PATH}/include)
ELSE()
SET(THRUST_PATH $ENV{THRUST_PATH})
ENDIF()

# HIPRAND_PATH
IF(NOT DEFINED ENV{HIPRAND_PATH})
SET(HIPRAND_PATH ${ROCM_PATH}/hiprand)
ELSE()
SET(HIPRAND_PATH $ENV{HIPRAND_PATH})
ENDIF()

# ROCRAND_PATH
IF(NOT DEFINED ENV{ROCRAND_PATH})
SET(ROCRAND_PATH ${ROCM_PATH}/rocrand)
ELSE()
SET(ROCRAND_PATH $ENV{ROCRAND_PATH})
ENDIF()

# MIOPEN_PATH
IF(NOT DEFINED ENV{MIOPEN_PATH})
SET(MIOPEN_PATH ${ROCM_PATH}/miopen)
ELSE()
SET(MIOPEN_PATH $ENV{MIOPEN_PATH})
ENDIF()

# Add HIP to the CMAKE Module Path
set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
macro(torch_hip_get_arch_list store_var)
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
set(_TMP $ENV{PYTORCH_ROCM_ARCH})
else()
# Use arch of installed GPUs as default
execute_process(COMMAND "rocm_agent_enumerator" COMMAND bash "-c" "grep -v gfx000 | sort -u | xargs | tr -d '\n'"
RESULT_VARIABLE ROCM_AGENT_ENUMERATOR_RESULT
OUTPUT_VARIABLE ROCM_ARCH_INSTALLED)
if(NOT ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0)
message(FATAL_ERROR " Could not detect ROCm arch for GPUs on machine. Result: '${ROCM_AGENT_ENUMERATOR_RESULT}'")
endif()
set(_TMP ${ROCM_ARCH_INSTALLED})
endif()
string(REPLACE " " ";" ${store_var} "${_TMP}")
endmacro()

torch_hip_get_arch_list(PYTORCH_ROCM_ARCH)
if(PYTORCH_ROCM_ARCH STREQUAL "")
message(FATAL_ERROR "No GPU arch specified for ROCm build. Please use PYTORCH_ROCM_ARCH environment variable to specify GPU archs to build for.")
endif()
message("Building FBGEMM for GPU arch: ${PYTORCH_ROCM_ARCH}")

# Disable Asserts In Code (Can't use asserts on HIP stack.)
ADD_DEFINITIONS(-DNDEBUG)
ADD_DEFINITIONS(-DUSE_ROCM)

IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH})
SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a)
ELSE()
SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH})
ENDIF()
# Add HIP to the CMAKE Module Path
set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH})

macro(find_package_and_print_version PACKAGE_NAME)
find_package("${PACKAGE_NAME}" ${ARGN})
message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
endmacro()

# Find the HIP Package
find_package(HIP)
find_package_and_print_version(HIP 1.0)

IF(HIP_FOUND)
if(HIP_FOUND)
set(FBGEMM_HAVE_HIP TRUE)
set(FOUND_ROCM_VERSION_H FALSE)

if(EXISTS ${ROCM_PATH}/.info/version-dev)
# ROCM < 4.5, we don't have the header api file, use flat file
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)
message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n")
endif()

set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}")
set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc")

# Find ROCM version for checks
# ROCM 5.0 and later will have header api for version management
if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h)

set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}")
set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc")
set(FOUND_ROCM_VERSION_H TRUE)
file(WRITE ${file} ""
"#include <rocm_version.h>\n"
)
elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h)
set(FOUND_ROCM_VERSION_H TRUE)
file(WRITE ${file} ""
"#include <rocm-core/rocm_version.h>\n"
)
else()
message("********************* rocm_version.h couldnt be found ******************\n")
endif()

if(FOUND_ROCM_VERSION_H)
file(APPEND ${file} ""
"#include <cstdio>\n"

"#ifndef ROCM_VERSION_PATCH\n"
Expand Down Expand Up @@ -177,37 +138,27 @@ IF(HIP_FOUND)

message("\n***** Library versions from cmake find_package *****\n")

# As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/<package>
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0")
set(hip_DIR ${HIP_PATH}/lib/cmake/hip)
set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64)
set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs)
set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr)
set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand)
set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand)
set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas)
set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen)
set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft)
set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft)
set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse)
set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl)
set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim)
set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub)
set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust)
set(ROCclr_DIR ${ROCM_PATH}/rocclr/lib/cmake/rocclr)
set(ROCRAND_INCLUDE ${ROCM_PATH}/include)
set(ROCM_SMI_INCLUDE ${ROCM_PATH}/rocm_smi/include)
else()
message(FATAL_ERROR "\n***** The minimal ROCm version is 5.1.0 but have ${ROCM_VERSION_DEV} installed *****\n")
endif()
set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})

find_package(hip REQUIRED)
find_package(rocblas REQUIRED)
find_package(hipfft REQUIRED)
find_package(hiprand REQUIRED)
find_package(rocrand REQUIRED)
find_package(hipsparse REQUIRED)
find_package(rocprim REQUIRED)
find_package_and_print_version(hip REQUIRED)
find_package_and_print_version(hsa-runtime64 REQUIRED)
find_package_and_print_version(amd_comgr REQUIRED)
find_package_and_print_version(rocrand REQUIRED)
find_package_and_print_version(hiprand REQUIRED)
find_package_and_print_version(hipblas REQUIRED)
find_package_and_print_version(rocblas REQUIRED)
find_package_and_print_version(miopen REQUIRED)
find_package_and_print_version(rocfft REQUIRED)
find_package_and_print_version(hipsparse REQUIRED)
find_package_and_print_version(rccl)
find_package_and_print_version(rocprim REQUIRED)
find_package_and_print_version(hipcub REQUIRED)
find_package_and_print_version(rocthrust REQUIRED)
find_package_and_print_version(hipsolver REQUIRED)

# Enabling HIP language support
enable_language(HIP)

if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
Expand All @@ -216,9 +167,11 @@ IF(HIP_FOUND)
endif()
message("HIP library name: ${hip_library_name}")

set(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
FIND_LIBRARY(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${HIP_PATH}/lib)
# TODO: hip_hcc has an interface include flag "-hc" which is only
# recognizable by hcc, but not gcc and clang. Right now in our
# setup, hcc is only used for linking, but it should be used to
# compile the *_hip.cc files as well.
find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib)

list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
# list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
Expand All @@ -228,25 +181,21 @@ IF(HIP_FOUND)
list(APPEND HIP_CXX_FLAGS -mavx2)
list(APPEND HIP_CXX_FLAGS -mf16c)
list(APPEND HIP_CXX_FLAGS -mfma)
list(APPEND HIP_CXX_FLAGS -std=c++17)
list(APPEND HIP_CXX_FLAGS -std=c++20)

set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS})
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
list(APPEND HIP_HCC_FLAGS -Wno-defaulted-function-deleted)
foreach(fbgemm_rocm_arch ${FBGEMM_ROCM_ARCH})
foreach(fbgemm_rocm_arch ${PYTORCH_ROCM_ARCH})
list(APPEND HIP_HCC_FLAGS --offload-arch=${fbgemm_rocm_arch})
endforeach()

set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE})
set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}> $<INSTALL_INTERFACE:include> ${FBGEMM_HIP_INCLUDE})

hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE} /opt/rocm/include/hipblas)

list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})

ELSE()
message("Not able to find HIP installation.")
ENDIF()
endif()
4 changes: 2 additions & 2 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class SplitLookupFunction_Dense_Op
ctx->saved_data["total_hash_size_bits"] = total_hash_size_bits;
ctx->saved_data["pooling_mode"] = pooling_mode;

#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
#else
constexpr int32_t BT_block_size = 32;
Expand Down Expand Up @@ -164,7 +164,7 @@ class SplitLookupFunction_Dense_Op

TORCH_CHECK(grad_outputs.size() == 1);

#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
constexpr int32_t max_segment_length_per_warp = 64;
#else
Expand Down
4 changes: 2 additions & 2 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
{% endfor %}

{% if not nobag %}
#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
#else
constexpr int32_t BT_block_size = 32;
Expand Down Expand Up @@ -264,7 +264,7 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :

TORCH_CHECK(grad_outputs.size() == 1);

#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
constexpr int32_t BT_block_size = 64;
constexpr int32_t max_segment_length_per_warp = 64;
#else
Expand Down
8 changes: 4 additions & 4 deletions fbgemm_gpu/codegen/embedding_backward_split_template.cu
Original file line number Diff line number Diff line change
Expand Up @@ -865,7 +865,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_

// V100: 96 KB; A100: 160 KB.
int max_shared_bytes = 0;
#ifndef __HIP_PLATFORM_HCC__
#ifndef USE_ROCM
cudaDeviceGetAttribute(&max_shared_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_weights.get_device());
#else
// MI100 has 64 KB local memory (shared memory) per workgroup
Expand All @@ -874,7 +874,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
C10_CUDA_KERNEL_LAUNCH_CHECK();
int shared_kb = max_shared_bytes >> 10;
// V100: 64 KB; A100: 96 KB.
#ifndef __HIP_PLATFORM_HCC__
#ifndef USE_ROCM
// Use 2/3 of the available GPU shared mem; leave rooms for L1$.
int used_shared_kb = round_down(shared_kb * 2 / 3, 16);
TORCH_CHECK(used_shared_kb > 0);
Expand Down Expand Up @@ -1112,7 +1112,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
// must use dynamic shared memory (rather than statically sized
// arrays) and require an explicit opt-in using cudaFuncSetAttribute()".
#ifndef __HIP_PLATFORM_HCC__
#ifndef USE_ROCM
cudaFuncSetAttribute(
split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_cta_per_row_1<
{% if not dense %}
Expand Down Expand Up @@ -1218,7 +1218,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
{% endif %},
true>) * 4 * kWarpSize *
kMaxVecsPerThread;
#ifndef __HIP_PLATFORM_HCC__
#ifndef USE_ROCM
cudaFuncSetAttribute(
split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_warp_per_row_1<
{% if not dense %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
}
// equivalent to fence + wait.
cp_async_wait<0>();
#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
// Performance - replace a block level __syncthreads with per CU __threadfence_block
// __threadfence_block is fine replacement for __syncwarp on AMD GPUs, it is because
// a. memory fencing: __threadfence_block ops. at CU level, same as __syncwarp at SM
Expand Down Expand Up @@ -491,7 +491,7 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_

const uint32_t subwarp_id = threadIdx.x / 4;
const uint32_t subwarp_tid = threadIdx.x % 4;
#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
const uint64_t subwarp_mask = static_cast<uint64_t>(0xF) << (4 * subwarp_id);
#else
const uint32_t subwarp_mask = static_cast<uint32_t>(0xF) << (4 * subwarp_id);
Expand Down
4 changes: 2 additions & 2 deletions fbgemm_gpu/codegen/embedding_forward_split_template.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#define SHFL_SYNC(val, srcLane) shfl_sync(val, srcLane, kThreadGroupSize, shfl_sync_mask)

#ifdef __HIP_PLATFORM_HCC__
#ifdef USE_ROCM
#include "hip_kernel/split_tbe_fwd.hip.hpp"
#endif

Expand Down Expand Up @@ -522,7 +522,7 @@ Tensor {{ "dense" if dense else "split" }}_embedding{{ "_nobag" if nobag else ""
return output;
}

#ifdef __HIP_PLATFORM_HCC__ // HIP Optimal Kernel
#ifdef USE_ROCM // HIP Optimal Kernel
/*
* current limitations
1. sparse, and bag
Expand Down
Loading

0 comments on commit c4d3328

Please sign in to comment.