Skip to content

Commit

Permalink
Add i-cache flush for AMD GPUs into FBGEMM
Browse files Browse the repository at this point in the history
Summary: - Add a function into FBGEMM to flush i-cache

Differential Revision: D63296513
  • Loading branch information
zjing14 authored and facebook-github-bot committed Oct 2, 2024
1 parent 7c2bfb8 commit 7b4a194
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 0 deletions.
1 change: 1 addition & 0 deletions fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class CKMatmul(torch.nn.Module):
def forward(
self, a: torch.Tensor, b: torch.Tensor, bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
torch.ops.fbgemm.flush_icache_hip()
return torch.ops.fbgemm.bf16_gemm(a, b, bias)


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <cstdlib>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <tuple>
#include <unordered_map>

#include <ATen/ATen.h>
#include <c10/hip/HIPStream.h>
#include <torch/torch.h>

#if defined(USE_ROCM)

#include "ck/ck.hpp"
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/flush_icache.hpp"

namespace fbgemm_gpu {

void flush_icache_ck()
{
hipDeviceProp_t deviceProps;
hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;

auto stream = at::cuda::getCurrentHIPStream().stream();

ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, stream>>>();
hip_check_error(hipGetLastError());
}

} // namespace fbgemm_gpu

#endif // defined(USE_ROCM)
10 changes: 10 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

namespace fbgemm_gpu {

#ifdef USE_ROCM
// flush icache
void flush_icache_ck();
#endif

// SmoothQuant kernels
at::Tensor
i8i8bf16(at::Tensor XQ, at::Tensor WQ, double scale, int64_t split_k);
Expand Down Expand Up @@ -175,6 +180,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.impl(
"quantize_fp8_per_tensor_fixed_scale",
quantize_fp8_per_tensor_fixed_scale);

#ifdef USE_ROCM
m.def("flush_icache_hip() -> ()");
m.impl("flush_icache_hip", flush_icache_ck);
#endif
}

TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
Expand Down

0 comments on commit 7b4a194

Please sign in to comment.