From b838d262671ef15055fd9a026d4e67b50db48a17 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 1 Oct 2024 21:18:23 -0700 Subject: [PATCH] Add i-cache flush for AMD GPUs into FBGEMM (#3208) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3208 X-link: https://github.com/facebookresearch/FBGEMM/pull/307 - Add a function into FBGEMM to flush i-cache Differential Revision: D63296513 --- .../gen_ai/bench/ck_bf16_bench.py | 1 + .../src/quantize/ck_extensions/ck_utility.hip | 44 +++++++++++++++++++ .../gen_ai/src/quantize/quantize.cpp | 10 +++++ 3 files changed, 55 insertions(+) create mode 100644 fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip diff --git a/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py b/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py index b1383b79f..94e3c698b 100644 --- a/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py +++ b/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py @@ -45,6 +45,7 @@ class CKMatmul(torch.nn.Module): def forward( self, a: torch.Tensor, b: torch.Tensor, bias: Optional[torch.Tensor] = None ) -> torch.Tensor: + torch.ops.fbgemm.flush_icache_hip() return torch.ops.fbgemm.bf16_gemm(a, b, bias) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip new file mode 100644 index 000000000..25f532ad3 --- /dev/null +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <cmath> +#include <cstdlib> +#include <functional> +#include <initializer_list> +#include <iostream> +#include <numeric> +#include <tuple> + +#include <ATen/ATen.h> +#include <c10/hip/HIPStream.h> +#include <torch/torch.h> + +#if defined(USE_ROCM) + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/utility/flush_icache.hpp" + +namespace fbgemm_gpu { + +void flush_icache_ck() +{ + hipDeviceProp_t deviceProps; + hip_check_error(hipGetDeviceProperties(&deviceProps, 0)); + int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; + + auto stream = at::cuda::getCurrentHIPStream().stream(); + + ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, stream>>>(); + hip_check_error(hipGetLastError()); +} + +} // namespace fbgemm_gpu + +#endif // defined(USE_ROCM) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp index 39084712c..e11ec08c8 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp @@ -27,6 +27,11 @@ namespace fbgemm_gpu { +#ifdef USE_ROCM +// flush icache +void flush_icache_ck(); +#endif + // SmoothQuant kernels at::Tensor i8i8bf16(at::Tensor XQ, at::Tensor WQ, double scale, int64_t split_k); @@ -175,6 +180,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.impl( "quantize_fp8_per_tensor_fixed_scale", quantize_fp8_per_tensor_fixed_scale); + +#ifdef USE_ROCM + m.def("flush_icache_hip() -> ()"); + m.impl("flush_icache_hip", flush_icache_ck); +#endif } TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {