From b838d262671ef15055fd9a026d4e67b50db48a17 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 1 Oct 2024 21:18:23 -0700 Subject: [PATCH] Add i-cache flush for AMD GPUs into FBGEMM (#3208) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3208 X-link: https://github.com/facebookresearch/FBGEMM/pull/307 - Add a function into FBGEMM to flush i-cache Differential Revision: D63296513 --- .../gen_ai/bench/ck_bf16_bench.py | 1 + .../src/quantize/ck_extensions/ck_utility.hip | 44 +++++++++++++++++++ .../gen_ai/src/quantize/quantize.cpp | 10 +++++ 3 files changed, 55 insertions(+) create mode 100644 fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip diff --git a/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py b/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py index b1383b79f..94e3c698b 100644 --- a/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py +++ b/fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py @@ -45,6 +45,7 @@ class CKMatmul(torch.nn.Module): def forward( self, a: torch.Tensor, b: torch.Tensor, bias: Optional[torch.Tensor] = None ) -> torch.Tensor: + torch.ops.fbgemm.flush_icache_hip() return torch.ops.fbgemm.bf16_gemm(a, b, bias) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip new file mode 100644 index 000000000..25f532ad3 --- /dev/null +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <cmath> +#include <cstdlib> +#include <functional> +#include <initializer_list> +#include <iostream> +#include <numeric> +#include <tuple> + +#include <ATen/ATen.h> +#include <c10/hip/HIPStream.h> +#include <torch/torch.h> + +#if defined(USE_ROCM) + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/utility/flush_icache.hpp" + +namespace fbgemm_gpu { + +void flush_icache_ck() +{ + hipDeviceProp_t deviceProps; + hip_check_error(hipGetDeviceProperties(&deviceProps, 0)); + int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; + + auto stream = at::cuda::getCurrentHIPStream().stream(); + + ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, stream>>>(); + hip_check_error(hipGetLastError()); +} + +} // namespace fbgemm_gpu + +#endif // defined(USE_ROCM) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp index 39084712c..e11ec08c8 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp @@ -27,6 +27,11 @@ namespace fbgemm_gpu { +#ifdef USE_ROCM +// flush icache +void flush_icache_ck(); +#endif + // SmoothQuant kernels at::Tensor i8i8bf16(at::Tensor XQ, at::Tensor WQ, double scale, int64_t split_k); @@ -175,6 +180,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.impl( "quantize_fp8_per_tensor_fixed_scale", quantize_fp8_per_tensor_fixed_scale); + +#ifdef USE_ROCM + m.def("flush_icache_hip() -> ()"); + m.impl("flush_icache_hip", flush_icache_ck); +#endif } TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {