From fc7f5dfff3f67472bd58a34854a681db006bd01f Mon Sep 17 00:00:00 2001
From: Jing Zhang
Date: Thu, 3 Oct 2024 08:17:52 -0700
Subject: [PATCH] Add i-cache flush for AMD GPUs into FBGEMM (#3208)

Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/307

- Add a function into FBGEMM to flush i-cache

Reviewed By: zixi-qi

Differential Revision: D63296513
---
 .../src/quantize/ck_extensions/ck_utility.hip | 44 +++++++++++++++++++
 .../gen_ai/src/quantize/quantize.cpp          | 10 +++++
 2 files changed, 54 insertions(+)
 create mode 100644 fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip

diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip
new file mode 100644
index 000000000..25f532ad3
--- /dev/null
+++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(USE_ROCM) + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/utility/flush_icache.hpp" + +namespace fbgemm_gpu { + +void flush_icache_ck() +{ + hipDeviceProp_t deviceProps; + hip_check_error(hipGetDeviceProperties(&deviceProps, 0)); + int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; + + auto stream = at::cuda::getCurrentHIPStream().stream(); + + ck::flush_icache<<>>(); + hip_check_error(hipGetLastError()); +} + +} // namespace fbgemm_gpu + +#endif // defined(USE_ROCM) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp index ff5c66766..101a5cba1 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp @@ -27,6 +27,11 @@ namespace fbgemm_gpu { +#ifdef USE_ROCM +// flush icache +void flush_icache_ck(); +#endif + // SmoothQuant kernels at::Tensor i8i8bf16(at::Tensor XQ, at::Tensor WQ, double scale, int64_t split_k); @@ -185,6 +190,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.impl( "quantize_fp8_per_tensor_fixed_scale", quantize_fp8_per_tensor_fixed_scale); + +#ifdef USE_ROCM + m.def("flush_icache_hip() -> ()"); + m.impl("flush_icache_hip", flush_icache_ck); +#endif } TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {