From 21052964875c30472ca43c82132d352898e49822 Mon Sep 17 00:00:00 2001
From: Kyle Lucke
Date: Sun, 13 Oct 2024 10:06:27 -0700
Subject: [PATCH] Move GpuDriver HostAllocate and HostDeallocate functions into
 the proper Executor classes.

PiperOrigin-RevId: 685451178
---
 .../xla/stream_executor/cuda/cuda_driver.cc   | 21 ---------
 .../xla/stream_executor/cuda/cuda_executor.cc | 43 ++++++++++++++++++-
 .../xla/stream_executor/cuda/cuda_executor.h  | 16 +------
 .../xla/xla/stream_executor/gpu/gpu_driver.h  | 12 ------
 .../xla/stream_executor/rocm/rocm_driver.cc   | 21 ---------
 .../xla/stream_executor/rocm/rocm_executor.cc | 36 +++++++++++++++-
 .../xla/stream_executor/rocm/rocm_executor.h  | 16 +------
 7 files changed, 80 insertions(+), 85 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
index 88b955ba05521b..71cbc60cdcba17 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
@@ -787,27 +787,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) {
   }
 }
 
-void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) {
-  ScopedActivateContext activation(context);
-  void* host_mem = nullptr;
-  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  auto status = cuda::ToStatus(
-      cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
-  if (!status.ok()) {
-    LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
-  }
-  return host_mem;
-}
-
-void GpuDriver::HostDeallocate(Context* context, void* location) {
-  ScopedActivateContext activation(context);
-  auto status = cuda::ToStatus(cuMemFreeHost(location));
-  if (!status.ok()) {
-    LOG(ERROR) << "error deallocating host memory at " << location << ": "
-               << status;
-  }
-}
-
 absl::Status GpuDriver::SynchronizeStream(Context* context, CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 3ed5da5318b9f3..44ef40212314d8 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -70,9 +70,11 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/gpu/read_numa_node.h"
 #include "xla/stream_executor/gpu/scoped_activate_context.h"
+#include "xla/stream_executor/host_memory_allocation.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/plugin_registry.h"
@@ -503,6 +505,29 @@ void DeviceDeallocate(Context* context, void* location) {
   }
 }
 
+// Allocates memory on the host.
+void* HostAllocate(Context* context, uint64_t bytes) {
+  ScopedActivateContext activation(context);
+  void* host_mem = nullptr;
+  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+  auto status = cuda::ToStatus(
+      cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
+  if (!status.ok()) {
+    LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
+  }
+  return host_mem;
+}
+
+// Deallocates memory allocated via HostAllocate.
+void HostDeallocate(Context* context, void* location) {
+  ScopedActivateContext activation(context);
+  auto status = cuda::ToStatus(cuMemFreeHost(location));
+  if (!status.ok()) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << status;
+  }
+}
+
 }  // namespace
 
 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -878,12 +903,22 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
     return DeviceMemoryBase(nullptr, 0);
   } else if (memory_space ==
              static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
-    return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
+    return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
   }
   CHECK_EQ(memory_space, 0);
   return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
 }
 
+absl::StatusOr<std::unique_ptr<MemoryAllocation>>
+CudaExecutor::HostMemoryAllocate(uint64_t size) {
+  auto* buffer = HostAllocate(gpu_context(), size);
+  if (buffer == nullptr && size > 0) {
+    return absl::InternalError(
+        absl::StrFormat("Failed to allocate HostMemory of size %d", size));
+  }
+  return std::make_unique<HostMemoryAllocation>(buffer, size, this);
+}
+
 void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
   auto status_or_memory_space = GetPointerMemorySpace(mem->opaque());
   if (!status_or_memory_space.ok()) {
@@ -892,12 +927,16 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
   }
   auto memory_space = status_or_memory_space.value();
   if (memory_space == MemoryType::kHost) {
-    GpuDriver::HostDeallocate(gpu_context(), mem->opaque());
+    HostDeallocate(gpu_context(), mem->opaque());
   } else {
     DeviceDeallocate(gpu_context(), mem->opaque());
   }
 }
 
+void CudaExecutor::HostMemoryDeallocate(void* location) {
+  return HostDeallocate(gpu_context(), location);
+}
+
 bool CudaExecutor::SynchronizeAllActivity() {
   return gpu_context()->Synchronize().ok();
 }
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
index 2b31223c9188bd..a0a4402864cbc1 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -30,7 +30,6 @@ limitations under the License.
 #include "absl/numeric/int128.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/blas.h"
@@ -46,7 +45,6 @@ limitations under the License.
#include "xla/stream_executor/gpu/gpu_executor.h" #include "xla/stream_executor/gpu/gpu_kernel.h" #include "xla/stream_executor/gpu/gpu_types.h" -#include "xla/stream_executor/host_memory_allocation.h" #include "xla/stream_executor/kernel.h" #include "xla/stream_executor/kernel_spec.h" #include "xla/stream_executor/memory_allocation.h" @@ -118,19 +116,9 @@ class CudaExecutor : public GpuExecutor { void* UnifiedMemoryAllocate(uint64_t size) override; void UnifiedMemoryDeallocate(void* location) override; absl::StatusOr> HostMemoryAllocate( - uint64_t size) override { - auto* buffer = GpuDriver::HostAllocate(gpu_context(), size); - if (buffer == nullptr && size > 0) { - return absl::InternalError( - absl::StrFormat("Failed to allocate HostMemory of size %d", size)); - } - return std::make_unique(buffer, size, this); - } - - void HostMemoryDeallocate(void* location) override { - return GpuDriver::HostDeallocate(gpu_context(), location); - } + uint64_t size) override; + void HostMemoryDeallocate(void* location) override; bool HostMemoryRegister(void* location, uint64_t size) override; bool HostMemoryUnregister(void* location) override; diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_driver.h b/third_party/xla/xla/stream_executor/gpu/gpu_driver.h index 5ccc4b75fc558a..278d17421e4ca1 100644 --- a/third_party/xla/xla/stream_executor/gpu/gpu_driver.h +++ b/third_party/xla/xla/stream_executor/gpu/gpu_driver.h @@ -69,18 +69,6 @@ class GpuDriver { // https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#stream-management static void DestroyStream(Context* context, GpuStreamHandle stream); - // Allocates page-locked and CUDA-registered memory on the host via - // cuMemAllocHost/hipHostMalloc. - // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 - // https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management - static void* HostAllocate(Context* context, uint64_t bytes); - - // Deallocates a location created by HostAllocate, via - // cuMemFreeHost/hipHostFree. - // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c - // https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management - static void HostDeallocate(Context* context, void* location); - // Launches a CUDA/ROCm kernel via cuLaunchKernel/hipModuleLaunchKernel. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 // https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#execution-control diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc index c96dfd5c14ebfe..42e7656135710a 100644 --- a/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc +++ b/third_party/xla/xla/stream_executor/rocm/rocm_driver.cc @@ -610,27 +610,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) { } } -void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) { - ScopedActivateContext activation{context}; - void* host_mem = nullptr; - // "Portable" memory is visible to all ROCM contexts. Safe for our use model. 
-  hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "failed to alloc " << bytes
-               << " bytes on host: " << ToString(res);
-  }
-  return host_mem;
-}
-
-void GpuDriver::HostDeallocate(Context* context, void* location) {
-  ScopedActivateContext activation{context};
-  hipError_t res = wrap::hipHostFree(location);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "error deallocating host memory at " << location << ": "
-               << ToString(res);
-  }
-}
-
 absl::Status GpuDriver::SynchronizeStream(Context* context,
                                           GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
index 650bd821f867e6..d3b1b5a0ddcf2f 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
@@ -58,9 +58,11 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/gpu/read_numa_node.h"
 #include "xla/stream_executor/gpu/scoped_activate_context.h"
+#include "xla/stream_executor/host_memory_allocation.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/initialize.h"
@@ -462,6 +464,20 @@ void DeviceDeallocate(Context* context, void* location) {
             << context->device_ordinal();
   }
 }
+
+// Allocates memory on the host.
+void* HostAllocate(Context* context, uint64_t bytes) {
+  ScopedActivateContext activation{context};
+  void* host_mem = nullptr;
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes on host: " << ToString(res);
+  }
+  return host_mem;
+}
+
 }  // namespace
 
 RocmExecutor::~RocmExecutor() {
@@ -711,11 +727,29 @@ absl::Status RocmExecutor::LoadModuleFromHsaco(const char* hsaco,
 DeviceMemoryBase RocmExecutor::Allocate(uint64_t size, int64_t memory_space) {
   if (memory_space ==
       static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
-    return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
+    return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
   }
   CHECK_EQ(memory_space, 0);
   return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
 }
 
+absl::StatusOr<std::unique_ptr<MemoryAllocation>>
+RocmExecutor::HostMemoryAllocate(uint64_t size) {
+  auto* buffer = HostAllocate(gpu_context(), size);
+  if (buffer == nullptr && size > 0) {
+    return absl::InternalError(
+        absl::StrFormat("Failed to allocate HostMemory of size %d", size));
+  }
+  return std::make_unique<HostMemoryAllocation>(buffer, size, this);
+}
+
+void RocmExecutor::HostMemoryDeallocate(void* location) {
+  ScopedActivateContext activation{gpu_context()};
+  hipError_t res = wrap::hipHostFree(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << ToString(res);
+  }
+}
 void RocmExecutor::Deallocate(DeviceMemoryBase* mem) {
   DeviceDeallocate(gpu_context(), mem->opaque());
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor.h b/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
index 07d243fae72dea..dd029894cbc815 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
@@ -30,7 +30,6 @@ limitations under the License.
 #include "absl/numeric/int128.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/stream_executor/blas.h"
@@ -45,7 +44,6 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/gpu/gpu_kernel.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
-#include "xla/stream_executor/host_memory_allocation.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -111,18 +109,8 @@ class RocmExecutor : public GpuExecutor {
   void UnifiedMemoryDeallocate(void* location) override;
 
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
-      uint64_t size) override {
-    auto* buffer = GpuDriver::HostAllocate(gpu_context(), size);
-    if (buffer == nullptr && size > 0) {
-      return absl::InternalError(
-          absl::StrFormat("Failed to allocate HostMemory of size %d", size));
-    }
-    return std::make_unique<HostMemoryAllocation>(buffer, size, this);
-  }
-
-  void HostMemoryDeallocate(void* location) override {
-    return GpuDriver::HostDeallocate(gpu_context(), location);
-  }
+      uint64_t size) override;
+  void HostMemoryDeallocate(void* location) override;
 
   absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override;
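
With this patch, pinned host memory on both backends is reached only through the StreamExecutor interface; GpuDriver no longer exposes HostAllocate/HostDeallocate. The sketch below is illustrative only and is not part of the commit; it assumes the existing PlatformManager/StreamExecutor entry points, the "CUDA" platform name ("ROCM" on AMD builds), and device ordinal 0.

// Illustrative sketch: a caller allocating pinned host memory through the
// executor-level API that this patch routes to the file-local
// HostAllocate/HostDeallocate helpers.
#include <cstdint>
#include <memory>

#include "absl/status/statusor.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"

absl::StatusOr<std::unique_ptr<stream_executor::MemoryAllocation>>
AllocatePinnedHostBuffer(uint64_t bytes) {
  // Platform name and device ordinal are assumptions for the example.
  auto platform = stream_executor::PlatformManager::PlatformWithName("CUDA");
  if (!platform.ok()) return platform.status();
  auto executor = (*platform)->ExecutorForDevice(/*ordinal=*/0);
  if (!executor.ok()) return executor.status();
  // The returned allocation releases the buffer through
  // StreamExecutor::HostMemoryDeallocate when it goes out of scope.
  return (*executor)->HostMemoryAllocate(bytes);
}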