
Commit

Move GpuDriver HostAllocate and HostDeallocate functions into the proper Executor classes.

PiperOrigin-RevId: 685451178
klucke authored and tensorflower-gardener committed Oct 13, 2024
1 parent a2cf898 commit 2105296
Showing 7 changed files with 80 additions and 85 deletions.
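
In summary, pinned host allocation and deallocation move out of the static GpuDriver helpers and into the executors themselves: each backend now implements HostMemoryAllocate and HostMemoryDeallocate out of line, backed by file-local helpers that call cuMemHostAlloc/cuMemFreeHost (CUDA) or hipHostMalloc/hipHostFree (ROCm). A minimal caller-side sketch of the resulting interface follows; it assumes a StreamExecutor* is at hand, that HostMemoryAllocate is reachable through the StreamExecutor interface, and that MemoryAllocation exposes opaque() and size() accessors. The helper FillPinnedScratch is hypothetical and not part of this commit.

// Hypothetical caller-side sketch, not part of this commit.
#include <cstdint>
#include <cstring>
#include <memory>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/stream_executor.h"

absl::Status FillPinnedScratch(stream_executor::StreamExecutor* executor,
                               uint64_t bytes) {
  // Pinned host memory now comes from the executor's virtual
  // HostMemoryAllocate rather than the static GpuDriver::HostAllocate.
  absl::StatusOr<std::unique_ptr<stream_executor::MemoryAllocation>> pinned =
      executor->HostMemoryAllocate(bytes);
  if (!pinned.ok()) return pinned.status();
  // opaque() and size() are assumed to return the raw host pointer and the
  // allocation length.
  std::memset((*pinned)->opaque(), 0, (*pinned)->size());
  // When the unique_ptr goes out of scope, cleanup is expected to route back
  // through the executor's HostMemoryDeallocate (cuMemFreeHost/hipHostFree).
  return absl::OkStatus();
}

The gpu_driver.h hunk below drops the corresponding static declarations, and both cuda_driver.cc and rocm_driver.cc lose their definitions.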
21 changes: 0 additions & 21 deletions third_party/xla/xla/stream_executor/cuda/cuda_driver.cc
@@ -787,27 +787,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) {
}
}

void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation(context);
void* host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
}
return host_mem;
}

void GpuDriver::HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
}
}

absl::Status GpuDriver::SynchronizeStream(Context* context, CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
43 changes: 41 additions & 2 deletions third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -70,9 +70,11 @@ limitations under the License.
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/gpu/read_numa_node.h"
#include "xla/stream_executor/gpu/scoped_activate_context.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/launch_dim.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/module_spec.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/plugin_registry.h"
@@ -503,6 +505,29 @@ void DeviceDeallocate(Context* context, void* location) {
}
}

// Allocates memory on the host.
void* HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation(context);
void* host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
if (!status.ok()) {
LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
}
return host_mem;
}

// Deallocates memory allocated via HostAllocate.
void HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation(context);
auto status = cuda::ToStatus(cuMemFreeHost(location));
if (!status.ok()) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< status;
}
}

} // namespace

// Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -878,12 +903,22 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(nullptr, 0);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
}
CHECK_EQ(memory_space, 0);
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
}

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
auto* buffer = HostAllocate(gpu_context(), size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
auto status_or_memory_space = GetPointerMemorySpace(mem->opaque());
if (!status_or_memory_space.ok()) {
@@ -892,12 +927,16 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
GpuDriver::HostDeallocate(gpu_context(), mem->opaque());
HostDeallocate(gpu_context(), mem->opaque());
} else {
DeviceDeallocate(gpu_context(), mem->opaque());
}
}

void CudaExecutor::HostMemoryDeallocate(void* location) {
return HostDeallocate(gpu_context(), location);
}

bool CudaExecutor::SynchronizeAllActivity() {
return gpu_context()->Synchronize().ok();
}
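
The `this` argument passed to std::make_unique<HostMemoryAllocation>(buffer, size, this) ties the allocation's lifetime back to the executor: when the allocation object is destroyed, teardown is expected to call the executor's HostMemoryDeallocate, which after this commit performs the cuMemFreeHost call itself instead of delegating to GpuDriver::HostDeallocate. Below is a hypothetical, self-contained RAII wrapper following the same ownership pattern; ScopedPinnedHostBuffer is illustrative only, not the real HostMemoryAllocation, and it assumes HostMemoryDeallocate is publicly callable on StreamExecutor.

// Illustrative RAII wrapper; the real class is HostMemoryAllocation in
// xla/stream_executor/host_memory_allocation.h.
#include <cstdint>

#include "xla/stream_executor/stream_executor.h"

class ScopedPinnedHostBuffer {
 public:
  ScopedPinnedHostBuffer(void* ptr, uint64_t size,
                         stream_executor::StreamExecutor* executor)
      : ptr_(ptr), size_(size), executor_(executor) {}

  // Teardown routes back into the owning executor, which is where the
  // cuMemFreeHost/hipHostFree call now lives after this commit.
  ~ScopedPinnedHostBuffer() {
    if (ptr_ != nullptr) executor_->HostMemoryDeallocate(ptr_);
  }

  void* data() const { return ptr_; }
  uint64_t size() const { return size_; }

 private:
  void* ptr_ = nullptr;
  uint64_t size_ = 0;
  stream_executor::StreamExecutor* executor_ = nullptr;
};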
16 changes: 2 additions & 14 deletions third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -30,7 +30,6 @@ limitations under the License.
#include "absl/numeric/int128.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "xla/stream_executor/blas.h"
@@ -46,7 +45,6 @@ limitations under the License.
#include "xla/stream_executor/gpu/gpu_executor.h"
#include "xla/stream_executor/gpu/gpu_kernel.h"
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/memory_allocation.h"
@@ -118,19 +116,9 @@ class CudaExecutor : public GpuExecutor {
void* UnifiedMemoryAllocate(uint64_t size) override;
void UnifiedMemoryDeallocate(void* location) override;
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
uint64_t size) override {
auto* buffer = GpuDriver::HostAllocate(gpu_context(), size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void HostMemoryDeallocate(void* location) override {
return GpuDriver::HostDeallocate(gpu_context(), location);
}
uint64_t size) override;

void HostMemoryDeallocate(void* location) override;
bool HostMemoryRegister(void* location, uint64_t size) override;
bool HostMemoryUnregister(void* location) override;

12 changes: 0 additions & 12 deletions third_party/xla/xla/stream_executor/gpu/gpu_driver.h
@@ -69,18 +69,6 @@ class GpuDriver {
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#stream-management
static void DestroyStream(Context* context, GpuStreamHandle stream);

// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost/hipHostMalloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management
static void* HostAllocate(Context* context, uint64_t bytes);

// Deallocates a location created by HostAllocate, via
// cuMemFreeHost/hipHostFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management
static void HostDeallocate(Context* context, void* location);

// Launches a CUDA/ROCm kernel via cuLaunchKernel/hipModuleLaunchKernel.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#execution-control
21 changes: 0 additions & 21 deletions third_party/xla/xla/stream_executor/rocm/rocm_driver.cc
@@ -610,27 +610,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) {
}
}

void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation{context};
void* host_mem = nullptr;
// "Portable" memory is visible to all ROCM contexts. Safe for our use model.
hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
if (res != hipSuccess) {
LOG(ERROR) << "failed to alloc " << bytes
<< " bytes on host: " << ToString(res);
}
return host_mem;
}

void GpuDriver::HostDeallocate(Context* context, void* location) {
ScopedActivateContext activation{context};
hipError_t res = wrap::hipHostFree(location);
if (res != hipSuccess) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< ToString(res);
}
}

absl::Status GpuDriver::SynchronizeStream(Context* context,
GpuStreamHandle stream) {
ScopedActivateContext activated{context};
36 changes: 35 additions & 1 deletion third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
@@ -58,9 +58,11 @@ limitations under the License.
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/gpu/read_numa_node.h"
#include "xla/stream_executor/gpu/scoped_activate_context.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/launch_dim.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/module_spec.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform/initialize.h"
@@ -462,6 +464,20 @@ void DeviceDeallocate(Context* context, void* location) {
<< context->device_ordinal();
}
}

// Allocates memory on the host.
void* HostAllocate(Context* context, uint64_t bytes) {
ScopedActivateContext activation{context};
void* host_mem = nullptr;
// "Portable" memory is visible to all ROCM contexts. Safe for our use model.
hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
if (res != hipSuccess) {
LOG(ERROR) << "failed to alloc " << bytes
<< " bytes on host: " << ToString(res);
}
return host_mem;
}

} // namespace

RocmExecutor::~RocmExecutor() {
@@ -711,11 +727,29 @@ absl::Status RocmExecutor::LoadModuleFromHsaco(const char* hsaco,
DeviceMemoryBase RocmExecutor::Allocate(uint64_t size, int64_t memory_space) {
if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
}
CHECK_EQ(memory_space, 0);
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
}
absl::StatusOr<std::unique_ptr<MemoryAllocation>>
RocmExecutor::HostMemoryAllocate(uint64_t size) {
auto* buffer = HostAllocate(gpu_context(), size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void RocmExecutor::HostMemoryDeallocate(void* location) {
ScopedActivateContext activation{gpu_context()};
hipError_t res = wrap::hipHostFree(location);
if (res != hipSuccess) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< ToString(res);
}
}

void RocmExecutor::Deallocate(DeviceMemoryBase* mem) {
DeviceDeallocate(gpu_context(), mem->opaque());
16 changes: 2 additions & 14 deletions third_party/xla/xla/stream_executor/rocm/rocm_executor.h
@@ -30,7 +30,6 @@ limitations under the License.
#include "absl/numeric/int128.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "xla/stream_executor/blas.h"
@@ -45,7 +44,6 @@ limitations under the License.
#include "xla/stream_executor/gpu/gpu_executor.h"
#include "xla/stream_executor/gpu/gpu_kernel.h"
#include "xla/stream_executor/gpu/gpu_types.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/memory_allocation.h"
@@ -111,18 +109,8 @@ class RocmExecutor : public GpuExecutor {

void UnifiedMemoryDeallocate(void* location) override;
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
uint64_t size) override {
auto* buffer = GpuDriver::HostAllocate(gpu_context(), size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void HostMemoryDeallocate(void* location) override {
return GpuDriver::HostDeallocate(gpu_context(), location);
}
uint64_t size) override;
void HostMemoryDeallocate(void* location) override;

absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override;

Expand Down
