ROCm 6.1.1 updates
dayatsin-amd committed May 8, 2024
1 parent 5e29915 commit 953b241
Showing 10 changed files with 69 additions and 28 deletions.
src/core/common/shared.cpp (1 addition, 1 deletion)

@@ -44,7 +44,7 @@
 
 namespace rocr {
 namespace core {
-std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
+std::function<void*(size_t, size_t, uint32_t, int)> BaseShared::allocate_ = nullptr;
 std::function<void(void*)> BaseShared::free_ = nullptr;
 }  // namespace core
 }  // namespace rocr
src/core/common/shared.h (32 additions, 4 deletions)

@@ -58,22 +58,34 @@ namespace core {
 class BaseShared {
  public:
   static void SetAllocateAndFree(
-      const std::function<void*(size_t, size_t, uint32_t)>& allocate,
+      const std::function<void*(size_t, size_t, uint32_t, int)>& allocate,
       const std::function<void(void*)>& free) {
     allocate_ = allocate;
     free_ = free;
   }
 
  protected:
-  static std::function<void*(size_t, size_t, uint32_t)> allocate_;
+  static std::function<void*(size_t, size_t, uint32_t, int)> allocate_;
   static std::function<void(void*)> free_;
 };
 
 /// @brief Default Allocator for Shared. Ensures allocations are whole pages.
 template <typename T> class PageAllocator : private BaseShared {
  public:
   __forceinline static T* alloc(int flags = 0) {
-    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags));
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0));
     if (ret == nullptr) throw std::bad_alloc();
 
     MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
 
     new (ret) T;
 
     throwGuard.Dismiss();
     return ret;
   }
 
+  __forceinline static T* alloc(int agent_node_id, int flags) {
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id));
+    if (ret == nullptr) throw std::bad_alloc();
+
+    MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
@@ -107,6 +119,16 @@ class Shared final : private BaseShared {
     shared_object_ = PageAllocator<T>::alloc(flags);
   }
 
+  explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) {
+    assert(allocate_ != nullptr && free_ != nullptr &&
+           "Shared object allocator is not set");
+
+    if (pool_)
+      shared_object_ = pool_->alloc();
+    else
+      shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
   ~Shared() {
     assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
 
@@ -147,6 +169,12 @@ template <typename T> class Shared<T, PageAllocator<T>> final : private BaseShared {
     shared_object_ = PageAllocator<T>::alloc(flags);
   }
 
+  Shared(int agent_node_id, int flags) {
+    assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
+
+    shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
   ~Shared() {
     assert(allocate_ != nullptr && free_ != nullptr &&
            "Shared object allocator is not set");
@@ -183,7 +211,7 @@ template <typename T, size_t Align> class SharedArray final : private BaseShared {
     static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)");
 
     shared_object_ =
-        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0));
+        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0));
     if (shared_object_ == nullptr) throw std::bad_alloc();
 
     size_t i = 0;
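For illustration, a minimal standalone C++ sketch of the pattern above: the shared-object allocator callback gains a trailing agent-node parameter, and a new alloc() overload forwards it while the legacy overload defaults to node 0. Everything here (the allocate_ stand-in, AlignUp, SharedQueueStub) is invented scaffolding, not the ROCR sources:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <new>

// Stand-in for BaseShared::allocate_, with the widened four-argument shape.
static std::function<void*(std::size_t, std::size_t, uint32_t, int)> allocate_ =
    [](std::size_t size, std::size_t align, uint32_t /*flags*/, int agent_node_id) -> void* {
  // A real build would route agent_node_id down to the KFD allocation call.
  std::cout << "allocate " << size << " bytes for node " << agent_node_id << "\n";
  return std::aligned_alloc(align, size);
};

static std::size_t AlignUp(std::size_t v, std::size_t a) { return (v + a - 1) & ~(a - 1); }

template <typename T> struct PageAllocator {
  // Legacy entry point: unchanged signature, node id defaults to 0.
  static T* alloc(int flags = 0) { return alloc(0, flags); }

  // New overload: the caller names the agent node that should back the pages.
  static T* alloc(int agent_node_id, int flags) {
    void* p = allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id);
    if (p == nullptr) throw std::bad_alloc();
    return new (p) T;  // placement-construct on the fresh pages
  }
};

int main() {
  struct SharedQueueStub { int id = 7; };
  SharedQueueStub* q = PageAllocator<SharedQueueStub>::alloc(/*agent_node_id=*/1, /*flags=*/0);
  std::cout << q->id << "\n";
  q->~SharedQueueStub();
  std::free(q);
}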
src/core/inc/amd_memory_region.h (2 additions, 2 deletions)

@@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion {
 
   ~MemoryRegion();
 
-  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const;
 
   hsa_status_t Free(void* address, size_t size) const;
 
@@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion {
                        const core::Runtime::LinkInfo& link_info) const;
 
   // Operational body for Allocate. Recursive.
-  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const;
 
   // Operational body for Free. Recursive.
   hsa_status_t FreeImpl(void* address, size_t size) const;
src/core/inc/memory_region.h (5 additions, 1 deletion)

@@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
     AllocateAsan = (1 << 6),        // ASAN - First page of allocation remapped to system memory
     AllocatePinned = (1 << 7),      // Currently treating Pinned memory as NoSubstitute
     AllocateMemoryOnly = (1 << 8),  // Memory only handle from thunk, no virtual address
+    // Flag to allocate system memory with GTT access.
+    // Note: the node_id must be the device's node_id even though this allocates
+    // system memory.
+    AllocateGTTAccess = (1 << 9),
   };
 
   typedef uint32_t AllocateFlags;
 
-  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0;
+  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0;
 
   virtual hsa_status_t Free(void* address, size_t size) const = 0;
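As a sketch of how the widened interface stays source-compatible: the pure virtual carries no default, while the AMD-side declaration (amd_memory_region.h above) supplies agent_node_id = 0. The mock below is simplified and hypothetical; the design point is that default arguments bind to the static type, so callers going through the base interface must pass the node explicitly:

#include <cstddef>
#include <iostream>

using hsa_status_t = int;  // simplified stand-ins for the real types
constexpr hsa_status_t HSA_STATUS_SUCCESS = 0;
using AllocateFlags = unsigned;

struct MemoryRegionBase {
  // The pure virtual gains the parameter with no default, as in memory_region.h.
  virtual hsa_status_t Allocate(std::size_t& size, AllocateFlags flags, void** address,
                                int agent_node_id) const = 0;
  virtual ~MemoryRegionBase() = default;
};

struct AmdMemoryRegionMock : MemoryRegionBase {
  // The derived declaration adds the backward-compatible default of 0.
  hsa_status_t Allocate(std::size_t& size, AllocateFlags /*flags*/, void** address,
                        int agent_node_id = 0) const override {
    std::cout << "Allocate " << size << " bytes against node " << agent_node_id << "\n";
    *address = nullptr;  // a mock: no real allocation happens here
    return HSA_STATUS_SUCCESS;
  }
};

int main() {
  AmdMemoryRegionMock region;
  std::size_t size = 4096;
  void* p = nullptr;
  region.Allocate(size, 0, &p);     // legacy call sites compile unchanged (node 0)
  region.Allocate(size, 0, &p, 2);  // new call sites pass the device node
}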
src/core/inc/queue.h (6 additions, 0 deletions)

@@ -162,6 +162,7 @@ struct SharedQueue {
 class LocalQueue {
  public:
   LocalQueue(int mem_flags) : local_queue_(mem_flags) {}
+  LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {}
   SharedQueue* queue() const { return local_queue_.shared_object(); }
 
  private:
@@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
     public_handle_ = Convert(this);
   }
 
+  Queue(int agent_node_id, int mem_flags = 0) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) {
+    queue()->core_queue = this;
+    public_handle_ = Convert(this);
+  }
+
   virtual ~Queue() {}
 
   virtual void Destroy() { delete this; }
src/core/inc/runtime.h (3 additions, 3 deletions)

@@ -198,7 +198,7 @@ class Runtime {
   /// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
   hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
                               MemoryRegion::AllocateFlags alloc_flags,
-                              void** address);
+                              void** address, int agent_node_id = 0);
 
   /// @brief Free memory previously allocated with AllocateMemory.
   ///
@@ -419,7 +419,7 @@ class Runtime {
 
   amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }
 
-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)>&
   system_allocator() {
     return system_allocator_;
   }
@@ -659,7 +659,7 @@ class Runtime {
   prefetch_map_t prefetch_map_;
 
   // Allocator using ::system_region_
-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)> system_allocator_;
 
   // Deallocator using ::system_region_
   std::function<void(void*)> system_deallocator_;
src/core/runtime/amd_aql_queue.cpp (1 addition, 1 deletion)

@@ -79,7 +79,7 @@ int AqlQueue::rtti_id_ = 0;
 
 AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
                    core::HsaEventCallback callback, void* err_data, bool is_kv)
-    : Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0),
+    : Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0),
       LocalSignal(0, false),
       DoorbellSignal(signal()),
       ring_buf_(nullptr),
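A hedged sketch of the constructor change: on MES hardware the queue's shared structures are now requested as GTT-accessible, non-paged system memory against the GPU's own node. GpuAgentStub and the AllocateNonPaged bit position are assumptions for the sketch; only the flag expression mirrors the diff:

#include <cstdint>
#include <iostream>

enum : uint32_t {
  AllocateNonPaged = (1 << 0),   // assumed bit position for this sketch
  AllocateGTTAccess = (1 << 9),  // bit position from memory_region.h above
};

struct GpuAgentStub {
  bool mes;
  int node;
  bool isMES() const { return mes; }
  int node_id() const { return node; }
};

// Mirrors the AqlQueue initializer: node id first, then MES-dependent flags.
static void MakeQueue(const GpuAgentStub& agent) {
  const uint32_t mem_flags =
      agent.isMES() ? (AllocateGTTAccess | AllocateNonPaged) : 0;
  std::cout << "Queue(node=" << agent.node_id() << ", mem_flags=0x"
            << std::hex << mem_flags << std::dec << ")\n";
}

int main() {
  MakeQueue({/*mes=*/true, /*node=*/1});   // MES: GTT-accessible + non-paged
  MakeQueue({/*mes=*/false, /*node=*/2});  // pre-MES: default flags
}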
src/core/runtime/amd_memory_region.cpp (11 additions, 8 deletions)

@@ -57,8 +57,7 @@ namespace AMD {
 // Tracks aggregate size of system memory available on platform
 size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
 
-void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
-                                      HSAuint32 node_id, size_t size) {
+void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
   void* ret = NULL;
   const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
   return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
@@ -168,13 +167,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
 
 MemoryRegion::~MemoryRegion() {}
 
-hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
+hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
   ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
-  return AllocateImpl(size, alloc_flags, address);
+  return AllocateImpl(size, alloc_flags, address, agent_node_id);
 }
 
 hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
-                                        void** address) const {
+                                        void** address, int agent_node_id) const {
   if (address == NULL) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
@@ -207,6 +206,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
   kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
   kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
 
+  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
+
   // Only allow using the suballocator for ordinary VRAM.
   if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
     bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
@@ -226,12 +227,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
     }
   }
 
+  const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
+
   // Allocate memory.
   // If it fails attempt to release memory from the block allocator and retry.
-  *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+  *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
   if (*address == nullptr) {
     owner()->Trim();
-    *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+    *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
   }
 
   if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
@@ -766,7 +769,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
   size_t bsize = AlignUp(request_size, block_size());
 
   hsa_status_t err = region_.AllocateImpl(
-      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
+      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0);
   if (err != HSA_STATUS_SUCCESS)
     throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
   assert(ret != nullptr && "Region returned nullptr on success.");
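To make the new control flow concrete, a standalone mock of the AllocateImpl changes: the GTTAccess bit is copied into the KMT flags, and the allocation targets the requesting agent's node only when AllocateGTTAccess is set, otherwise the owning agent's node as before. HsaMemFlagsStub and AllocateKfdMemoryStub are invented stand-ins for the thunk types:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>

enum : uint32_t { AllocateGTTAccess = (1 << 9) };

struct HsaMemFlagsStub {
  struct { unsigned GTTAccess : 1; } ui32{};
};

// Stand-in for the hsaKmtAllocMemory() wrapper; just records the request.
static void* AllocateKfdMemoryStub(const HsaMemFlagsStub& f, uint32_t node_id, std::size_t size) {
  std::cout << "alloc node=" << node_id << " size=" << size
            << " GTTAccess=" << f.ui32.GTTAccess << "\n";
  return std::malloc(size);
}

static void* AllocateImplSketch(std::size_t size, uint32_t alloc_flags,
                                int agent_node_id, uint32_t owner_node_id) {
  HsaMemFlagsStub kmt_alloc_flags;
  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess) ? 1 : 0;

  // GTT-accessible system memory must be requested against the device node.
  const uint32_t node_id =
      (alloc_flags & AllocateGTTAccess) ? static_cast<uint32_t>(agent_node_id) : owner_node_id;
  return AllocateKfdMemoryStub(kmt_alloc_flags, node_id, size);
}

int main() {
  std::free(AllocateImplSketch(4096, AllocateGTTAccess, /*agent=*/1, /*owner=*/0));  // node 1
  std::free(AllocateImplSketch(4096, 0, /*agent=*/1, /*owner=*/0));                  // node 0
}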
src/core/runtime/runtime.cpp (6 additions, 6 deletions)

@@ -207,12 +207,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
   for (auto pool : system_regions_fine_) {
     if (pool->kernarg()) {
       system_allocator_ = [pool](size_t size, size_t alignment,
-                                 MemoryRegion::AllocateFlags alloc_flags) -> void* {
+                                 MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* {
         assert(alignment <= 4096);
         void* ptr = NULL;
         return (HSA_STATUS_SUCCESS ==
                 core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
-                                                                  &ptr))
+                                                                  &ptr, agent_node_id))
                    ? ptr
                    : NULL;
       };
@@ -310,9 +310,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
 
 hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
                                      MemoryRegion::AllocateFlags alloc_flags,
-                                     void** address) {
+                                     void** address, int agent_node_id) {
   size_t size_requested = size;  // region->Allocate(...) may align-up size to granularity
-  hsa_status_t status = region->Allocate(size, alloc_flags, address);
+  hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
   // Track the allocation result so that it could be freed properly.
   if (status == HSA_STATUS_SUCCESS) {
     ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
@@ -496,7 +496,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
      requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
      would remain mapped for the duration of the copy.
   */
-  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
+  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0);
   MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
   hsa_status_t err = src_agent->DmaCopy(temp, source, size);
   if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
@@ -2960,7 +2960,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
 
   ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
   void* thunk_handle;
-  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle);
+  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
   if (status == HSA_STATUS_SUCCESS) {
     memory_handle_map_.emplace(std::piecewise_construct,
                                std::forward_as_tuple(thunk_handle),
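Finally, a small sketch of the updated system_allocator_ contract: every caller now supplies a node id, and plain system allocations (the staging buffer in CopyMemory, the VMem handle path) pass 0. The lambda body here is an invented stand-in for Runtime::AllocateMemory:

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>

using AllocateFlags = unsigned;
constexpr AllocateFlags AllocateNoFlags = 0;

int main() {
  // Same four-argument shape the commit gives Runtime::system_allocator_.
  std::function<void*(std::size_t, std::size_t, AllocateFlags, int)> system_allocator =
      [](std::size_t size, std::size_t alignment, AllocateFlags /*flags*/,
         int agent_node_id) -> void* {
    assert(alignment <= 4096);  // mirrors the assert in RegisterAgent's lambda
    std::cout << "system alloc " << size << " bytes, node " << agent_node_id << "\n";
    return std::malloc(size);   // stand-in for AllocateMemory on the kernarg pool
  };

  // Node 0 preserves the old behavior; a device node would be passed when the
  // allocation carries AllocateGTTAccess.
  void* temp = system_allocator(4096, 0, AllocateNoFlags, 0);
  std::free(temp);
}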
src/core/runtime/signal.cpp (2 additions, 2 deletions)

@@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() {
   ScopedAcquire<HybridMutex> lock(&lock_);
   if (free_list_.empty()) {
     SharedSignal* block = reinterpret_cast<SharedSignal*>(
-        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
    if (block == nullptr) {
       block_size_ = minblock_;
       block = reinterpret_cast<SharedSignal*>(
-          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
       if (block == nullptr) throw std::bad_alloc();
     }
 
