ROCm 6.1.1 updates
dayatsin-amd committed May 8, 2024
1 parent 5e29915 commit 953b241
Showing 10 changed files with 69 additions and 28 deletions.
src/core/common/shared.cpp (1 addition, 1 deletion)

@@ -44,7 +44,7 @@
 
 namespace rocr {
 namespace core {
-std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
+std::function<void*(size_t, size_t, uint32_t, int)> BaseShared::allocate_ = nullptr;
 std::function<void(void*)> BaseShared::free_ = nullptr;
 }  // namespace core
 }  // namespace rocr
src/core/common/shared.h (32 additions, 4 deletions)

@@ -58,22 +58,34 @@ namespace core {
 class BaseShared {
  public:
   static void SetAllocateAndFree(
-      const std::function<void*(size_t, size_t, uint32_t)>& allocate,
+      const std::function<void*(size_t, size_t, uint32_t, int)>& allocate,
       const std::function<void(void*)>& free) {
     allocate_ = allocate;
     free_ = free;
   }
 
  protected:
-  static std::function<void*(size_t, size_t, uint32_t)> allocate_;
+  static std::function<void*(size_t, size_t, uint32_t, int)> allocate_;
   static std::function<void(void*)> free_;
 };
 
 /// @brief Default Allocator for Shared. Ensures allocations are whole pages.
 template <typename T> class PageAllocator : private BaseShared {
  public:
   __forceinline static T* alloc(int flags = 0) {
-    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags));
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0));
     if (ret == nullptr) throw std::bad_alloc();
 
     MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
 
     new (ret) T;
 
     throwGuard.Dismiss();
     return ret;
   }
 
+  __forceinline static T* alloc(int agent_node_id, int flags) {
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id));
+    if (ret == nullptr) throw std::bad_alloc();
+
+    MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
@@ -107,6 +119,16 @@ class Shared final : private BaseShared {
     shared_object_ = PageAllocator<T>::alloc(flags);
   }
 
+  explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) {
+    assert(allocate_ != nullptr && free_ != nullptr &&
+           "Shared object allocator is not set");
+
+    if (pool_)
+      shared_object_ = pool_->alloc();
+    else
+      shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
   ~Shared() {
     assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
 
@@ -147,6 +169,12 @@ template <typename T> class Shared<T, PageAllocator<T>> final : private BaseShared {
     shared_object_ = PageAllocator<T>::alloc(flags);
   }
 
+  Shared(int agent_node_id, int flags) {
+    assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
+
+    shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
   ~Shared() {
     assert(allocate_ != nullptr && free_ != nullptr &&
            "Shared object allocator is not set");
@@ -183,7 +211,7 @@ template <typename T, size_t Align> class SharedArray final : private BaseShared {
     static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)");
 
     shared_object_ =
-        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0));
+        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0));
     if (shared_object_ == nullptr) throw std::bad_alloc();
 
     size_t i = 0;
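For illustration, a minimal standalone C++ sketch of the pattern above: the shared-object allocator callback gains a trailing agent-node parameter, and a new alloc() overload forwards it while the legacy overload defaults to node 0. Everything here (the allocate_ stand-in, AlignUp, SharedQueueStub) is invented scaffolding, not the ROCR sources:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <new>

// Stand-in for BaseShared::allocate_, with the widened four-argument shape.
static std::function<void*(std::size_t, std::size_t, uint32_t, int)> allocate_ =
    [](std::size_t size, std::size_t align, uint32_t /*flags*/, int agent_node_id) -> void* {
  // A real build would route agent_node_id down to the KFD allocation call.
  std::cout << "allocate " << size << " bytes for node " << agent_node_id << "\n";
  return std::aligned_alloc(align, size);
};

static std::size_t AlignUp(std::size_t v, std::size_t a) { return (v + a - 1) & ~(a - 1); }

template <typename T> struct PageAllocator {
  // Legacy entry point: unchanged signature, node id defaults to 0.
  static T* alloc(int flags = 0) { return alloc(0, flags); }

  // New overload: the caller names the agent node that should back the pages.
  static T* alloc(int agent_node_id, int flags) {
    void* p = allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id);
    if (p == nullptr) throw std::bad_alloc();
    return new (p) T;  // placement-construct on the fresh pages
  }
};

int main() {
  struct SharedQueueStub { int id = 7; };
  SharedQueueStub* q = PageAllocator<SharedQueueStub>::alloc(/*agent_node_id=*/1, /*flags=*/0);
  std::cout << q->id << "\n";
  q->~SharedQueueStub();
  std::free(q);
}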
src/core/inc/amd_memory_region.h (2 additions, 2 deletions)

@@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion {
 
   ~MemoryRegion();
 
-  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const;
 
   hsa_status_t Free(void* address, size_t size) const;
 
@@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion {
                        const core::Runtime::LinkInfo& link_info) const;
 
   // Operational body for Allocate. Recursive.
-  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const;
 
   // Operational body for Free. Recursive.
   hsa_status_t FreeImpl(void* address, size_t size) const;
src/core/inc/memory_region.h (5 additions, 1 deletion)

@@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
     AllocateAsan = (1 << 6),        // ASAN - First page of allocation remapped to system memory
     AllocatePinned = (1 << 7),      // Currently treating Pinned memory as NoSubstitute
     AllocateMemoryOnly = (1 << 8),  // Memory only handle from thunk, no virtual address
+    // Flag to allocate system memory with GTT access.
+    // Note: the node_id must be the device's node_id even though this allocates
+    // system memory.
+    AllocateGTTAccess = (1 << 9),
   };
 
   typedef uint32_t AllocateFlags;
 
-  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0;
+  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0;
 
   virtual hsa_status_t Free(void* address, size_t size) const = 0;
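As a sketch of how the widened interface stays source-compatible: the pure virtual carries no default, while the AMD-side declaration (amd_memory_region.h above) supplies agent_node_id = 0. The mock below is simplified and hypothetical; the design point is that default arguments bind to the static type, so callers going through the base interface must pass the node explicitly:

#include <cstddef>
#include <iostream>

using hsa_status_t = int;  // simplified stand-ins for the real types
constexpr hsa_status_t HSA_STATUS_SUCCESS = 0;
using AllocateFlags = unsigned;

struct MemoryRegionBase {
  // The pure virtual gains the parameter with no default, as in memory_region.h.
  virtual hsa_status_t Allocate(std::size_t& size, AllocateFlags flags, void** address,
                                int agent_node_id) const = 0;
  virtual ~MemoryRegionBase() = default;
};

struct AmdMemoryRegionMock : MemoryRegionBase {
  // The derived declaration adds the backward-compatible default of 0.
  hsa_status_t Allocate(std::size_t& size, AllocateFlags /*flags*/, void** address,
                        int agent_node_id = 0) const override {
    std::cout << "Allocate " << size << " bytes against node " << agent_node_id << "\n";
    *address = nullptr;  // a mock: no real allocation happens here
    return HSA_STATUS_SUCCESS;
  }
};

int main() {
  AmdMemoryRegionMock region;
  std::size_t size = 4096;
  void* p = nullptr;
  region.Allocate(size, 0, &p);     // legacy call sites compile unchanged (node 0)
  region.Allocate(size, 0, &p, 2);  // new call sites pass the device node
}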
src/core/inc/queue.h (6 additions, 0 deletions)

@@ -162,6 +162,7 @@ struct SharedQueue {
 class LocalQueue {
  public:
   LocalQueue(int mem_flags) : local_queue_(mem_flags) {}
+  LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {}
   SharedQueue* queue() const { return local_queue_.shared_object(); }
 
  private:
@@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
     public_handle_ = Convert(this);
   }
 
+  Queue(int agent_node_id, int mem_flags = 0) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) {
+    queue()->core_queue = this;
+    public_handle_ = Convert(this);
+  }
+
   virtual ~Queue() {}
 
   virtual void Destroy() { delete this; }
src/core/inc/runtime.h (3 additions, 3 deletions)

@@ -198,7 +198,7 @@ class Runtime {
   /// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
   hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
                               MemoryRegion::AllocateFlags alloc_flags,
-                              void** address);
+                              void** address, int agent_node_id = 0);
 
   /// @brief Free memory previously allocated with AllocateMemory.
   ///
@@ -419,7 +419,7 @@ class Runtime {
 
   amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }
 
-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)>&
   system_allocator() {
     return system_allocator_;
   }
@@ -659,7 +659,7 @@ class Runtime {
   prefetch_map_t prefetch_map_;
 
   // Allocator using ::system_region_
-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)> system_allocator_;
 
   // Deallocator using ::system_region_
   std::function<void(void*)> system_deallocator_;
src/core/runtime/amd_aql_queue.cpp (1 addition, 1 deletion)

@@ -79,7 +79,7 @@ int AqlQueue::rtti_id_ = 0;
 
 AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
                    core::HsaEventCallback callback, void* err_data, bool is_kv)
-    : Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0),
+    : Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0),
       LocalSignal(0, false),
       DoorbellSignal(signal()),
       ring_buf_(nullptr),
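A hedged sketch of the constructor change: on MES hardware the queue's shared structures are now requested as GTT-accessible, non-paged system memory against the GPU's own node. GpuAgentStub and the AllocateNonPaged bit position are assumptions for the sketch; only the flag expression mirrors the diff:

#include <cstdint>
#include <iostream>

enum : uint32_t {
  AllocateNonPaged = (1 << 0),   // assumed bit position for this sketch
  AllocateGTTAccess = (1 << 9),  // bit position from memory_region.h above
};

struct GpuAgentStub {
  bool mes;
  int node;
  bool isMES() const { return mes; }
  int node_id() const { return node; }
};

// Mirrors the AqlQueue initializer: node id first, then MES-dependent flags.
static void MakeQueue(const GpuAgentStub& agent) {
  const uint32_t mem_flags =
      agent.isMES() ? (AllocateGTTAccess | AllocateNonPaged) : 0;
  std::cout << "Queue(node=" << agent.node_id() << ", mem_flags=0x"
            << std::hex << mem_flags << std::dec << ")\n";
}

int main() {
  MakeQueue({/*mes=*/true, /*node=*/1});   // MES: GTT-accessible + non-paged
  MakeQueue({/*mes=*/false, /*node=*/2});  // pre-MES: default flags
}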
src/core/runtime/amd_memory_region.cpp (11 additions, 8 deletions)

@@ -57,8 +57,7 @@ namespace AMD {
 // Tracks aggregate size of system memory available on platform
 size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
 
-void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
-                                      HSAuint32 node_id, size_t size) {
+void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
   void* ret = NULL;
   const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
   return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
@@ -168,13 +167,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
 
 MemoryRegion::~MemoryRegion() {}
 
-hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
+hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
   ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
-  return AllocateImpl(size, alloc_flags, address);
+  return AllocateImpl(size, alloc_flags, address, agent_node_id);
 }
 
 hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
-                                        void** address) const {
+                                        void** address, int agent_node_id) const {
   if (address == NULL) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
@@ -207,6 +206,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
   kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
   kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
 
+  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
+
   // Only allow using the suballocator for ordinary VRAM.
   if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
     bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
@@ -226,12 +227,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
     }
   }
 
+  const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
+
   // Allocate memory.
   // If it fails attempt to release memory from the block allocator and retry.
-  *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+  *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
   if (*address == nullptr) {
     owner()->Trim();
-    *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+    *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
   }
 
   if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
@@ -766,7 +769,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
   size_t bsize = AlignUp(request_size, block_size());
 
   hsa_status_t err = region_.AllocateImpl(
-      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
+      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0);
   if (err != HSA_STATUS_SUCCESS)
     throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
   assert(ret != nullptr && "Region returned nullptr on success.");
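To make the new control flow concrete, a standalone mock of the AllocateImpl changes: the GTTAccess bit is copied into the KMT flags, and the allocation targets the requesting agent's node only when AllocateGTTAccess is set, otherwise the owning agent's node as before. HsaMemFlagsStub and AllocateKfdMemoryStub are invented stand-ins for the thunk types:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>

enum : uint32_t { AllocateGTTAccess = (1 << 9) };

struct HsaMemFlagsStub {
  struct { unsigned GTTAccess : 1; } ui32{};
};

// Stand-in for the hsaKmtAllocMemory() wrapper; just records the request.
static void* AllocateKfdMemoryStub(const HsaMemFlagsStub& f, uint32_t node_id, std::size_t size) {
  std::cout << "alloc node=" << node_id << " size=" << size
            << " GTTAccess=" << f.ui32.GTTAccess << "\n";
  return std::malloc(size);
}

static void* AllocateImplSketch(std::size_t size, uint32_t alloc_flags,
                                int agent_node_id, uint32_t owner_node_id) {
  HsaMemFlagsStub kmt_alloc_flags;
  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess) ? 1 : 0;

  // GTT-accessible system memory must be requested against the device node.
  const uint32_t node_id =
      (alloc_flags & AllocateGTTAccess) ? static_cast<uint32_t>(agent_node_id) : owner_node_id;
  return AllocateKfdMemoryStub(kmt_alloc_flags, node_id, size);
}

int main() {
  std::free(AllocateImplSketch(4096, AllocateGTTAccess, /*agent=*/1, /*owner=*/0));  // node 1
  std::free(AllocateImplSketch(4096, 0, /*agent=*/1, /*owner=*/0));                  // node 0
}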
src/core/runtime/runtime.cpp (6 additions, 6 deletions)

@@ -207,12 +207,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
   for (auto pool : system_regions_fine_) {
     if (pool->kernarg()) {
       system_allocator_ = [pool](size_t size, size_t alignment,
-                                 MemoryRegion::AllocateFlags alloc_flags) -> void* {
+                                 MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* {
         assert(alignment <= 4096);
         void* ptr = NULL;
         return (HSA_STATUS_SUCCESS ==
                 core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
-                                                                  &ptr))
+                                                                  &ptr, agent_node_id))
                    ? ptr
                    : NULL;
       };
@@ -310,9 +310,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
 
 hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
                                      MemoryRegion::AllocateFlags alloc_flags,
-                                     void** address) {
+                                     void** address, int agent_node_id) {
   size_t size_requested = size;  // region->Allocate(...) may align-up size to granularity
-  hsa_status_t status = region->Allocate(size, alloc_flags, address);
+  hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
   // Track the allocation result so that it could be freed properly.
   if (status == HSA_STATUS_SUCCESS) {
     ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
@@ -496,7 +496,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
      requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
      would remain mapped for the duration of the copy.
   */
-  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
+  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0);
   MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
   hsa_status_t err = src_agent->DmaCopy(temp, source, size);
   if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
@@ -2960,7 +2960,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
 
   ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
   void* thunk_handle;
-  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle);
+  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
   if (status == HSA_STATUS_SUCCESS) {
     memory_handle_map_.emplace(std::piecewise_construct,
                                std::forward_as_tuple(thunk_handle),
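Finally, a small sketch of the updated system_allocator_ contract: every caller now supplies a node id, and plain system allocations (the staging buffer in CopyMemory, the VMem handle path) pass 0. The lambda body here is an invented stand-in for Runtime::AllocateMemory:

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>

using AllocateFlags = unsigned;
constexpr AllocateFlags AllocateNoFlags = 0;

int main() {
  // Same four-argument shape the commit gives Runtime::system_allocator_.
  std::function<void*(std::size_t, std::size_t, AllocateFlags, int)> system_allocator =
      [](std::size_t size, std::size_t alignment, AllocateFlags /*flags*/,
         int agent_node_id) -> void* {
    assert(alignment <= 4096);  // mirrors the assert in RegisterAgent's lambda
    std::cout << "system alloc " << size << " bytes, node " << agent_node_id << "\n";
    return std::malloc(size);   // stand-in for AllocateMemory on the kernarg pool
  };

  // Node 0 preserves the old behavior; a device node would be passed when the
  // allocation carries AllocateGTTAccess.
  void* temp = system_allocator(4096, 0, AllocateNoFlags, 0);
  std::free(temp);
}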
src/core/runtime/signal.cpp (2 additions, 2 deletions)

@@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() {
   ScopedAcquire<HybridMutex> lock(&lock_);
   if (free_list_.empty()) {
     SharedSignal* block = reinterpret_cast<SharedSignal*>(
-        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
    if (block == nullptr) {
       block_size_ = minblock_;
       block = reinterpret_cast<SharedSignal*>(
-          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
       if (block == nullptr) throw std::bad_alloc();
     }
 
