diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h
index 6fd8be666..c7791ad29 100644
--- a/src/core/inc/amd_gpu_agent.h
+++ b/src/core/inc/amd_gpu_agent.h
@@ -129,10 +129,7 @@ class GpuAgentInt : public core::Agent {
   //
   // @param [in] signal Pointer to signal that provides the async copy timing.
   // @param [out] time Structure to be populated with the host domain value.
-  virtual void TranslateTime(core::Signal* signal,
-                             hsa_amd_profiling_async_copy_time_t& time) {
-    return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
-  }
+  virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;
 
   // @brief Translate timestamp agent domain to host domain.
   //
@@ -248,9 +245,6 @@ class GpuAgent : public GpuAgentInt {
   // @brief Override from core::Agent.
   hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
 
-  // @brief Get the next available end timestamp object.
-  uint64_t* ObtainEndTsObject();
-
   // @brief Override from core::Agent.
   hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
 
@@ -284,6 +278,9 @@ class GpuAgent : public GpuAgentInt {
   void TranslateTime(core::Signal* signal,
                      hsa_amd_profiling_dispatch_time_t& time) override;
 
+  // @brief Override from amd::GpuAgentInt.
+  void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;
+
   // @brief Override from amd::GpuAgentInt.
   uint64_t TranslateTime(uint64_t tick) override;
 
@@ -490,9 +487,6 @@ class GpuAgent : public GpuAgentInt {
 
   // @brief Create internal queues and blits.
   void InitDma();
-  // @brief Initialize memory pool for end timestamp object.
-  // @retval True if the memory pool for end timestamp object is initialized.
-  bool InitEndTsPool();
 
   // Bind index of peer device that is connected via xGMI links
   lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
 
@@ -503,23 +497,12 @@ class GpuAgent : public GpuAgentInt {
 
   // Bind the Blit object that will drive the copy operation
   lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent,
                                       const core::Agent& src_agent);
-
   // @brief Alternative aperture base address. Only on KV.
   uintptr_t ape1_base_;
 
   // @brief Alternative aperture size. Only on KV.
   size_t ape1_size_;
 
-  // Each end ts is 32 bytes.
-  static const size_t kTsSize = 32;
-
-  // Number of element in the pool.
-  uint32_t end_ts_pool_size_;
-
-  std::atomic<uint32_t> end_ts_pool_counter_;
-
-  std::atomic<uint64_t*> end_ts_base_addr_;
-
   DISALLOW_COPY_AND_ASSIGN(GpuAgent);
 };
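Note on the header change: the removed default implementation simply reinterpreted `hsa_amd_profiling_async_copy_time_t&` as `hsa_amd_profiling_dispatch_time_t&`, which only worked because both public structs carry the same pair of system-domain ticks. With SDMA copy timestamps now stored outside `amd_signal_t`, the async-copy variant needs its own translation path, hence the pure-virtual declaration plus the `GpuAgent` override. For reference, the two public structs are the following (field comments are mine):

```cpp
#include <cstdint>  // uint64_t

// As declared in hsa_ext_amd.h: both structs are a pair of system-domain timestamps,
// which is why the old default could get away with a cast between them.
typedef struct hsa_amd_profiling_dispatch_time_s {
  uint64_t start;  // dispatch begin time, system domain
  uint64_t end;    // dispatch end time, system domain
} hsa_amd_profiling_dispatch_time_t;

typedef struct hsa_amd_profiling_async_copy_time_s {
  uint64_t start;  // copy begin time, system domain
  uint64_t end;    // copy end time, system domain
} hsa_amd_profiling_async_copy_time_t;
```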
diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h
index 82ca6d193..24dd36acc 100644
--- a/src/core/inc/signal.h
+++ b/src/core/inc/signal.h
@@ -82,8 +82,12 @@ class Signal;
 /// @brief ABI and object conversion struct for signals. May be shared between processes.
 struct SharedSignal {
   amd_signal_t amd_signal;
+  uint64_t sdma_start_ts;
   Signal* core_signal;
   Check<0x71FCCA6A3D5D5276, true> id;
+  uint8_t reserved[8];
+  uint64_t sdma_end_ts;
+  uint8_t reserved2[24];
 
   SharedSignal() {
     memset(&amd_signal, 0, sizeof(amd_signal));
@@ -95,6 +99,39 @@ struct SharedSignal {
 
   bool IsIPC() const { return core_signal == nullptr; }
 
+  void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
+    /*
+    SDMA timestamps on gfx7xx/gfx8xx require 32 byte alignment (gfx9xx relaxes the
+    alignment to 8 bytes).  This conflicts with the frozen format of amd_signal_t,
+    so we place the timestamps in sdma_start_ts/sdma_end_ts instead (amd_signal.start_ts
+    is itself properly aligned).  Reading of the timestamps occurs in GetRawTs().
+    */
+    start = &sdma_start_ts;
+    end = &sdma_end_ts;
+  }
+
+  void CopyPrep() {
+    // Clear the SDMA timestamps before a copy so GetRawTs() can detect whether the
+    // copy was done via SDMA or via the blit kernel.
+    sdma_start_ts = 0;
+    sdma_end_ts = 0;
+  }
+
+  void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
+    /*
+    If the read is for a copy we need to check whether it was done by the blit kernel or
+    by SDMA.  CopyPrep() clears sdma_start_ts/sdma_end_ts, so a non-zero sdma_end_ts means
+    the copy went through SDMA; otherwise return the compute kernel stamps from amd_signal.
+    */
+    if (FetchCopyTs && sdma_end_ts != 0) {
+      start = sdma_start_ts;
+      end = sdma_end_ts;
+      return;
+    }
+    start = amd_signal.start_ts;
+    end = amd_signal.end_ts;
+  }
+
   static __forceinline SharedSignal* Convert(hsa_signal_t signal) {
     SharedSignal* ret = reinterpret_cast<SharedSignal*>(static_cast<uintptr_t>(signal.handle) -
                                                         offsetof(SharedSignal, amd_signal));
@@ -112,6 +149,12 @@ static_assert(std::is_standard_layout<SharedSignal>::value,
               "SharedSignal must remain standard layout for IPC use.");
 static_assert(std::is_trivially_destructible<SharedSignal>::value,
               "SharedSignal must not be modified on delete for IPC use.");
+static_assert((offsetof(SharedSignal, sdma_start_ts) % 32) == 0,
+              "Bad SDMA time stamp alignment.");
+static_assert((offsetof(SharedSignal, sdma_end_ts) % 32) == 0,
+              "Bad SDMA time stamp alignment.");
+static_assert(sizeof(SharedSignal) == 128,
+              "Bad SharedSignal size.");
 
 /// @brief Pool class for SharedSignal suitable for use with Shared.
 class SharedSignalPool_t : private BaseShared {
@@ -318,12 +361,23 @@ class Signal {
   /// @brief Checks if signal is currently in use by a wait API.
   bool InWaiting() const { return waiting_ != 0; }
 
+  // Prep for copy profiling: store the copy agent and ready the ABI block.
   __forceinline void async_copy_agent(core::Agent* agent) {
     async_copy_agent_ = agent;
+    core::SharedSignal::Convert(Convert(this))->CopyPrep();
   }
 
   __forceinline core::Agent* async_copy_agent() { return async_copy_agent_; }
 
+  void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
+    core::SharedSignal::Convert(Convert(this))->GetSdmaTsAddresses(start, end);
+  }
+
+  // Set FetchCopyTs = true when reading timestamps from a copy operation.
+  void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
+    core::SharedSignal::Convert(Convert(this))->GetRawTs(FetchCopyTs, start, end);
+  }
+
   /// @brief Structure which defines key signal elements like type and value.
   /// Address of this struct is used as a value for the opaque handle of type
   /// hsa_signal_t provided to the public API.
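The new static_asserts pin the SDMA timestamp slots to 32-byte boundaries without touching the frozen `amd_signal_t` layout. Below is a minimal standalone sketch of that arithmetic; `MockSharedSignal` and `kAmdSignalSize` are illustrative stand-ins, not runtime types, and the sketch assumes `sizeof(amd_signal_t) == 64` (which the 128-byte size assert implies) plus 64-bit pointers:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kAmdSignalSize = 64;  // assumed sizeof(amd_signal_t)

struct MockSharedSignal {
  uint8_t amd_signal[kAmdSignalSize];  // stand-in for amd_signal_t, offsets 0..63
  uint64_t sdma_start_ts;              // offset 64 -> 32-byte aligned
  void* core_signal;                   // offset 72
  uint64_t id;                         // stand-in for Check<...>, offset 80
  uint8_t reserved[8];                 // offset 88, pads sdma_end_ts out to 96
  uint64_t sdma_end_ts;                // offset 96 -> 32-byte aligned
  uint8_t reserved2[24];               // pads the struct out to 128 bytes
};

static_assert(offsetof(MockSharedSignal, sdma_start_ts) % 32 == 0, "start ts misaligned");
static_assert(offsetof(MockSharedSignal, sdma_end_ts) % 32 == 0, "end ts misaligned");
static_assert(sizeof(MockSharedSignal) == 128, "unexpected size");

int main() {
  std::printf("sdma_start_ts offset: %zu\n", offsetof(MockSharedSignal, sdma_start_ts));
  std::printf("sdma_end_ts offset:   %zu\n", offsetof(MockSharedSignal, sdma_end_ts));
}
```

The reserved padding is what keeps `sdma_end_ts` on a 32-byte boundary while leaving room for future fields without breaking the shared (IPC) layout.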
diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp
index 06f19c5cb..f7b5653b2 100644
--- a/src/core/runtime/amd_blit_kernel.cpp
+++ b/src/core/runtime/amd_blit_kernel.cpp
@@ -625,9 +625,8 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
   // Insert barrier packets to handle dependent signals.
   // Barrier bit keeps signal checking traffic from competing with a copy.
   const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
-      (1 << HSA_PACKET_HEADER_BARRIER) |
       (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
-      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+      (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
   hsa_barrier_and_packet_t barrier_packet = {0};
   barrier_packet.header = HSA_PACKET_TYPE_INVALID;
@@ -807,7 +806,6 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
   static const uint16_t kDispatchPacketHeader =
       (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-      (((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp
index 8bdd7b279..2a189de89 100644
--- a/src/core/runtime/amd_blit_sdma.cpp
+++ b/src/core/runtime/amd_blit_sdma.cpp
@@ -141,8 +141,8 @@ hsa_status_t BlitSdma::Initial
     platform_atomic_support_ = link.info.atomic_support_64bit;
   }
 
-  // Determine if sDMA microcode supports HDP flush command
-  if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
+  // HDP flush is supported on gfx900 and later.
+  if (agent_->isa()->GetMajorVersion() > 8) {
     hdp_flush_support_ = true;
   }
 
@@ -248,22 +248,13 @@ hsa_status_t BlitSdma::SubmitC
   // profiling in the middle of the call.
   const bool profiling_enabled = agent_->profiling_enabled();
 
-  uint64_t* end_ts_addr = NULL;
+  uint64_t* start_ts_addr = nullptr;
+  uint64_t* end_ts_addr = nullptr;
   uint32_t total_timestamp_command_size = 0;
   if (profiling_enabled) {
-    // SDMA timestamp packet requires 32 byte of aligned memory, but
-    // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
-    // read from a 32 byte aligned bounce buffer is required to avoid changing
-    // the amd_signal_t ABI.
-    end_ts_addr = agent_->ObtainEndTsObject();
-    if (end_ts_addr == NULL) {
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-
-    total_timestamp_command_size =
-        (2 * timestamp_command_size_) + linear_copy_command_size_;
+    out_signal.GetSdmaTsAddresses(start_ts_addr, end_ts_addr);
+    total_timestamp_command_size = 2 * timestamp_command_size_;
   }
 
   // On agent that does not support platform atomic, we replace it with
@@ -315,8 +306,7 @@ hsa_status_t BlitSdma::SubmitC
   }
 
   if (profiling_enabled) {
-    BuildGetGlobalTimestampCommand(
-        command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
+    BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(start_ts_addr));
     command_addr += timestamp_command_size_;
   }
 
@@ -337,11 +327,6 @@ hsa_status_t BlitSdma::SubmitC
     BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(end_ts_addr));
     command_addr += timestamp_command_size_;
-
-    BuildCopyCommand(command_addr, 1,
-                     reinterpret_cast<void*>(&out_signal.signal_.end_ts),
-                     reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
-    command_addr += linear_copy_command_size_;
   }
 
   // After transfer is completed, decrement the signal value.
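With the SDMA engine now writing its timestamps straight into the signal's `sdma_start_ts`/`sdma_end_ts` slots, the 32-byte bounce buffer and the trailing copy packet disappear, and a profiled copy needs only two timestamp commands. The sketch below only illustrates the alignment rule the new layout satisfies (32 bytes on gfx7xx/gfx8xx, 8 bytes on gfx9xx and later, per the comment added in signal.h); `CheckTsAlignment` and its `isa_major` parameter are hypothetical names, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Returns true when ts_addr meets the SDMA timestamp-write alignment rule for the
// given ISA major version (illustrative helper only).
bool CheckTsAlignment(const uint64_t* ts_addr, int isa_major) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ts_addr);
  const uintptr_t required = (isa_major >= 9) ? 8u : 32u;
  return (addr % required) == 0;
}

int main() {
  alignas(32) static uint64_t stamps[2];     // mirrors the sdma_start_ts / sdma_end_ts placement
  assert(CheckTsAlignment(&stamps[0], 8));   // gfx8: needs the 32-byte aligned slot
  assert(CheckTsAlignment(&stamps[1], 9));   // gfx9: 8-byte alignment is enough
  return 0;
}
```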
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 4d2d23b35..5b27486dd 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -86,10 +86,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
       memory_bus_width_(0),
       memory_max_frequency_(0),
       ape1_base_(0),
-      ape1_size_(0),
-      end_ts_pool_size_(0),
-      end_ts_pool_counter_(0),
-      end_ts_base_addr_(NULL) {
+      ape1_size_(0) {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
 
@@ -144,10 +141,6 @@ GpuAgent::~GpuAgent() {
     }
   }
 
-  if (end_ts_base_addr_ != NULL) {
-    core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
-  }
-
   if (ape1_base_ != 0) {
     _aligned_free(reinterpret_cast<void*>(ape1_base_));
   }
@@ -405,58 +398,6 @@ void GpuAgent::InitCacheList() {
                                      cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
 }
 
-bool GpuAgent::InitEndTsPool() {
-  if (HSA_PROFILE_FULL == profile_) {
-    return true;
-  }
-
-  if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
-    return true;
-  }
-
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
-
-  if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
-    return true;
-  }
-
-  end_ts_pool_size_ =
-      static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
-                            (BlitSdmaBase::kCopyPacketSize));
-
-  // Allocate end timestamp object for both h2d and d2h DMA.
-  const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
-
-  core::Runtime* runtime = core::Runtime::runtime_singleton_;
-
-  uint64_t* buff = NULL;
-  if (HSA_STATUS_SUCCESS !=
-      runtime->AllocateMemory(local_region_, alloc_size,
-                              MemoryRegion::AllocateRestrict,
-                              reinterpret_cast<void**>(&buff))) {
-    return false;
-  }
-
-  end_ts_base_addr_.store(buff, std::memory_order_release);
-
-  return true;
-}
-
-uint64_t* GpuAgent::ObtainEndTsObject() {
-  if (end_ts_base_addr_ == NULL) {
-    return NULL;
-  }
-
-  const uint32_t end_ts_index =
-      end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
-      end_ts_pool_size_;
-  const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
-  uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
-  assert(IsMultipleOf(end_ts_addr, kTsSize));
-
-  return end_ts_addr;
-}
-
 hsa_status_t GpuAgent::IterateRegion(
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
@@ -701,10 +642,6 @@ hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
 }
 
 hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
-  if (enable && !InitEndTsPool()) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
   for (auto& blit : blits_) {
     if (blit.created()) {
       const hsa_status_t stat = blit->EnableProfiling(enable);
@@ -1099,16 +1036,28 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
 
 void GpuAgent::TranslateTime(core::Signal* signal,
                              hsa_amd_profiling_dispatch_time_t& time) {
+  uint64_t start, end;
+  signal->GetRawTs(false, start, end);
   // Order is important, we want to translate the end time first to ensure that packet duration is
   // not impacted by clock measurement latency jitter.
-  time.end = TranslateTime(signal->signal_.end_ts);
-  time.start = TranslateTime(signal->signal_.start_ts);
-
-  if ((signal->signal_.start_ts == 0) || (signal->signal_.end_ts == 0) ||
-      (signal->signal_.start_ts > t1_.GPUClockCounter) ||
-      (signal->signal_.end_ts > t1_.GPUClockCounter) ||
-      (signal->signal_.start_ts < t0_.GPUClockCounter) ||
-      (signal->signal_.end_ts < t0_.GPUClockCounter))
+  time.end = TranslateTime(end);
+  time.start = TranslateTime(start);
+
+  if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
+      (start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
+    debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
+}
+
+void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) {
+  uint64_t start, end;
+  signal->GetRawTs(true, start, end);
+  // Order is important, we want to translate the end time first to ensure that packet duration is
+  // not impacted by clock measurement latency jitter.
+  time.end = TranslateTime(end);
+  time.start = TranslateTime(start);
+
+  if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
+      (start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
     debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
 }
 
@@ -1215,11 +1164,6 @@ void GpuAgent::BindTrapHandler() {
     return;
   }
 
-  // Disable trap handler on APUs until KFD is fixed.
-  if (profile_ == HSA_PROFILE_FULL) {
-    return;
-  }
-
   // Assemble the trap handler source code.
   AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_,
                  trap_code_buf_size_);
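Both `TranslateTime` overloads now read the raw stamps through `Signal::GetRawTs()` and convert them from GPU ticks to the host clock domain using the calibration pair `t0_`/`t1_`, translating the end stamp first so measurement jitter cannot inflate the reported duration. The snippet below sketches the underlying linear mapping; `ClockPair` and `Translate` are illustrative names and this is not the runtime's implementation of `GpuAgent::TranslateTime(uint64_t)`:

```cpp
#include <cstdint>

struct ClockPair {
  uint64_t gpu;     // GPU-domain tick    (cf. t0_/t1_.GPUClockCounter)
  uint64_t system;  // system-domain tick (cf. t0_/t1_.SystemClockCounter)
};

// Linearly map a GPU tick into the system domain using two calibration samples.
uint64_t Translate(uint64_t gpu_tick, const ClockPair& t0, const ClockPair& t1) {
  const double ratio = double(t1.system - t0.system) / double(t1.gpu - t0.gpu);
  return t0.system + uint64_t(ratio * double(gpu_tick - t0.gpu));
}

int main() {
  const ClockPair t0{1000, 5000}, t1{2000, 7000};   // made-up calibration samples
  // Halfway between the GPU samples lands halfway between the system samples.
  return Translate(1500, t0, t1) == 6000 ? 0 : 1;
}
```

The validity check against `t0_.GPUClockCounter`/`t1_.GPUClockCounter` in both overloads flags stamps that fall outside the calibration window, since extrapolating far beyond the sampled interval would amplify clock drift.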
diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp
index 249736dd7..350a214ca 100644
--- a/src/core/runtime/default_signal.cpp
+++ b/src/core/runtime/default_signal.cpp
@@ -89,6 +89,10 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
   timer::fast_clock::time_point start_time, time;
   start_time = timer::fast_clock::now();
 
+  // Set a polling timeout value.
+  // Should be a few times larger than the null-kernel latency.
+  const timer::fast_clock::duration kMaxElapsed = std::chrono::microseconds(200);
+
   uint64_t hsa_freq;
   HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq);
   const timer::fast_clock::duration fast_timeout =
@@ -127,7 +131,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
       value = atomic::Load(&signal_.value, std::memory_order_relaxed);
       return hsa_signal_value_t(value);
     }
-    os::uSleep(20);
+    if (time - start_time > kMaxElapsed) {
+      os::uSleep(20);
+    }
   }
 }
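The `BusyWaitSignal` change polls without sleeping for the first 200 microseconds (`kMaxElapsed`) and only then starts yielding with `os::uSleep(20)`, which keeps short waits at minimum latency while still backing off for long ones. A generic sketch of that spin-then-sleep shape, with names that are not from the runtime:

```cpp
#include <chrono>
#include <thread>

// Poll `done` aggressively for `spin_budget`, then sleep between polls until `timeout`.
template <typename Predicate>
bool SpinThenSleepWait(Predicate done, std::chrono::microseconds spin_budget,
                       std::chrono::milliseconds timeout) {
  using clock = std::chrono::steady_clock;
  const auto start = clock::now();
  while (!done()) {
    const auto elapsed = clock::now() - start;
    if (elapsed > timeout) return false;  // give up, caller handles the timeout
    if (elapsed > spin_budget)
      std::this_thread::sleep_for(std::chrono::microseconds(20));  // analogous to os::uSleep(20)
    // else: keep spinning to catch short waits with minimum latency
  }
  return true;
}
```

Usage would look like `SpinThenSleepWait([&] { return flag.load(); }, std::chrono::microseconds(200), std::chrono::milliseconds(5));` the 200 microsecond budget mirrors `kMaxElapsed` and should be tuned to the platform's null-kernel latency, as the added comment suggests.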
diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp
index 5fd678784..df93bf27d 100644
--- a/src/core/runtime/hsa_ext_amd.cpp
+++ b/src/core/runtime/hsa_ext_amd.cpp
@@ -382,20 +382,10 @@ hsa_status_t hsa_amd_profiling_get_async_copy_time(
 
   core::Agent* agent = signal->async_copy_agent();
 
-  if (agent == NULL) {
+  if (agent == nullptr) {
     return HSA_STATUS_ERROR;
   }
 
-  // Validate the embedded agent pointer if the signal is IPC.
-  if (signal->isIPC()) {
-    for (auto it : core::Runtime::runtime_singleton_->gpu_agents()) {
-      if (it == agent) break;
-    }
-    // If the agent isn't a GPU then it is from a different process or it's a CPU.
-    // Assume it's a CPU and illegal uses will generate garbage data same as kernel completion.
-    agent = core::Runtime::runtime_singleton_->cpu_agents()[0];
-  }
-
   if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) {
     // Translate timestamp from GPU to system domain.
     static_cast<amd::GpuAgentInt*>(agent)->TranslateTime(signal, *time);
diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp
index 488c44083..5940a51af 100644
--- a/src/core/runtime/runtime.cpp
+++ b/src/core/runtime/runtime.cpp
@@ -471,6 +471,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
   // For cpu to cpu, fire and forget a copy thread.
   const bool profiling_enabled =
       (dst_agent.profiling_enabled() || src_agent.profiling_enabled());
+  if (profiling_enabled) completion_signal.async_copy_agent(&dst_agent);
   std::thread(
       [](void* dst, const void* src, size_t size, std::vector<core::Signal*> dep_signals,
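With the IPC special case gone, `hsa_amd_profiling_get_async_copy_time` simply trusts the agent recorded by `Signal::async_copy_agent()`, which `Runtime::CopyMemory` now also sets for profiled CPU-to-CPU copies. From an application's point of view the flow is unchanged; below is a hedged usage sketch, assuming `hsa_init` has already run, that `dst`/`src`/`dst_agent`/`src_agent` describe an already prepared copy, and that the ROCm header layout is `<hsa/hsa.h>` (adjust to your install):

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

hsa_status_t TimeOneCopy(void* dst, hsa_agent_t dst_agent,
                         const void* src, hsa_agent_t src_agent, size_t size) {
  hsa_signal_t copy_done;
  hsa_status_t err = hsa_signal_create(1, 0, nullptr, &copy_done);
  if (err != HSA_STATUS_SUCCESS) return err;

  // Must be enabled before the copy is submitted or no timestamps are collected.
  hsa_amd_profiling_async_copy_enable(true);

  err = hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size,
                                  0, nullptr, copy_done);
  if (err == HSA_STATUS_SUCCESS) {
    hsa_signal_wait_scacquire(copy_done, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                              HSA_WAIT_STATE_BLOCKED);

    // With this patch the stamps come from the signal's SDMA slots (or from
    // amd_signal for blit-kernel copies), already translated to the system domain.
    hsa_amd_profiling_async_copy_time_t t = {};
    err = hsa_amd_profiling_get_async_copy_time(copy_done, &t);
  }

  hsa_signal_destroy(copy_done);
  return err;
}
```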