Skip to content

Commit

Permalink
Merge pull request #76 from RadeonOpenCompute/roc-2.9.x
Browse files Browse the repository at this point in the history
ROCm 2.9.0 updates
  • Loading branch information
skeelyamd authored Oct 4, 2019
2 parents f446e05 + 8336f95 commit b507d85
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 135 deletions.
25 changes: 4 additions & 21 deletions src/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,7 @@ class GpuAgentInt : public core::Agent {
//
// @param [in] signal Pointer to signal that provides the async copy timing.
// @param [out] time Structure to be populated with the host domain value.
virtual void TranslateTime(core::Signal* signal,
hsa_amd_profiling_async_copy_time_t& time) {
return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
}
virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;

// @brief Translate timestamp agent domain to host domain.
//
Expand Down Expand Up @@ -248,9 +245,6 @@ class GpuAgent : public GpuAgentInt {
// @brief Override from core::Agent.
hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;

// @brief Get the next available end timestamp object.
uint64_t* ObtainEndTsObject();

// @brief Override from core::Agent.
hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;

Expand Down Expand Up @@ -284,6 +278,9 @@ class GpuAgent : public GpuAgentInt {
void TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) override;

// @brief Override from amd::GpuAgentInt.
void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;

// @brief Override from amd::GpuAgentInt.
uint64_t TranslateTime(uint64_t tick) override;

Expand Down Expand Up @@ -490,9 +487,6 @@ class GpuAgent : public GpuAgentInt {
// @brief Create internal queues and blits.
void InitDma();

// @brief Initialize memory pool for end timestamp object.
// @retval True if the memory pool for end timestamp object is initialized.
bool InitEndTsPool();

// Bind index of peer device that is connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
Expand All @@ -503,23 +497,12 @@ class GpuAgent : public GpuAgentInt {

// Bind the Blit object that will drive the copy operation
lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent);

// @brief Alternative aperture base address. Only on KV.
uintptr_t ape1_base_;

// @brief Alternative aperture size. Only on KV.
size_t ape1_size_;

// Each end ts is 32 bytes.
static const size_t kTsSize = 32;

// Number of element in the pool.
uint32_t end_ts_pool_size_;

std::atomic<uint32_t> end_ts_pool_counter_;

std::atomic<uint64_t*> end_ts_base_addr_;

DISALLOW_COPY_AND_ASSIGN(GpuAgent);
};

Expand Down
54 changes: 54 additions & 0 deletions src/core/inc/signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,12 @@ class Signal;
/// @brief ABI and object conversion struct for signals. May be shared between processes.
struct SharedSignal {
amd_signal_t amd_signal;
uint64_t sdma_start_ts;
Signal* core_signal;
Check<0x71FCCA6A3D5D5276, true> id;
uint8_t reserved[8];
uint64_t sdma_end_ts;
uint8_t reserved2[24];

SharedSignal() {
memset(&amd_signal, 0, sizeof(amd_signal));
Expand All @@ -95,6 +99,39 @@ struct SharedSignal {

bool IsIPC() const { return core_signal == nullptr; }

void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
/*
SDMA timestamps on gfx7xx/8xxx require 32 byte alignment (gfx9xx relaxes
alignment to 8 bytes). This conflicts with the frozen format for amd_signal_t
so we place the time stamps in sdma_start/end_ts instead (amd_signal.start_ts
is also properly aligned). Reading of the timestamps occurs in GetRawTs().
*/
start = &sdma_start_ts;
end = &sdma_end_ts;
}

void CopyPrep() {
// Clear sdma_end_ts before a copy so we can detect if the copy was done via
// SDMA or blit kernel.
sdma_start_ts = 0;
sdma_end_ts = 0;
}

void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
/*
If the read is for a copy we need to check if it was done by blit kernel or SDMA.
Since we clear sdma_start/end_ts during CopyPrep we know it was a SDMA copy if one
of those is non-zero. Otherwise return compute kernel stamps from amd_signal.
*/
if (FetchCopyTs && sdma_end_ts != 0) {
start = sdma_start_ts;
end = sdma_end_ts;
return;
}
start = amd_signal.start_ts;
end = amd_signal.end_ts;
}

static __forceinline SharedSignal* Convert(hsa_signal_t signal) {
SharedSignal* ret = reinterpret_cast<SharedSignal*>(static_cast<uintptr_t>(signal.handle) -
offsetof(SharedSignal, amd_signal));
Expand All @@ -112,6 +149,12 @@ static_assert(std::is_standard_layout<SharedSignal>::value,
"SharedSignal must remain standard layout for IPC use.");
static_assert(std::is_trivially_destructible<SharedSignal>::value,
"SharedSignal must not be modified on delete for IPC use.");
static_assert((offsetof(SharedSignal, sdma_start_ts) % 32) == 0,
"Bad SDMA time stamp alignment.");
static_assert((offsetof(SharedSignal, sdma_end_ts) % 32) == 0,
"Bad SDMA time stamp alignment.");
static_assert(sizeof(SharedSignal) == 128,
"Bad SharedSignal size.");

/// @brief Pool class for SharedSignal suitable for use with Shared.
class SharedSignalPool_t : private BaseShared {
Expand Down Expand Up @@ -318,12 +361,23 @@ class Signal {
/// @brief Checks if signal is currently in use by a wait API.
bool InWaiting() const { return waiting_ != 0; }

// Prep for copy profiling. Store copy agent and ready API block.
__forceinline void async_copy_agent(core::Agent* agent) {
async_copy_agent_ = agent;
core::SharedSignal::Convert(Convert(this))->CopyPrep();
}

__forceinline core::Agent* async_copy_agent() { return async_copy_agent_; }

void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
core::SharedSignal::Convert(Convert(this))->GetSdmaTsAddresses(start, end);
}

// Set FetchCopyTs = true when reading time stamps from a copy operation.
void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
core::SharedSignal::Convert(Convert(this))->GetRawTs(FetchCopyTs, start, end);
}

/// @brief Structure which defines key signal elements like type and value.
/// Address of this struct is used as a value for the opaque handle of type
/// hsa_signal_t provided to the public API.
Expand Down
4 changes: 1 addition & 3 deletions src/core/runtime/amd_blit_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -625,9 +625,8 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
// Insert barrier packets to handle dependent signals.
// Barrier bit keeps signal checking traffic from competing with a copy.
const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

hsa_barrier_and_packet_t barrier_packet = {0};
barrier_packet.header = HSA_PACKET_TYPE_INVALID;
Expand Down Expand Up @@ -807,7 +806,6 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,

static const uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

Expand Down
29 changes: 7 additions & 22 deletions src/core/runtime/amd_blit_sdma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
platform_atomic_support_ = link.info.atomic_support_64bit;
}

// Determine if sDMA microcode supports HDP flush command
if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
// HDP flush supported on gfx900 and forward.
if (agent_->isa()->GetMajorVersion() > 8) {
hdp_flush_support_ = true;
}

Expand Down Expand Up @@ -248,22 +248,13 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
// profiling in the middle of the call.
const bool profiling_enabled = agent_->profiling_enabled();

uint64_t* end_ts_addr = NULL;
uint64_t* start_ts_addr = nullptr;
uint64_t* end_ts_addr = nullptr;
uint32_t total_timestamp_command_size = 0;

if (profiling_enabled) {
// SDMA timestamp packet requires 32 byte of aligned memory, but
// amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
// read from a 32 byte aligned bounce buffer is required to avoid changing
// the amd_signal_t ABI.

end_ts_addr = agent_->ObtainEndTsObject();
if (end_ts_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

total_timestamp_command_size =
(2 * timestamp_command_size_) + linear_copy_command_size_;
out_signal.GetSdmaTsAddresses(start_ts_addr, end_ts_addr);
total_timestamp_command_size = 2 * timestamp_command_size_;
}

// On agent that does not support platform atomic, we replace it with
Expand Down Expand Up @@ -315,8 +306,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
}

if (profiling_enabled) {
BuildGetGlobalTimestampCommand(
command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(start_ts_addr));
command_addr += timestamp_command_size_;
}

Expand All @@ -337,11 +327,6 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
BuildGetGlobalTimestampCommand(command_addr,
reinterpret_cast<void*>(end_ts_addr));
command_addr += timestamp_command_size_;

BuildCopyCommand(command_addr, 1,
reinterpret_cast<void*>(&out_signal.signal_.end_ts),
reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
command_addr += linear_copy_command_size_;
}

// After transfer is completed, decrement the signal value.
Expand Down
98 changes: 21 additions & 77 deletions src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
memory_bus_width_(0),
memory_max_frequency_(0),
ape1_base_(0),
ape1_size_(0),
end_ts_pool_size_(0),
end_ts_pool_counter_(0),
end_ts_base_addr_(NULL) {
ape1_size_(0) {
const bool is_apu_node = (properties_.NumCPUCores > 0);
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;

Expand Down Expand Up @@ -144,10 +141,6 @@ GpuAgent::~GpuAgent() {
}
}

if (end_ts_base_addr_ != NULL) {
core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
}

if (ape1_base_ != 0) {
_aligned_free(reinterpret_cast<void*>(ape1_base_));
}
Expand Down Expand Up @@ -405,58 +398,6 @@ void GpuAgent::InitCacheList() {
cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
}

bool GpuAgent::InitEndTsPool() {
if (HSA_PROFILE_FULL == profile_) {
return true;
}

if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
return true;
}

ScopedAcquire<KernelMutex> lock(&blit_lock_);

if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
return true;
}

end_ts_pool_size_ =
static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
(BlitSdmaBase::kCopyPacketSize));

// Allocate end timestamp object for both h2d and d2h DMA.
const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;

core::Runtime* runtime = core::Runtime::runtime_singleton_;

uint64_t* buff = NULL;
if (HSA_STATUS_SUCCESS !=
runtime->AllocateMemory(local_region_, alloc_size,
MemoryRegion::AllocateRestrict,
reinterpret_cast<void**>(&buff))) {
return false;
}

end_ts_base_addr_.store(buff, std::memory_order_release);

return true;
}

uint64_t* GpuAgent::ObtainEndTsObject() {
if (end_ts_base_addr_ == NULL) {
return NULL;
}

const uint32_t end_ts_index =
end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
end_ts_pool_size_;
const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
assert(IsMultipleOf(end_ts_addr, kTsSize));

return end_ts_addr;
}

hsa_status_t GpuAgent::IterateRegion(
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const {
Expand Down Expand Up @@ -701,10 +642,6 @@ hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
}

hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
if (enable && !InitEndTsPool()) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

for (auto& blit : blits_) {
if (blit.created()) {
const hsa_status_t stat = blit->EnableProfiling(enable);
Expand Down Expand Up @@ -1099,16 +1036,28 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {

void GpuAgent::TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) {
uint64_t start, end;
signal->GetRawTs(false, start, end);
// Order is important, we want to translate the end time first to ensure that packet duration is
// not impacted by clock measurement latency jitter.
time.end = TranslateTime(signal->signal_.end_ts);
time.start = TranslateTime(signal->signal_.start_ts);

if ((signal->signal_.start_ts == 0) || (signal->signal_.end_ts == 0) ||
(signal->signal_.start_ts > t1_.GPUClockCounter) ||
(signal->signal_.end_ts > t1_.GPUClockCounter) ||
(signal->signal_.start_ts < t0_.GPUClockCounter) ||
(signal->signal_.end_ts < t0_.GPUClockCounter))
time.end = TranslateTime(end);
time.start = TranslateTime(start);

if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
(start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
}

void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) {
uint64_t start, end;
signal->GetRawTs(true, start, end);
// Order is important, we want to translate the end time first to ensure that packet duration is
// not impacted by clock measurement latency jitter.
time.end = TranslateTime(end);
time.start = TranslateTime(start);

if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
(start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
}

Expand Down Expand Up @@ -1215,11 +1164,6 @@ void GpuAgent::BindTrapHandler() {
return;
}

// Disable trap handler on APUs until KFD is fixed.
if (profile_ == HSA_PROFILE_FULL) {
return;
}

// Assemble the trap handler source code.
AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_);

Expand Down
8 changes: 7 additions & 1 deletion src/core/runtime/default_signal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
timer::fast_clock::time_point start_time, time;
start_time = timer::fast_clock::now();

// Set a polling timeout value
// Should be a few times bigger than null kernel latency
const timer::fast_clock::duration kMaxElapsed = std::chrono::microseconds(200);

uint64_t hsa_freq;
HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq);
const timer::fast_clock::duration fast_timeout =
Expand Down Expand Up @@ -127,7 +131,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
value = atomic::Load(&signal_.value, std::memory_order_relaxed);
return hsa_signal_value_t(value);
}
os::uSleep(20);
if (time - start_time > kMaxElapsed) {
os::uSleep(20);
}
}
}

Expand Down
Loading

0 comments on commit b507d85

Please sign in to comment.