diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h
index 6fd8be666..c7791ad29 100644
--- a/src/core/inc/amd_gpu_agent.h
+++ b/src/core/inc/amd_gpu_agent.h
@@ -129,10 +129,7 @@ class GpuAgentInt : public core::Agent {
   //
   // @param [in] signal Pointer to signal that provides the async copy timing.
   // @param [out] time Structure to be populated with the host domain value.
-  virtual void TranslateTime(core::Signal* signal,
-                             hsa_amd_profiling_async_copy_time_t& time) {
-    return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
-  }
+  virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;
 
   // @brief Translate timestamp agent domain to host domain.
   //
@@ -248,9 +245,6 @@ class GpuAgent : public GpuAgentInt {
   // @brief Override from core::Agent.
   hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
 
-  // @brief Get the next available end timestamp object.
-  uint64_t* ObtainEndTsObject();
-
   // @brief Override from core::Agent.
   hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
 
@@ -284,6 +278,9 @@ class GpuAgent : public GpuAgentInt {
   void TranslateTime(core::Signal* signal,
                      hsa_amd_profiling_dispatch_time_t& time) override;
 
+  // @brief Override from amd::GpuAgentInt.
+  void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;
+
   // @brief Override from amd::GpuAgentInt.
   uint64_t TranslateTime(uint64_t tick) override;
 
@@ -490,9 +487,6 @@ class GpuAgent : public GpuAgentInt {
 
   // @brief Create internal queues and blits.
   void InitDma();
-  // @brief Initialize memory pool for end timestamp object.
-  // @retval True if the memory pool for end timestamp object is initialized.
-  bool InitEndTsPool();
 
   // Bind index of peer device that is connected via xGMI links
   lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
 
@@ -503,23 +497,12 @@ class GpuAgent : public GpuAgentInt {
 
   // Bind the Blit object that will drive the copy operation
   lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent,
                                       const core::Agent& src_agent);
-
   // @brief Alternative aperture base address. Only on KV.
   uintptr_t ape1_base_;
 
   // @brief Alternative aperture size. Only on KV.
   size_t ape1_size_;
 
-  // Each end ts is 32 bytes.
-  static const size_t kTsSize = 32;
-
-  // Number of element in the pool.
-  uint32_t end_ts_pool_size_;
-
-  std::atomic<uint32_t> end_ts_pool_counter_;
-
-  std::atomic<uint64_t*> end_ts_base_addr_;
-
   DISALLOW_COPY_AND_ASSIGN(GpuAgent);
 };
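Note on the header change: the removed default implementation simply reinterpreted `hsa_amd_profiling_async_copy_time_t&` as `hsa_amd_profiling_dispatch_time_t&`, which only worked because both public structs carry the same pair of system-domain ticks. With SDMA copy timestamps now stored outside `amd_signal_t`, the async-copy variant needs its own translation path, hence the pure-virtual declaration plus the `GpuAgent` override. For reference, the two public structs are the following (field comments are mine):

```cpp
#include <cstdint>  // uint64_t

// As declared in hsa_ext_amd.h: both structs are a pair of system-domain timestamps,
// which is why the old default could get away with a cast between them.
typedef struct hsa_amd_profiling_dispatch_time_s {
  uint64_t start;  // dispatch begin time, system domain
  uint64_t end;    // dispatch end time, system domain
} hsa_amd_profiling_dispatch_time_t;

typedef struct hsa_amd_profiling_async_copy_time_s {
  uint64_t start;  // copy begin time, system domain
  uint64_t end;    // copy end time, system domain
} hsa_amd_profiling_async_copy_time_t;
```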
diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h
index 82ca6d193..24dd36acc 100644
--- a/src/core/inc/signal.h
+++ b/src/core/inc/signal.h
@@ -82,8 +82,12 @@ class Signal;
 /// @brief ABI and object conversion struct for signals. May be shared between processes.
 struct SharedSignal {
   amd_signal_t amd_signal;
+  uint64_t sdma_start_ts;
   Signal* core_signal;
   Check<0x71FCCA6A3D5D5276, true> id;
+  uint8_t reserved[8];
+  uint64_t sdma_end_ts;
+  uint8_t reserved2[24];
 
   SharedSignal() {
     memset(&amd_signal, 0, sizeof(amd_signal));
@@ -95,6 +99,39 @@ struct SharedSignal {
 
   bool IsIPC() const { return core_signal == nullptr; }
 
+  void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
+    /*
+    SDMA timestamps on gfx7xx/gfx8xx require 32 byte alignment (gfx9xx relaxes the
+    alignment to 8 bytes).  This conflicts with the frozen format of amd_signal_t,
+    so we place the timestamps in sdma_start_ts/sdma_end_ts instead (amd_signal.start_ts
+    is itself properly aligned).  Reading of the timestamps occurs in GetRawTs().
+    */
+    start = &sdma_start_ts;
+    end = &sdma_end_ts;
+  }
+
+  void CopyPrep() {
+    // Clear the SDMA timestamps before a copy so GetRawTs() can detect whether the
+    // copy was done via SDMA or via the blit kernel.
+    sdma_start_ts = 0;
+    sdma_end_ts = 0;
+  }
+
+  void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
+    /*
+    If the read is for a copy we need to check whether it was done by the blit kernel or
+    by SDMA.  CopyPrep() clears sdma_start_ts/sdma_end_ts, so a non-zero sdma_end_ts means
+    the copy went through SDMA; otherwise return the compute kernel stamps from amd_signal.
+    */
+    if (FetchCopyTs && sdma_end_ts != 0) {
+      start = sdma_start_ts;
+      end = sdma_end_ts;
+      return;
+    }
+    start = amd_signal.start_ts;
+    end = amd_signal.end_ts;
+  }
+
   static __forceinline SharedSignal* Convert(hsa_signal_t signal) {
     SharedSignal* ret = reinterpret_cast<SharedSignal*>(static_cast<uintptr_t>(signal.handle) -
                                                         offsetof(SharedSignal, amd_signal));
@@ -112,6 +149,12 @@ static_assert(std::is_standard_layout<SharedSignal>::value,
               "SharedSignal must remain standard layout for IPC use.");
 static_assert(std::is_trivially_destructible<SharedSignal>::value,
               "SharedSignal must not be modified on delete for IPC use.");
+static_assert((offsetof(SharedSignal, sdma_start_ts) % 32) == 0,
+              "Bad SDMA time stamp alignment.");
+static_assert((offsetof(SharedSignal, sdma_end_ts) % 32) == 0,
+              "Bad SDMA time stamp alignment.");
+static_assert(sizeof(SharedSignal) == 128,
+              "Bad SharedSignal size.");
 
 /// @brief Pool class for SharedSignal suitable for use with Shared.
 class SharedSignalPool_t : private BaseShared {
@@ -318,12 +361,23 @@ class Signal {
   /// @brief Checks if signal is currently in use by a wait API.
   bool InWaiting() const { return waiting_ != 0; }
 
+  // Prep for copy profiling: store the copy agent and ready the ABI block.
   __forceinline void async_copy_agent(core::Agent* agent) {
     async_copy_agent_ = agent;
+    core::SharedSignal::Convert(Convert(this))->CopyPrep();
   }
 
   __forceinline core::Agent* async_copy_agent() { return async_copy_agent_; }
 
+  void GetSdmaTsAddresses(uint64_t*& start, uint64_t*& end) {
+    core::SharedSignal::Convert(Convert(this))->GetSdmaTsAddresses(start, end);
+  }
+
+  // Set FetchCopyTs = true when reading timestamps from a copy operation.
+  void GetRawTs(bool FetchCopyTs, uint64_t& start, uint64_t& end) {
+    core::SharedSignal::Convert(Convert(this))->GetRawTs(FetchCopyTs, start, end);
+  }
+
   /// @brief Structure which defines key signal elements like type and value.
   /// Address of this struct is used as a value for the opaque handle of type
   /// hsa_signal_t provided to the public API.
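The new static_asserts pin the SDMA timestamp slots to 32-byte boundaries without touching the frozen `amd_signal_t` layout. Below is a minimal standalone sketch of that arithmetic; `MockSharedSignal` and `kAmdSignalSize` are illustrative stand-ins, not runtime types, and the sketch assumes `sizeof(amd_signal_t) == 64` (which the 128-byte size assert implies) plus 64-bit pointers:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kAmdSignalSize = 64;  // assumed sizeof(amd_signal_t)

struct MockSharedSignal {
  uint8_t amd_signal[kAmdSignalSize];  // stand-in for amd_signal_t, offsets 0..63
  uint64_t sdma_start_ts;              // offset 64 -> 32-byte aligned
  void* core_signal;                   // offset 72
  uint64_t id;                         // stand-in for Check<...>, offset 80
  uint8_t reserved[8];                 // offset 88, pads sdma_end_ts out to 96
  uint64_t sdma_end_ts;                // offset 96 -> 32-byte aligned
  uint8_t reserved2[24];               // pads the struct out to 128 bytes
};

static_assert(offsetof(MockSharedSignal, sdma_start_ts) % 32 == 0, "start ts misaligned");
static_assert(offsetof(MockSharedSignal, sdma_end_ts) % 32 == 0, "end ts misaligned");
static_assert(sizeof(MockSharedSignal) == 128, "unexpected size");

int main() {
  std::printf("sdma_start_ts offset: %zu\n", offsetof(MockSharedSignal, sdma_start_ts));
  std::printf("sdma_end_ts offset:   %zu\n", offsetof(MockSharedSignal, sdma_end_ts));
}
```

The reserved padding is what keeps `sdma_end_ts` on a 32-byte boundary while leaving room for future fields without breaking the shared (IPC) layout.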
diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp
index 06f19c5cb..f7b5653b2 100644
--- a/src/core/runtime/amd_blit_kernel.cpp
+++ b/src/core/runtime/amd_blit_kernel.cpp
@@ -625,9 +625,8 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
   // Insert barrier packets to handle dependent signals.
   // Barrier bit keeps signal checking traffic from competing with a copy.
   const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
-      (1 << HSA_PACKET_HEADER_BARRIER) |
       (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
-      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+      (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
   hsa_barrier_and_packet_t barrier_packet = {0};
   barrier_packet.header = HSA_PACKET_TYPE_INVALID;
@@ -807,7 +806,6 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
   static const uint16_t kDispatchPacketHeader =
       (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-      (((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
       (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
 
diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp
index 8bdd7b279..2a189de89 100644
--- a/src/core/runtime/amd_blit_sdma.cpp
+++ b/src/core/runtime/amd_blit_sdma.cpp
@@ -141,8 +141,8 @@ hsa_status_t BlitSdma::Initial
     platform_atomic_support_ = link.info.atomic_support_64bit;
   }
 
-  // Determine if sDMA microcode supports HDP flush command
-  if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
+  // HDP flush is supported on gfx900 and later.
+  if (agent_->isa()->GetMajorVersion() > 8) {
     hdp_flush_support_ = true;
   }
 
@@ -248,22 +248,13 @@ hsa_status_t BlitSdma::SubmitC
   // profiling in the middle of the call.
   const bool profiling_enabled = agent_->profiling_enabled();
 
-  uint64_t* end_ts_addr = NULL;
+  uint64_t* start_ts_addr = nullptr;
+  uint64_t* end_ts_addr = nullptr;
   uint32_t total_timestamp_command_size = 0;
   if (profiling_enabled) {
-    // SDMA timestamp packet requires 32 byte of aligned memory, but
-    // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
-    // read from a 32 byte aligned bounce buffer is required to avoid changing
-    // the amd_signal_t ABI.
-    end_ts_addr = agent_->ObtainEndTsObject();
-    if (end_ts_addr == NULL) {
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-
-    total_timestamp_command_size =
-        (2 * timestamp_command_size_) + linear_copy_command_size_;
+    out_signal.GetSdmaTsAddresses(start_ts_addr, end_ts_addr);
+    total_timestamp_command_size = 2 * timestamp_command_size_;
   }
 
   // On agent that does not support platform atomic, we replace it with
@@ -315,8 +306,7 @@ hsa_status_t BlitSdma::SubmitC
   }
 
   if (profiling_enabled) {
-    BuildGetGlobalTimestampCommand(
-        command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
+    BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(start_ts_addr));
     command_addr += timestamp_command_size_;
   }
 
@@ -337,11 +327,6 @@ hsa_status_t BlitSdma::SubmitC
     BuildGetGlobalTimestampCommand(command_addr, reinterpret_cast<void*>(end_ts_addr));
     command_addr += timestamp_command_size_;
-
-    BuildCopyCommand(command_addr, 1,
-                     reinterpret_cast<void*>(&out_signal.signal_.end_ts),
-                     reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
-    command_addr += linear_copy_command_size_;
   }
 
   // After transfer is completed, decrement the signal value.
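With the SDMA engine now writing its timestamps straight into the signal's `sdma_start_ts`/`sdma_end_ts` slots, the 32-byte bounce buffer and the trailing copy packet disappear, and a profiled copy needs only two timestamp commands. The sketch below only illustrates the alignment rule the new layout satisfies (32 bytes on gfx7xx/gfx8xx, 8 bytes on gfx9xx and later, per the comment added in signal.h); `CheckTsAlignment` and its `isa_major` parameter are hypothetical names, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Returns true when ts_addr meets the SDMA timestamp-write alignment rule for the
// given ISA major version (illustrative helper only).
bool CheckTsAlignment(const uint64_t* ts_addr, int isa_major) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ts_addr);
  const uintptr_t required = (isa_major >= 9) ? 8u : 32u;
  return (addr % required) == 0;
}

int main() {
  alignas(32) static uint64_t stamps[2];     // mirrors the sdma_start_ts / sdma_end_ts placement
  assert(CheckTsAlignment(&stamps[0], 8));   // gfx8: needs the 32-byte aligned slot
  assert(CheckTsAlignment(&stamps[1], 9));   // gfx9: 8-byte alignment is enough
  return 0;
}
```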
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 4d2d23b35..5b27486dd 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -86,10 +86,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
       memory_bus_width_(0),
       memory_max_frequency_(0),
       ape1_base_(0),
-      ape1_size_(0),
-      end_ts_pool_size_(0),
-      end_ts_pool_counter_(0),
-      end_ts_base_addr_(NULL) {
+      ape1_size_(0) {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
 
@@ -144,10 +141,6 @@ GpuAgent::~GpuAgent() {
     }
   }
 
-  if (end_ts_base_addr_ != NULL) {
-    core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
-  }
-
   if (ape1_base_ != 0) {
     _aligned_free(reinterpret_cast<void*>(ape1_base_));
   }
@@ -405,58 +398,6 @@ void GpuAgent::InitCacheList() {
                                      cache_props_[i].CacheLevel, cache_props_[i].CacheSize));
 }
 
-bool GpuAgent::InitEndTsPool() {
-  if (HSA_PROFILE_FULL == profile_) {
-    return true;
-  }
-
-  if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
-    return true;
-  }
-
-  ScopedAcquire<KernelMutex> lock(&blit_lock_);
-
-  if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
-    return true;
-  }
-
-  end_ts_pool_size_ =
-      static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
-                            (BlitSdmaBase::kCopyPacketSize));
-
-  // Allocate end timestamp object for both h2d and d2h DMA.
-  const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
-
-  core::Runtime* runtime = core::Runtime::runtime_singleton_;
-
-  uint64_t* buff = NULL;
-  if (HSA_STATUS_SUCCESS !=
-      runtime->AllocateMemory(local_region_, alloc_size,
-                              MemoryRegion::AllocateRestrict,
-                              reinterpret_cast<void**>(&buff))) {
-    return false;
-  }
-
-  end_ts_base_addr_.store(buff, std::memory_order_release);
-
-  return true;
-}
-
-uint64_t* GpuAgent::ObtainEndTsObject() {
-  if (end_ts_base_addr_ == NULL) {
-    return NULL;
-  }
-
-  const uint32_t end_ts_index =
-      end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
-      end_ts_pool_size_;
-  const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
-  uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
-  assert(IsMultipleOf(end_ts_addr, kTsSize));
-
-  return end_ts_addr;
-}
-
 hsa_status_t GpuAgent::IterateRegion(
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
@@ -701,10 +642,6 @@ hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
 }
 
 hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
-  if (enable && !InitEndTsPool()) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
   for (auto& blit : blits_) {
     if (blit.created()) {
       const hsa_status_t stat = blit->EnableProfiling(enable);
@@ -1099,16 +1036,28 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
 
 void GpuAgent::TranslateTime(core::Signal* signal,
                              hsa_amd_profiling_dispatch_time_t& time) {
+  uint64_t start, end;
+  signal->GetRawTs(false, start, end);
   // Order is important, we want to translate the end time first to ensure that packet duration is
   // not impacted by clock measurement latency jitter.
-  time.end = TranslateTime(signal->signal_.end_ts);
-  time.start = TranslateTime(signal->signal_.start_ts);
-
-  if ((signal->signal_.start_ts == 0) || (signal->signal_.end_ts == 0) ||
-      (signal->signal_.start_ts > t1_.GPUClockCounter) ||
-      (signal->signal_.end_ts > t1_.GPUClockCounter) ||
-      (signal->signal_.start_ts < t0_.GPUClockCounter) ||
-      (signal->signal_.end_ts < t0_.GPUClockCounter))
+  time.end = TranslateTime(end);
+  time.start = TranslateTime(start);
+
+  if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
+      (start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
+    debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
+}
+
+void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) {
+  uint64_t start, end;
+  signal->GetRawTs(true, start, end);
+  // Order is important, we want to translate the end time first to ensure that packet duration is
+  // not impacted by clock measurement latency jitter.
+  time.end = TranslateTime(end);
+  time.start = TranslateTime(start);
+
+  if ((start == 0) || (end == 0) || (start > t1_.GPUClockCounter) || (end > t1_.GPUClockCounter) ||
+      (start < t0_.GPUClockCounter) || (end < t0_.GPUClockCounter))
     debug_print("Signal %p time stamps may be invalid.", &signal->signal_);
 }
 
@@ -1215,11 +1164,6 @@ void GpuAgent::BindTrapHandler() {
     return;
   }
 
-  // Disable trap handler on APUs until KFD is fixed.
-  if (profile_ == HSA_PROFILE_FULL) {
-    return;
-  }
-
   // Assemble the trap handler source code.
   AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_,
                  trap_code_buf_size_);
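Both `TranslateTime` overloads now read the raw stamps through `Signal::GetRawTs()` and convert them from GPU ticks to the host clock domain using the calibration pair `t0_`/`t1_`, translating the end stamp first so measurement jitter cannot inflate the reported duration. The snippet below sketches the underlying linear mapping; `ClockPair` and `Translate` are illustrative names and this is not the runtime's implementation of `GpuAgent::TranslateTime(uint64_t)`:

```cpp
#include <cstdint>

struct ClockPair {
  uint64_t gpu;     // GPU-domain tick    (cf. t0_/t1_.GPUClockCounter)
  uint64_t system;  // system-domain tick (cf. t0_/t1_.SystemClockCounter)
};

// Linearly map a GPU tick into the system domain using two calibration samples.
uint64_t Translate(uint64_t gpu_tick, const ClockPair& t0, const ClockPair& t1) {
  const double ratio = double(t1.system - t0.system) / double(t1.gpu - t0.gpu);
  return t0.system + uint64_t(ratio * double(gpu_tick - t0.gpu));
}

int main() {
  const ClockPair t0{1000, 5000}, t1{2000, 7000};   // made-up calibration samples
  // Halfway between the GPU samples lands halfway between the system samples.
  return Translate(1500, t0, t1) == 6000 ? 0 : 1;
}
```

The validity check against `t0_.GPUClockCounter`/`t1_.GPUClockCounter` in both overloads flags stamps that fall outside the calibration window, since extrapolating far beyond the sampled interval would amplify clock drift.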
diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp
index 249736dd7..350a214ca 100644
--- a/src/core/runtime/default_signal.cpp
+++ b/src/core/runtime/default_signal.cpp
@@ -89,6 +89,10 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
   timer::fast_clock::time_point start_time, time;
   start_time = timer::fast_clock::now();
 
+  // Set a polling timeout value.
+  // Should be a few times larger than the null-kernel latency.
+  const timer::fast_clock::duration kMaxElapsed = std::chrono::microseconds(200);
+
   uint64_t hsa_freq;
   HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq);
   const timer::fast_clock::duration fast_timeout =
@@ -127,7 +131,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition,
       value = atomic::Load(&signal_.value, std::memory_order_relaxed);
       return hsa_signal_value_t(value);
     }
-    os::uSleep(20);
+    if (time - start_time > kMaxElapsed) {
+      os::uSleep(20);
+    }
   }
 }
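The `BusyWaitSignal` change polls without sleeping for the first 200 microseconds (`kMaxElapsed`) and only then starts yielding with `os::uSleep(20)`, which keeps short waits at minimum latency while still backing off for long ones. A generic sketch of that spin-then-sleep shape, with names that are not from the runtime:

```cpp
#include <chrono>
#include <thread>

// Poll `done` aggressively for `spin_budget`, then sleep between polls until `timeout`.
template <typename Predicate>
bool SpinThenSleepWait(Predicate done, std::chrono::microseconds spin_budget,
                       std::chrono::milliseconds timeout) {
  using clock = std::chrono::steady_clock;
  const auto start = clock::now();
  while (!done()) {
    const auto elapsed = clock::now() - start;
    if (elapsed > timeout) return false;  // give up, caller handles the timeout
    if (elapsed > spin_budget)
      std::this_thread::sleep_for(std::chrono::microseconds(20));  // analogous to os::uSleep(20)
    // else: keep spinning to catch short waits with minimum latency
  }
  return true;
}
```

Usage would look like `SpinThenSleepWait([&] { return flag.load(); }, std::chrono::microseconds(200), std::chrono::milliseconds(5));` the 200 microsecond budget mirrors `kMaxElapsed` and should be tuned to the platform's null-kernel latency, as the added comment suggests.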
diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp
index 5fd678784..df93bf27d 100644
--- a/src/core/runtime/hsa_ext_amd.cpp
+++ b/src/core/runtime/hsa_ext_amd.cpp
@@ -382,20 +382,10 @@ hsa_status_t hsa_amd_profiling_get_async_copy_time(
 
   core::Agent* agent = signal->async_copy_agent();
 
-  if (agent == NULL) {
+  if (agent == nullptr) {
     return HSA_STATUS_ERROR;
   }
 
-  // Validate the embedded agent pointer if the signal is IPC.
-  if (signal->isIPC()) {
-    for (auto it : core::Runtime::runtime_singleton_->gpu_agents()) {
-      if (it == agent) break;
-    }
-    // If the agent isn't a GPU then it is from a different process or it's a CPU.
-    // Assume it's a CPU and illegal uses will generate garbage data same as kernel completion.
-    agent = core::Runtime::runtime_singleton_->cpu_agents()[0];
-  }
-
   if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) {
     // Translate timestamp from GPU to system domain.
     static_cast<amd::GpuAgentInt*>(agent)->TranslateTime(signal, *time);
diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp
index 488c44083..5940a51af 100644
--- a/src/core/runtime/runtime.cpp
+++ b/src/core/runtime/runtime.cpp
@@ -471,6 +471,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
   // For cpu to cpu, fire and forget a copy thread.
   const bool profiling_enabled =
       (dst_agent.profiling_enabled() || src_agent.profiling_enabled());
+  if (profiling_enabled) completion_signal.async_copy_agent(&dst_agent);
   std::thread(
       [](void* dst, const void* src, size_t size, std::vector<core::Signal*> dep_signals,
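With the IPC special case gone, `hsa_amd_profiling_get_async_copy_time` simply trusts the agent recorded by `Signal::async_copy_agent()`, which `Runtime::CopyMemory` now also sets for profiled CPU-to-CPU copies. From an application's point of view the flow is unchanged; below is a hedged usage sketch, assuming `hsa_init` has already run, that `dst`/`src`/`dst_agent`/`src_agent` describe an already prepared copy, and that the ROCm header layout is `<hsa/hsa.h>` (adjust to your install):

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

hsa_status_t TimeOneCopy(void* dst, hsa_agent_t dst_agent,
                         const void* src, hsa_agent_t src_agent, size_t size) {
  hsa_signal_t copy_done;
  hsa_status_t err = hsa_signal_create(1, 0, nullptr, &copy_done);
  if (err != HSA_STATUS_SUCCESS) return err;

  // Must be enabled before the copy is submitted or no timestamps are collected.
  hsa_amd_profiling_async_copy_enable(true);

  err = hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size,
                                  0, nullptr, copy_done);
  if (err == HSA_STATUS_SUCCESS) {
    hsa_signal_wait_scacquire(copy_done, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                              HSA_WAIT_STATE_BLOCKED);

    // With this patch the stamps come from the signal's SDMA slots (or from
    // amd_signal for blit-kernel copies), already translated to the system domain.
    hsa_amd_profiling_async_copy_time_t t = {};
    err = hsa_amd_profiling_get_async_copy_time(copy_done, &t);
  }

  hsa_signal_destroy(copy_done);
  return err;
}
```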