Skip to content

Commit

Permalink
[OpenMP][Offload][AMDGPU] Add support for computing achieved occupanc…
Browse files Browse the repository at this point in the history
…y for a launched kernel

This patch introduces a new feature that computes the achieved occupancy of a launched kernel based on the number of threads, the number of teams, and the maximum (theoretical) occupancy.

Change-Id: Ida34b9b4c1f3471bc540a277ac864ad124841a35
  • Loading branch information
Kewen12 authored and ronlieb committed Sep 14, 2024
1 parent 59d693e commit 2fdc022
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 4 deletions.
47 changes: 45 additions & 2 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,47 @@ struct AMDGPUKernelTy : public GenericKernelTy {

return Occupancy;
}

/// Compute the achieved kernel occupancy for AMD GPU.
///
/// The launch is converted into a number of waves per compute unit, clamped to
/// the kernel's maximum occupancy (in waves per CU), and expressed as an
/// integer percentage of the total wave slots available on one CU. The result
/// is also cached in AchievedOccupancy.
///
/// \param Device     device the kernel is launched on; queried for the number
///                   of compute units and for the gfx90a check.
/// \param numThreads number of threads per team used for the launch.
/// \param numTeams   number of teams used for the launch.
/// \returns the achieved occupancy in percent, or 0 when it cannot be computed
///          (no valid MaxOccupancy, or the device reports zero compute units).
unsigned computeAchievedOccupancy(GenericDeviceTy &Device,
                                  uint32_t numThreads,
                                  uint64_t numTeams) const override {
  // Check if max occupancy is available. Reset the cached value as well so a
  // result from an earlier launch is never reported for this one.
  if (MaxOccupancy == 0) {
    AchievedOccupancy = 0;
    return 0;
  }

  // The CU count is used as a divisor below; bail out rather than divide by
  // zero if the device reports none.
  const uint64_t NumCUs = Device.getNumComputeUnits();
  if (NumCUs == 0) {
    AchievedOccupancy = 0;
    return 0;
  }

  // Default number of waves per EU.
  unsigned MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU10;

  // Devices with gfx90a use the smaller per-EU wave limit.
  if (Device.hasGfx90aDevice()) {
    MaxWavesPerEU = llvm::omp::amdgpu_arch::MaxWavesPerEU8;
  }

  // Get the max number of waves per CU.
  const uint64_t MaxNumWaves =
      static_cast<uint64_t>(MaxOccupancy) * llvm::omp::amdgpu_arch::SIMDPerCU;

  // Get the number of waves from the kernel launch parameters. Keep the
  // arithmetic in 64 bits: numTeams is 64-bit and the product must not be
  // truncated before it is clamped below.
  // NOTE(review): this always uses WaveFrontSize64 - confirm behavior for
  // devices running with a different wavefront size.
  const uint64_t WavesPerTeam =
      divideCeil(numThreads, llvm::omp::amdgpu_arch::WaveFrontSize64);
  const uint64_t TotalNumWaves = WavesPerTeam * numTeams;

  // Get the number of waves per CU, capped by what the kernel can achieve.
  const uint64_t WavesPerCU = (TotalNumWaves + NumCUs - 1) / NumCUs;
  const uint64_t AchievedNumWaves = std::min(MaxNumWaves, WavesPerCU);

  // Total number of wave slots each CU supports.
  const uint64_t TotalWaveSlotsPerCU =
      static_cast<uint64_t>(MaxWavesPerEU) * llvm::omp::amdgpu_arch::SIMDPerCU;

  // Compute the occupancy ratio as an integer percentage (rounded down).
  const unsigned Occupancy =
      static_cast<unsigned>((AchievedNumWaves * 100) / TotalWaveSlotsPerCU);

  // Cache the result.
  AchievedOccupancy = Occupancy;

  return Occupancy;
}
};

/// Class representing an HSA signal. Signals are used to define dependencies
Expand Down Expand Up @@ -5029,12 +5070,14 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4luX%4d) "
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
"md:%d md_LB:%ld md_UB:%ld Occupancy: %u n:%s\n",
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
"%d%% n:%s\n",
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
KernelArgs.NumArgs, NumBlocks, NumThreads, 0, 0, GroupSegmentSize,
SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
KernelArgs.Tripcount, NeedsHostServices, isMultiDeviceKernel(),
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, getName());
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
getName());
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
19 changes: 17 additions & 2 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,17 @@ struct GenericKernelTy {
return MaxOccupancy;
}

/// Compute the achieved occupancy of a launched kernel.
///
/// The achieved occupancy is meant to be derived from the number of threads,
/// the number of teams and the max occupancy of this kernel, and describes the
/// occupancy of each CU (SM).
///
/// This generic version performs no computation: it ignores its arguments and
/// hands back whatever value is currently stored in AchievedOccupancy. Derived
/// kernel classes should override it with a device-specific computation.
///
/// \param Device     device the kernel is launched on (unused here).
/// \param numThreads number of threads used for the launch (unused here).
/// \param numTeams   number of teams used for the launch (unused here).
/// \returns the currently cached AchievedOccupancy value.
virtual unsigned computeAchievedOccupancy(GenericDeviceTy &Device,
                                          uint32_t numThreads,
                                          uint64_t numTeams) const {
  // Nothing to compute at this level; report the cached value as-is.
  const unsigned CachedOccupancy = AchievedOccupancy;
  return CachedOccupancy;
}

protected:
/// Get the execution mode name of the kernel.
const char *getExecutionModeName() const {
Expand Down Expand Up @@ -435,8 +446,12 @@ struct GenericKernelTy {
bool IsBareKernel = false;

/// Upper-bound for the launched kernel occupancy.
/// -1 indicates an invalid result.
mutable unsigned MaxOccupancy = -1;
/// 0 indicates an invalid result.
mutable unsigned MaxOccupancy = 0;

/// Achieved occupancy for the launched kernel.
/// 0 indicates an invalid result.
mutable unsigned AchievedOccupancy = 0;
};

/// Information about an allocation, when it has been allocated, and when/if it
Expand Down
3 changes: 3 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
}

// Get achieved occupancy for this kernel.
computeAchievedOccupancy(GenericDevice, NumThreads, NumBlocks);

if (auto Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads,
NumBlocks, MultiDeviceLB, MultiDeviceUB))
return Err;
Expand Down

0 comments on commit 2fdc022

Please sign in to comment.