Skip to content

Commit

Permalink
Merge pull request #78 from RadeonOpenCompute/roc-2.10.x
Browse files Browse the repository at this point in the history
ROCm 2.10.0 updates
  • Loading branch information
skeelyamd authored Nov 23, 2019
2 parents b507d85 + 0ac4868 commit 8546125
Show file tree
Hide file tree
Showing 15 changed files with 277 additions and 44 deletions.
1 change: 1 addition & 0 deletions src/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ struct ScratchInfo {
void* queue_base;
size_t size;
size_t size_per_thread;
uint32_t lanes_per_wave;
ptrdiff_t queue_process_offset;
bool large;
bool retry;
Expand Down
6 changes: 6 additions & 0 deletions src/core/inc/amd_gpu_pm4.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@
# define PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA (1 << 29)
#define PM4_ACQUIRE_MEM_DW2_COHER_SIZE(x) (((x) & 0xFFFFFFFF) << 0)
#define PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(x) (((x) & 0xFF) << 0)
// GCR_CNTL field of the ACQUIRE_MEM packet. The macro keeps the low 19 bits
// (mask 0x7FFFF) of x and places them at bit 0.
// NOTE(review): the "DW7" in the name suggests this belongs in packet dword 7
// - confirm against the PM4 packet specification.
#define PM4_ACQUIRE_MEM_DW7_GCR_CNTL(x) (((x) & 0x7FFFF) << 0)
// Sub-fields to be OR-ed together and passed to PM4_ACQUIRE_MEM_DW7_GCR_CNTL.
// All of them fit inside the 19-bit mask above.
// GLI_INV: 2-bit value at bits [1:0].
# define PM4_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x) (((x) & 0x3) << 0)
// GLK_INV: single-bit flag, bit 7.
# define PM4_ACQUIRE_MEM_GCR_CNTL_GLK_INV (1 << 7)
// GLV_INV: single-bit flag, bit 8.
# define PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV (1 << 8)
// GL1_INV: single-bit flag, bit 9.
# define PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV (1 << 9)
// GL2_INV: single-bit flag, bit 14.
# define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV (1 << 14)

#define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8)
# define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7
Expand Down
86 changes: 86 additions & 0 deletions src/core/inc/amd_gpu_shaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ static const unsigned int kCodeTrapHandler9[] = {
.set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26
.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15
.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000
.elseif .amdgcn.gfx_generation_number == 10
.set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31
.set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24
.set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25
.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15
.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000
.set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000
.else
.error "unsupported target"
.endif
Expand Down Expand Up @@ -217,6 +224,14 @@ static const unsigned int kCodeTrapHandler9[] = {
s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
.endif
.if .amdgcn.gfx_generation_number == 10
s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
s_or_b32 ttmp2, ttmp2, ttmp3
s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
.endif
// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
Expand Down Expand Up @@ -296,6 +311,77 @@ static const unsigned int kCodeFill8[] = {
0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
};

// Pre-assembled table of 32-bit words for the "copy aligned" kernel.
// NOTE(review): the "10" suffix follows the naming of the sibling tables
// (kCodeTrapHandler9, kCodeFill8), so this is presumably the gfx10 build of
// the kernel - confirm against the shader assembly source.
// The words are opaque machine code: do not reorder, reformat by hand, or
// edit individual values; regenerate the whole table from the assembly.
static const unsigned int kCodeCopyAligned10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02,
0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006,
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E,
0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03,
0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05,
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209,
0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04,
0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E,
0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05,
0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02,
0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E,
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E,
0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05,
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010,
0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05,
0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000,
0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000,
};

// Pre-assembled table of 32-bit words for the "copy misaligned" kernel.
// NOTE(review): the "10" suffix follows the naming of the sibling tables
// (kCodeTrapHandler9, kCodeFill8), so this is presumably the gfx10 build of
// the kernel - confirm against the shader assembly source.
// The words are opaque machine code: do not reorder, reformat by hand, or
// edit individual values; regenerate the whole table from the assembly.
static const unsigned int kCodeCopyMisaligned10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002,
0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207,
0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102,
0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008,
0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05,
0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70,
0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105,
0xBF82FFEE, 0xBF810000,
};

// Pre-assembled table of 32-bit words for the "fill" kernel.
// NOTE(review): the "10" suffix follows the naming of the sibling tables
// (kCodeTrapHandler9, kCodeFill8), so this is presumably the gfx10 build of
// the kernel - confirm against the shader assembly source.
// The words are opaque machine code: do not reorder, reformat by hand, or
// edit individual values; regenerate the whole table from the assembly.
static const unsigned int kCodeFill10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602,
0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03,
0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402,
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B,
0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103,
0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402,
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
};

// Pre-assembled table of 32-bit words for the trap handler.
// NOTE(review): presumably the gfx10 build of the trap handler whose assembly
// listing accompanies kCodeTrapHandler9 - confirm. Consistent with that, the
// literal words 0x003F8000 and 0x02000000 near the end of the table equal the
// SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK and SQ_WAVE_IB_STS_REPLAY_W64H_MASK
// values that the listing defines for generation 10.
// The words are opaque machine code: do not reorder, reformat by hand, or
// edit individual values; regenerate the whole table from the assembly.
static const unsigned int kCodeTrapHandler10[] = {
0xB96EF803, 0x8770FF6E, 0x10000100, 0xBF06FF70, 0x00000100, 0xBEF003FF,
0x20000000, 0xBF85000E, 0x8770FF6E, 0x00000800, 0xBEF003F4, 0xBF85000A,
0x93EEFF6D, 0x00080010, 0xBF84002C, 0xBF06826E, 0xBEF003FF, 0x80000000,
0xBF850003, 0x806C846C, 0x826D806D, 0xBF820025, 0xBEFE03FF, 0x80000000,
0xBF90000A, 0xBF800007, 0xBF0C9F7E, 0xBF84FFFD, 0x876EFF7E, 0x000003FF,
0x8F6E836E, 0xF4051BBD, 0xDC000000, 0xBF8CC07F, 0xF4051BB7, 0xFA0000C0,
0xBF8CC07F, 0xBEF10380, 0xF6811C37, 0xFA000008, 0xBF8CC07F, 0x88707170,
0xBF85000E, 0xF4051C37, 0xFA000010, 0xBF8CC07F, 0x87F07070, 0xBF840009,
0xF4011BB7, 0xFA000018, 0xBF8CC07F, 0xF4411BB8, 0xFA000000, 0xBF8CC07F,
0xBEFC0380, 0xBF800000, 0xBF900001, 0x8878FF78, 0x00002000, 0x906E8977,
0x876FFF6E, 0x003F8000, 0x906E8677, 0x876EFF6E, 0x02000000, 0x886E6F6E,
0xB9EEF807, 0x87FE7E7E, 0x87EA6A6A, 0xB9F8F802, 0xBE80226C,
};

} // namespace amd

#endif // header guard
38 changes: 38 additions & 0 deletions src/core/inc/registers.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006,
BUF_NUM_FORMAT_FLOAT = 0x00000007,
} BUF_NUM_FORMAT;

// Buffer format codes for the 7-bit FORMAT field of
// SQ_BUF_RSRC_WORD3_GFX10. Only the value used by the runtime is listed.
enum BUF_FORMAT {
  BUF_FORMAT_32_UINT = 0x14,  // written to FORMAT when building the scratch SRD
};
typedef enum BUF_FORMAT BUF_FORMAT;

typedef enum SQ_SEL_XYZW01 {
SQ_SEL_0 = 0x00000000,
SQ_SEL_1 = 0x00000001,
Expand Down Expand Up @@ -201,4 +205,38 @@ SQ_SEL_W = 0x00000007,
float f32All;
};

// Fourth dword (word 3) of a buffer resource descriptor in the GFX10 layout.
// Alternative to SQ_BUF_RSRC_WORD3 with a single 7-bit FORMAT field plus
// RESOURCE_LEVEL and OOB_SELECT fields.
//
// The same 32 bits can be viewed as named bit-fields (`bitfields` / `bits`,
// two names for the same struct) or as a raw value (`u32All`, `i32All`,
// `f32All`). The field widths sum to exactly 32 bits. The big-endian branch
// lists the same fields in reverse order.
// NOTE(review): if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined the
// struct has no members - confirm the build always defines one of them.
union SQ_BUF_RSRC_WORD3_GFX10 {
struct {
#if defined(LITTLEENDIAN_CPU)
// Per-component destination selects, 3 bits each.
unsigned int DST_SEL_X : 3;
unsigned int DST_SEL_Y : 3;
unsigned int DST_SEL_Z : 3;
unsigned int DST_SEL_W : 3;
// Buffer format code, 7 bits (see enum BUF_FORMAT).
unsigned int FORMAT : 7;
unsigned int RESERVED1 : 2;
unsigned int INDEX_STRIDE : 2;
unsigned int ADD_TID_ENABLE : 1;
unsigned int RESOURCE_LEVEL : 1;
unsigned int RESERVED2 : 3;
unsigned int OOB_SELECT : 2;
unsigned int TYPE : 2;
#elif defined(BIGENDIAN_CPU)
// Same fields as above, declared in reverse order.
unsigned int TYPE : 2;
unsigned int OOB_SELECT : 2;
unsigned int RESERVED2 : 3;
unsigned int RESOURCE_LEVEL : 1;
unsigned int ADD_TID_ENABLE : 1;
unsigned int INDEX_STRIDE : 2;
unsigned int RESERVED1 : 2;
unsigned int FORMAT : 7;
unsigned int DST_SEL_W : 3;
unsigned int DST_SEL_Z : 3;
unsigned int DST_SEL_Y : 3;
unsigned int DST_SEL_X : 3;
#endif
} bitfields, bits;
// Raw views of the whole dword.
unsigned int u32All;
signed int i32All;
float f32All;
};
#endif // header guard
9 changes: 8 additions & 1 deletion src/core/inc/sdma_registers.h
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,14 @@ typedef struct SDMA_PKT_FENCE_TAG {
struct {
unsigned int op : 8;
unsigned int sub_op : 8;
unsigned int reserved_0 : 16;
unsigned int mtype : 3;
unsigned int gcc : 1;
unsigned int sys : 1;
unsigned int pad1 : 1;
unsigned int snp : 1;
unsigned int gpa : 1;
unsigned int l2_policy : 2;
unsigned int reserved_0 : 6;
};
unsigned int DW_0_DATA;
} HEADER_UNION;
Expand Down
76 changes: 52 additions & 24 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
}

// Process only one queue error.
if (error_code == 1) {
if (error_code & 0x401) { // insufficient scratch, wave64 or wave32
// Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
auto& scratch = queue->queue_scratch_;

Expand All @@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
uint32_t scratch_request = pkt.dispatch.private_segment_size;

scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;

queue->agent_->AcquireQueueScratch(scratch);

Expand Down Expand Up @@ -948,7 +949,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
rel_mem[4] = 0;
rel_mem[5] = 0;
rel_mem[6] = 0;
} else if (agent_->isa()->GetMajorVersion() == 9) {
} else if (agent_->isa()->GetMajorVersion() >= 9) {
// Construct an AQL packet to jump to the PM4 IB.
struct amd_aql_pm4_ib {
uint16_t header;
Expand Down Expand Up @@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
SQ_BUF_RSRC_WORD0 srd0;
SQ_BUF_RSRC_WORD1 srd1;
SQ_BUF_RSRC_WORD2 srd2;
SQ_BUF_RSRC_WORD3 srd3;
uint32_t srd3_u32;

uint32_t scratch_base_hi = 0;
uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
Expand All @@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {

srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);

srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
srd3.bits.ELEMENT_SIZE = 1; // 4
srd3.bits.INDEX_STRIDE = 3; // 64
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
srd3.bits.HASH_ENABLE = 0;
srd3.bits.HEAP = 0;
srd3.bits.MTYPE__CI__VI = 0;
srd3.bits.TYPE = SQ_RSRC_BUF;
if (agent_->isa()->GetMajorVersion() < 10) {
SQ_BUF_RSRC_WORD3 srd3;

srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
srd3.bits.ELEMENT_SIZE = 1; // 4
srd3.bits.INDEX_STRIDE = 3; // 64
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
srd3.bits.HASH_ENABLE = 0;
srd3.bits.HEAP = 0;
srd3.bits.MTYPE__CI__VI = 0;
srd3.bits.TYPE = SQ_RSRC_BUF;

srd3_u32 = srd3.u32All;
} else {
SQ_BUF_RSRC_WORD3_GFX10 srd3;

srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
srd3.bits.RESERVED1 = 0;
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.RESOURCE_LEVEL = 1;
srd3.bits.RESERVED2 = 0;
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
srd3.bits.TYPE = SQ_RSRC_BUF;

srd3_u32 = srd3.u32All;
}

// Update Queue's Scratch descriptor's property
amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
amd_queue_.scratch_resource_descriptor[3] = srd3_u32;

// Populate flat scratch parameters in amd_queue_.
amd_queue_.scratch_backing_memory_location =
queue_scratch_.queue_process_offset;
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
amd_queue_.scratch_workitem_byte_size =
uint32_t(queue_scratch_.size_per_thread);

// For backwards compatibility this field records the per-lane scratch
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves
// then the effective size for a 64 lane wave is halved.
amd_queue_.scratch_wave64_lane_byte_size =
uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);

// Set concurrent wavefront limits only when scratch is being used.
COMPUTE_TMPRING_SIZE tmpring_size = {};
Expand All @@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {

// Scratch is allocated; program the COMPUTE_TMPRING_SIZE register.
// Scratch size per wave is specified in kilobytes.
uint32_t wave_size = agent_props.WaveFrontSize;
uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
queue_scratch_.size_per_thread) + 1023) / 1024);
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
Expand Down
7 changes: 6 additions & 1 deletion src/core/runtime/amd_blit_sdma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
}

// HDP flush supported on gfx900 and forward.
if (agent_->isa()->GetMajorVersion() > 8) {
// FIXME: Not working on gfx10, raises SRBM write protection interrupt.
if (agent_->isa()->GetMajorVersion() == 9) {
hdp_flush_support_ = true;
}

Expand Down Expand Up @@ -623,6 +624,10 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceComma

packet_addr->HEADER_UNION.op = SDMA_OP_FENCE;

if (agent_->isa()->GetMajorVersion() >= 10) {
packet_addr->HEADER_UNION.mtype = 3;
}

packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(fence);

packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(fence);
Expand Down
Loading

0 comments on commit 8546125

Please sign in to comment.