From 586f50baded5d9183dd703f4f31d4fd4bed9880d Mon Sep 17 00:00:00 2001
From: xuechen2
Date: Mon, 11 Dec 2023 10:30:56 +0800
Subject: [PATCH] Handle null descriptor for patching global buffer operations

Null descriptor is not taken into consideration on patching global
load/store/atomic operations when `allowNullDescriptor` is enabled. If
dword3 of the descriptor is zero, we will return 0 for load/atomic and
skip the store.

NOTE: we use dword3 to check for a null descriptor, so we must not
change the descriptor itself if it is null in `CreateBufferDesc`.
---
 lgc/builder/DescBuilder.cpp                   |  10 +-
 lgc/include/lgc/patch/PatchBufferOp.h         |   2 +
 lgc/patch/PatchBufferOp.cpp                   | 164 ++++++++++--------
 lgc/test/Transforms/PatchBufferOp/simple.lgc  | 150 +++++++++-------
 .../Transforms/PatchBufferOp/uniform-phi.lgc  |  25 +--
 5 files changed, 206 insertions(+), 145 deletions(-)

diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp
index 6e96c12817..eacc62a788 100644
--- a/lgc/builder/DescBuilder.cpp
+++ b/lgc/builder/DescBuilder.cpp
@@ -188,8 +188,14 @@ Value *BuilderImpl::CreateBufferDesc(uint64_t descSet, unsigned binding, Value *
       desc = CreateInsertElement(desc, CreateAnd(desc1, getInt32(0xc000ffff)), 1);
       desc = CreateInsertElement(desc, CreateMul(stride, desc2), 2);
       // gfx10 and gfx11 have oob fields with 2 bits in dword3[29:28] here force to set to 3 as OOB_COMPLETE mode.
- if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 10) - desc = CreateInsertElement(desc, CreateOr(desc3, getInt32(0x30000000)), 3); + if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 10) { + Value *newDesc3 = CreateOr(desc3, getInt32(0x30000000)); + if (getPipelineState()->getOptions().allowNullDescriptor) { + Value *isNullDesc = CreateICmpEQ(desc3, getInt32(0)); + newDesc3 = CreateSelect(isNullDesc, desc3, newDesc3); + } + desc = CreateInsertElement(desc, newDesc3, 3); + } } } diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/PatchBufferOp.h index 3f9345425c..ef80b98773 100644 --- a/lgc/include/lgc/patch/PatchBufferOp.h +++ b/lgc/include/lgc/patch/PatchBufferOp.h @@ -118,6 +118,8 @@ class BufferOpLowering { llvm::Value *replaceLoadStore(llvm::Instruction &inst); llvm::Instruction *makeLoop(llvm::Value *const loopStart, llvm::Value *const loopEnd, llvm::Value *const loopStride, llvm::Instruction *const insertPos); + Value *createGlobalPointerAccess(llvm::Value *bufferDesc, llvm::Value *offset, llvm::Type *type, + llvm::Instruction &inst, const llvm::function_ref &callback); TypeLowering &m_typeLowering; llvm::IRBuilder<> m_builder; diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index 4a9d3bab0a..1580fa82b2 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -353,34 +353,27 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn // If our buffer descriptor is divergent, need to handle it differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. 
- Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. - Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); - const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); - - Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); - Value *const newValue = atomicCmpXchgInst.getNewValOperand(); - AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( - atomicPointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); - newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); - newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); - newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); - copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + auto createAtomicCmpXchgFunc = [&](Value *pointer) { + const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); + const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); + + Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); + Value *const newValue = atomicCmpXchgInst.getNewValOperand(); + AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( + pointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); + newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); + newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); + newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); + copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + return newAtomicCmpXchg; + }; + Value *result = + 
createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicCmpXchgInst); - atomicCmpXchgInst.replaceAllUsesWith(newAtomicCmpXchg); + atomicCmpXchgInst.replaceAllUsesWith(result); } else { switch (atomicCmpXchgInst.getSuccessOrdering()) { case AtomicOrdering::Release: @@ -459,29 +452,21 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) { // If our buffer descriptor is divergent, need to handle it differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. - Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. 
- Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - AtomicRMWInst *const newAtomicRmw = - m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), atomicPointer, atomicRmwInst.getValOperand(), - atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); - newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); - newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); - copyMetadata(newAtomicRmw, &atomicRmwInst); + auto createAtomicRmwFunc = [&](Value *pointer) { + AtomicRMWInst *const newAtomicRmw = + m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), pointer, atomicRmwInst.getValOperand(), + atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); + newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); + newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); + copyMetadata(newAtomicRmw, &atomicRmwInst); + return newAtomicRmw; + }; + Value *result = createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicRmwInst, createAtomicRmwFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicRmwInst); - atomicRmwInst.replaceAllUsesWith(newAtomicRmw); + atomicRmwInst.replaceAllUsesWith(result); } else { switch (atomicRmwInst.getOrdering()) { case AtomicOrdering::Release: @@ -1292,36 +1277,28 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { // If our buffer descriptor is divergent, need to handle that differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. 
- Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. - Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); - - if (isLoad) { - LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile()); - newLoad->setOrdering(ordering); - newLoad->setSyncScopeID(syncScopeID); - copyMetadata(newLoad, loadInst); - - if (isInvariant) - newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); - - return newLoad; - } - StoreInst *const newStore = - m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile()); - newStore->setOrdering(ordering); - newStore->setSyncScopeID(syncScopeID); - copyMetadata(newStore, storeInst); - - return newStore; + auto createLoadStoreFunc = [&](Value *pointer) { + Value *result = nullptr; + if (isLoad) { + LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile()); + newLoad->setOrdering(ordering); + newLoad->setSyncScopeID(syncScopeID); + copyMetadata(newLoad, loadInst); + + if (isInvariant) + newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); + result = newLoad; + } else { + StoreInst *const newStore = + m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile()); + newStore->setOrdering(ordering); + newStore->setSyncScopeID(syncScopeID); + copyMetadata(newStore, storeInst); + result = newStore; + } + return result; + }; + return createGlobalPointerAccess(bufferDesc, baseIndex, type, inst, createLoadStoreFunc); } switch (ordering) { @@ -1571,3 +1548,44 @@ 
Instruction *BufferOpLowering::makeLoop(Value *const loopStart, Value *const loo return loopCounter; } + +// ===================================================================================================================== +// Create global pointer access. +// +// @param bufferDesc: The buffer descriptor +// @param offset: The offset on the global memory +// @param type: The accessed data type +// @param inst: The instruction to be executed on the buffer +// @param callback: The callback function to perform the specific global access +Value *BufferOpLowering::createGlobalPointerAccess(Value *bufferDesc, Value *offset, Type *type, Instruction &inst, + const function_ref &callback) { + // Handle null descriptor if it is allowed. Load/atomic zero and skip store for null descriptor. + Value *isNullDesc = m_builder.getFalse(); + if (m_pipelineState.getOptions().allowNullDescriptor) { + // Check dword3 against 0 for a null descriptor + Value *descWord3 = m_builder.CreateExtractElement(bufferDesc, 3); + isNullDesc = m_builder.CreateICmpEQ(descWord3, m_builder.getInt32(0)); + } + BasicBlock *const origBlock = inst.getParent(); + Instruction *const terminator = SplitBlockAndInsertIfThen(isNullDesc, &inst, false); + + // Global pointer access + m_builder.SetInsertPoint(terminator); + Value *baseAddr = getBaseAddressFromBufferDesc(bufferDesc); + // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. + Value *bound = m_builder.CreateExtractElement(bufferDesc, 2); + Value *inBound = m_builder.CreateICmpULT(offset, bound); + Value *newOffset = m_builder.CreateSelect(inBound, offset, m_builder.getInt32(0)); + // Add on the index to the address. 
+ Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset); + pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); + Value *newValue = callback(pointer); + + m_builder.SetInsertPoint(&inst); + assert(!type->isVoidTy()); + auto phi = m_builder.CreatePHI(type, 2, "newValue"); + phi->addIncoming(Constant::getNullValue(type), origBlock); + phi->addIncoming(newValue, terminator->getParent()); + + return phi; +} diff --git a/lgc/test/Transforms/PatchBufferOp/simple.lgc b/lgc/test/Transforms/PatchBufferOp/simple.lgc index 35c84beccd..618abc5143 100644 --- a/lgc/test/Transforms/PatchBufferOp/simple.lgc +++ b/lgc/test/Transforms/PatchBufferOp/simple.lgc @@ -29,16 +29,21 @@ define amdgpu_gfx float @uniform_select(<4 x i32> inreg %desc0, <4 x i32> inreg define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inreg %desc1, i1 %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -50,16 +55,21 @@ define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inre define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select1( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -71,16 +81,21 @@ define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %de define amdgpu_gfx float @divergent_select2(<4 x i32> inreg %desc0, <4 x i32> %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select2( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; 
CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -131,16 +146,21 @@ define amdgpu_gfx float @divergent_input0_phi(<4 x i32> %desc0, <4 x i32> inreg ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -167,16 +187,21 @@ define amdgpu_gfx float @divergent_input1_phi(<4 x i32> inreg %desc0, <4 x i32> ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -203,16 +228,21 @@ define amdgpu_gfx float @divergent_sync_phi(<4 x i32> inreg %desc0, <4 x i32> in ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = 
extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b diff --git a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc index 9a1bdcdb55..3a7a3ee37e 100644 --- a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc +++ b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc @@ -16,16 +16,21 @@ define amdgpu_gfx float @uniform_phi(<4 x i32> inreg %desc0, <4 x i32> inreg %de ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> 
[[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b