From 586f50baded5d9183dd703f4f31d4fd4bed9880d Mon Sep 17 00:00:00 2001
From: xuechen2
Date: Mon, 11 Dec 2023 10:30:56 +0800
Subject: [PATCH] Handle null descriptor for patching global buffer operations

Null descriptor is not taken into consideration on patching global
load/store/atomic operations when `allowNullDescriptor` is enabled. If
dword3 of the descriptor is zero, we will return 0 for load/atomic and
skip the store.

NOTE: we use dword3 to check for a null descriptor, so we must not
change the descriptor itself if it is null in `CreateBufferDesc`.
---
 lgc/builder/DescBuilder.cpp                   |  10 +-
 lgc/include/lgc/patch/PatchBufferOp.h         |   2 +
 lgc/patch/PatchBufferOp.cpp                   | 164 ++++++++++--------
 lgc/test/Transforms/PatchBufferOp/simple.lgc  | 150 +++++++++-------
 .../Transforms/PatchBufferOp/uniform-phi.lgc  |  25 +--
 5 files changed, 206 insertions(+), 145 deletions(-)

diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp
index 6e96c12817..eacc62a788 100644
--- a/lgc/builder/DescBuilder.cpp
+++ b/lgc/builder/DescBuilder.cpp
@@ -188,8 +188,14 @@ Value *BuilderImpl::CreateBufferDesc(uint64_t descSet, unsigned binding, Value *
       desc = CreateInsertElement(desc, CreateAnd(desc1, getInt32(0xc000ffff)), 1);
       desc = CreateInsertElement(desc, CreateMul(stride, desc2), 2);
       // gfx10 and gfx11 have oob fields with 2 bits in dword3[29:28] here force to set to 3 as OOB_COMPLETE mode.
- if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 10) - desc = CreateInsertElement(desc, CreateOr(desc3, getInt32(0x30000000)), 3); + if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 10) { + Value *newDesc3 = CreateOr(desc3, getInt32(0x30000000)); + if (getPipelineState()->getOptions().allowNullDescriptor) { + Value *isNullDesc = CreateICmpEQ(desc3, getInt32(0)); + newDesc3 = CreateSelect(isNullDesc, desc3, newDesc3); + } + desc = CreateInsertElement(desc, newDesc3, 3); + } } } diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/PatchBufferOp.h index 3f9345425c..ef80b98773 100644 --- a/lgc/include/lgc/patch/PatchBufferOp.h +++ b/lgc/include/lgc/patch/PatchBufferOp.h @@ -118,6 +118,8 @@ class BufferOpLowering { llvm::Value *replaceLoadStore(llvm::Instruction &inst); llvm::Instruction *makeLoop(llvm::Value *const loopStart, llvm::Value *const loopEnd, llvm::Value *const loopStride, llvm::Instruction *const insertPos); + Value *createGlobalPointerAccess(llvm::Value *bufferDesc, llvm::Value *offset, llvm::Type *type, + llvm::Instruction &inst, const llvm::function_ref &callback); TypeLowering &m_typeLowering; llvm::IRBuilder<> m_builder; diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index 4a9d3bab0a..1580fa82b2 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -353,34 +353,27 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn // If our buffer descriptor is divergent, need to handle it differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. 
- Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. - Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); - const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); - - Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); - Value *const newValue = atomicCmpXchgInst.getNewValOperand(); - AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( - atomicPointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); - newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); - newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); - newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); - copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + auto createAtomicCmpXchgFunc = [&](Value *pointer) { + const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); + const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); + + Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); + Value *const newValue = atomicCmpXchgInst.getNewValOperand(); + AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( + pointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); + newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); + newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); + newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); + copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + return newAtomicCmpXchg; + }; + Value *result = + 
createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicCmpXchgInst); - atomicCmpXchgInst.replaceAllUsesWith(newAtomicCmpXchg); + atomicCmpXchgInst.replaceAllUsesWith(result); } else { switch (atomicCmpXchgInst.getSuccessOrdering()) { case AtomicOrdering::Release: @@ -459,29 +452,21 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) { // If our buffer descriptor is divergent, need to handle it differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. - Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. 
- Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - AtomicRMWInst *const newAtomicRmw = - m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), atomicPointer, atomicRmwInst.getValOperand(), - atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); - newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); - newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); - copyMetadata(newAtomicRmw, &atomicRmwInst); + auto createAtomicRmwFunc = [&](Value *pointer) { + AtomicRMWInst *const newAtomicRmw = + m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), pointer, atomicRmwInst.getValOperand(), + atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); + newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); + newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); + copyMetadata(newAtomicRmw, &atomicRmwInst); + return newAtomicRmw; + }; + Value *result = createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicRmwInst, createAtomicRmwFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicRmwInst); - atomicRmwInst.replaceAllUsesWith(newAtomicRmw); + atomicRmwInst.replaceAllUsesWith(result); } else { switch (atomicRmwInst.getOrdering()) { case AtomicOrdering::Release: @@ -1292,36 +1277,28 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { // If our buffer descriptor is divergent, need to handle that differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. 
- Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. - Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); - - if (isLoad) { - LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile()); - newLoad->setOrdering(ordering); - newLoad->setSyncScopeID(syncScopeID); - copyMetadata(newLoad, loadInst); - - if (isInvariant) - newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); - - return newLoad; - } - StoreInst *const newStore = - m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile()); - newStore->setOrdering(ordering); - newStore->setSyncScopeID(syncScopeID); - copyMetadata(newStore, storeInst); - - return newStore; + auto createLoadStoreFunc = [&](Value *pointer) { + Value *result = nullptr; + if (isLoad) { + LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile()); + newLoad->setOrdering(ordering); + newLoad->setSyncScopeID(syncScopeID); + copyMetadata(newLoad, loadInst); + + if (isInvariant) + newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {})); + result = newLoad; + } else { + StoreInst *const newStore = + m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile()); + newStore->setOrdering(ordering); + newStore->setSyncScopeID(syncScopeID); + copyMetadata(newStore, storeInst); + result = newStore; + } + return result; + }; + return createGlobalPointerAccess(bufferDesc, baseIndex, type, inst, createLoadStoreFunc); } switch (ordering) { @@ -1571,3 +1548,44 @@ 
Instruction *BufferOpLowering::makeLoop(Value *const loopStart, Value *const loo return loopCounter; } + +// ===================================================================================================================== +// Create global pointer access. +// +// @param bufferDesc: The buffer descriptor +// @param offset: The offset on the global memory +// @param type: The accessed data type +// @param inst: The instruction to be executed on the buffer +// @param callback: The callback function to perform the specific global access +Value *BufferOpLowering::createGlobalPointerAccess(Value *bufferDesc, Value *offset, Type *type, Instruction &inst, + const function_ref &callback) { + // Handle null descriptor if it is allowed. Load/atomic zero and skip store for null descriptor. + Value *isNullDesc = m_builder.getFalse(); + if (m_pipelineState.getOptions().allowNullDescriptor) { + // Check dword3 against 0 for a null descriptor + Value *descWord3 = m_builder.CreateExtractElement(bufferDesc, 3); + isNullDesc = m_builder.CreateICmpEQ(descWord3, m_builder.getInt32(0)); + } + BasicBlock *const origBlock = inst.getParent(); + Instruction *const terminator = SplitBlockAndInsertIfThen(isNullDesc, &inst, false); + + // Global pointer access + m_builder.SetInsertPoint(terminator); + Value *baseAddr = getBaseAddressFromBufferDesc(bufferDesc); + // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. + Value *bound = m_builder.CreateExtractElement(bufferDesc, 2); + Value *inBound = m_builder.CreateICmpULT(offset, bound); + Value *newOffset = m_builder.CreateSelect(inBound, offset, m_builder.getInt32(0)); + // Add on the index to the address. 
+ Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset); + pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); + Value *newValue = callback(pointer); + + m_builder.SetInsertPoint(&inst); + assert(!type->isVoidTy()); + auto phi = m_builder.CreatePHI(type, 2, "newValue"); + phi->addIncoming(Constant::getNullValue(type), origBlock); + phi->addIncoming(newValue, terminator->getParent()); + + return phi; +} diff --git a/lgc/test/Transforms/PatchBufferOp/simple.lgc b/lgc/test/Transforms/PatchBufferOp/simple.lgc index 35c84beccd..618abc5143 100644 --- a/lgc/test/Transforms/PatchBufferOp/simple.lgc +++ b/lgc/test/Transforms/PatchBufferOp/simple.lgc @@ -29,16 +29,21 @@ define amdgpu_gfx float @uniform_select(<4 x i32> inreg %desc0, <4 x i32> inreg define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inreg %desc1, i1 %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -50,16 +55,21 @@ define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inre define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select1( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -71,16 +81,21 @@ define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %de define amdgpu_gfx float @divergent_select2(<4 x i32> inreg %desc0, <4 x i32> %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select2( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; 
CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -131,16 +146,21 @@ define amdgpu_gfx float @divergent_input0_phi(<4 x i32> %desc0, <4 x i32> inreg ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -167,16 +187,21 @@ define amdgpu_gfx float @divergent_input1_phi(<4 x i32> inreg %desc0, <4 x i32> ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -203,16 +228,21 @@ define amdgpu_gfx float @divergent_sync_phi(<4 x i32> inreg %desc0, <4 x i32> in ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = 
extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b diff --git a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc index 9a1bdcdb55..3a7a3ee37e 100644 --- a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc +++ b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc @@ -16,16 +16,21 @@ define amdgpu_gfx float @uniform_phi(<4 x i32> inreg %desc0, <4 x i32> inreg %de ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: br i1 false, label [[TMP1:%.*]], label [[TMP11:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> 
[[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 0, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP5]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP1]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b