From 217a8e383e126256ac16ff4264e6f8850abe3853 Mon Sep 17 00:00:00 2001
From: xuechen2
Date: Mon, 11 Dec 2023 10:30:56 +0800
Subject: [PATCH] Apply extended robust access on global buffer access

This change provides defined behavior for global buffer accesses in the case of
a null descriptor or an out-of-bounds access when the application requests the
extended `robustBufferAccess` feature.

NOTE: We use dword2 of the buffer descriptor to check for a null descriptor.
---
 lgc/include/lgc/patch/PatchBufferOp.h        |   2 +
 lgc/interface/lgc/Pipeline.h                 |   1 +
 lgc/patch/PatchBufferOp.cpp                  | 174 ++++++++++--------
 lgc/test/Transforms/PatchBufferOp/simple.lgc | 150 +++++++++------
 .../Transforms/PatchBufferOp/uniform-phi.lgc |  25 ++-
 llpc/context/llpcPipelineContext.cpp         |   1 +
 6 files changed, 210 insertions(+), 143 deletions(-)

diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/PatchBufferOp.h
index 3f9345425c..96ccfe4ffd 100644
--- a/lgc/include/lgc/patch/PatchBufferOp.h
+++ b/lgc/include/lgc/patch/PatchBufferOp.h
@@ -118,6 +118,8 @@ class BufferOpLowering {
   llvm::Value *replaceLoadStore(llvm::Instruction &inst);
   llvm::Instruction *makeLoop(llvm::Value *const loopStart, llvm::Value *const loopEnd, llvm::Value *const loopStride,
                               llvm::Instruction *const insertPos);
+  llvm::Value *createGlobalPointerAccess(llvm::Value *const bufferDesc, llvm::Value *const offset, llvm::Type *const type,
+                                         llvm::Instruction &inst, const llvm::function_ref<llvm::Value *(llvm::Value *)> callback);
 
   TypeLowering &m_typeLowering;
   llvm::IRBuilder<> m_builder;
diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h
index 6a346ac856..5d8a3036ea 100644
--- a/lgc/interface/lgc/Pipeline.h
+++ b/lgc/interface/lgc/Pipeline.h
@@ -187,6 +187,7 @@ union Options {
     unsigned rtStaticPipelineFlags; // Ray tracing static pipeline flags
     unsigned rtTriCompressMode;     // Ray tracing triangle compression mode
     bool useGpurt;                  // Whether GPURT is used
+    bool enableExtendedRobustBufferAccess; // Enable the extended robust buffer access
   };
 };
 static_assert(sizeof(Options) == sizeof(Options::u32All));
diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp
index c728e9a9ad..3d89ea6ada 100644
--- a/lgc/patch/PatchBufferOp.cpp
+++ b/lgc/patch/PatchBufferOp.cpp
@@ -353,34 +353,27 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn
   // If our buffer descriptor is divergent, need to handle it differently.
   if (getDescriptorInfo(bufferDesc).divergent.value()) {
-    Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc);
-
-    // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access.
-    Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2);
-    Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound);
-    Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0));
-
-    // Add on the index to the address.
- Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); - const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); - - Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); - Value *const newValue = atomicCmpXchgInst.getNewValOperand(); - AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( - atomicPointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); - newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); - newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); - newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); - copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + auto createAtomicCmpXchgFunc = [&](Value *pointer) { + const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering(); + const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering(); + + Value *const compareValue = atomicCmpXchgInst.getCompareOperand(); + Value *const newValue = atomicCmpXchgInst.getNewValOperand(); + AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg( + pointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering); + newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile()); + newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID()); + newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak()); + copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst); + return newAtomicCmpXchg; + }; + Value *result = + createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicCmpXchgInst); - atomicCmpXchgInst.replaceAllUsesWith(newAtomicCmpXchg); + atomicCmpXchgInst.replaceAllUsesWith(result); } else { switch (atomicCmpXchgInst.getSuccessOrdering()) { case AtomicOrdering::Release: @@ -459,29 +452,21 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) { // If our buffer descriptor is divergent, need to handle it differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. - Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. 
- Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex); - - atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL)); - - AtomicRMWInst *const newAtomicRmw = - m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), atomicPointer, atomicRmwInst.getValOperand(), - atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); - newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); - newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); - copyMetadata(newAtomicRmw, &atomicRmwInst); + auto createAtomicRmwFunc = [&](Value *pointer) { + AtomicRMWInst *const newAtomicRmw = + m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), pointer, atomicRmwInst.getValOperand(), + atomicRmwInst.getAlign(), atomicRmwInst.getOrdering()); + newAtomicRmw->setVolatile(atomicRmwInst.isVolatile()); + newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID()); + copyMetadata(newAtomicRmw, &atomicRmwInst); + return newAtomicRmw; + }; + Value *result = createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicRmwInst, createAtomicRmwFunc); // Record the atomic instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&atomicRmwInst); - atomicRmwInst.replaceAllUsesWith(newAtomicRmw); + atomicRmwInst.replaceAllUsesWith(result); } else { switch (atomicRmwInst.getOrdering()) { case AtomicOrdering::Release: @@ -1292,36 +1277,28 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) { // If our buffer descriptor is divergent, need to handle that differently. if (getDescriptorInfo(bufferDesc).divergent.value()) { - Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc); - - // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access. - Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2); - Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound); - Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0)); - - // Add on the index to the address. 
-    Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex);
-
-    pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL));
-
-    if (isLoad) {
-      LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile());
-      newLoad->setOrdering(ordering);
-      newLoad->setSyncScopeID(syncScopeID);
-      copyMetadata(newLoad, loadInst);
-
-      if (isInvariant)
-        newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
-
-      return newLoad;
-    }
-    StoreInst *const newStore =
-        m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile());
-    newStore->setOrdering(ordering);
-    newStore->setSyncScopeID(syncScopeID);
-    copyMetadata(newStore, storeInst);
-
-    return newStore;
+    auto createLoadStoreFunc = [&](Value *pointer) {
+      Value *result = nullptr;
+      if (isLoad) {
+        LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile());
+        newLoad->setOrdering(ordering);
+        newLoad->setSyncScopeID(syncScopeID);
+        copyMetadata(newLoad, loadInst);
+
+        if (isInvariant)
+          newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
+        result = newLoad;
+      } else {
+        StoreInst *const newStore =
+            m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile());
+        newStore->setOrdering(ordering);
+        newStore->setSyncScopeID(syncScopeID);
+        copyMetadata(newStore, storeInst);
+        result = newStore;
+      }
+      return result;
+    };
+    return createGlobalPointerAccess(bufferDesc, baseIndex, type, inst, createLoadStoreFunc);
   }
 
   switch (ordering) {
@@ -1572,3 +1549,54 @@ Instruction *BufferOpLowering::makeLoop(Value *const loopStart, Value *const loo
 
   return loopCounter;
 }
+
+// =====================================================================================================================
+// Create global pointer access.
+//
+// @param bufferDesc: The buffer descriptor
+// @param offset: The byte offset into the global memory
+// @param type: The accessed data type
+// @param inst: The instruction to be executed on the buffer
+// @param callback: The callback function to perform the specific global access
+Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Value *const offset, Type *const type,
+                                                   Instruction &inst, const function_ref<Value *(Value *)> callback) {
+  // The 2nd element (NUM_RECORDS) in the buffer descriptor is the byte bound.
+  Value *bound = m_builder.CreateExtractElement(bufferDesc, 2);
+  Value *inBound = m_builder.CreateICmpULT(offset, bound);
+
+  // If null descriptors are allowed or extended robust buffer access is enabled, create a branch so that the normal
+  // global access is only performed when the validity check passes.
+  Value *isValidAccess = m_builder.getTrue();
+  if (m_pipelineState.getOptions().allowNullDescriptor ||
+      m_pipelineState.getOptions().enableExtendedRobustBufferAccess) {
+    Value *isNonNullDesc = m_builder.getTrue();
+    if (m_pipelineState.getOptions().allowNullDescriptor) {
+      // Check dword2 against 0 for a null descriptor
+      isNonNullDesc = m_builder.CreateICmpNE(bound, m_builder.getInt32(0));
+    }
+    Value *isInBound = m_pipelineState.getOptions().enableExtendedRobustBufferAccess ?
inBound : m_builder.getTrue(); + isValidAccess = m_builder.CreateAnd(isNonNullDesc, isInBound); + } + + BasicBlock *const origBlock = inst.getParent(); + Instruction *const terminator = SplitBlockAndInsertIfThen(isValidAccess, &inst, false); + + // Global pointer access + m_builder.SetInsertPoint(terminator); + Value *baseAddr = getBaseAddressFromBufferDesc(bufferDesc); + // NOTE: The offset of out-of-bound overridden as 0 may causes unexpected result when the extended robustness access + // is disabled. + Value *newOffset = m_builder.CreateSelect(inBound, offset, m_builder.getInt32(0)); + // Add on the index to the address. + Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset); + pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL)); + Value *newValue = callback(pointer); + + m_builder.SetInsertPoint(&inst); + assert(!type->isVoidTy()); + auto phi = m_builder.CreatePHI(type, 2, "newValue"); + phi->addIncoming(Constant::getNullValue(type), origBlock); + phi->addIncoming(newValue, terminator->getParent()); + + return phi; +} diff --git a/lgc/test/Transforms/PatchBufferOp/simple.lgc b/lgc/test/Transforms/PatchBufferOp/simple.lgc index 35c84beccd..e172c4a40c 100644 --- a/lgc/test/Transforms/PatchBufferOp/simple.lgc +++ b/lgc/test/Transforms/PatchBufferOp/simple.lgc @@ -29,16 +29,21 @@ define amdgpu_gfx float @uniform_select(<4 x i32> inreg %desc0, <4 x i32> inreg define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inreg %desc1, i1 %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -50,16 +55,21 @@ define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inre define amdgpu_gfx float 
@divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select1( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -71,16 +81,21 @@ define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %de define amdgpu_gfx float @divergent_select2(<4 x i32> inreg %desc0, <4 x i32> %desc1, i1 inreg %sel) !lgc.shaderstage !0 { ; CHECK-LABEL: @divergent_select2( ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; 
CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0) %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1) @@ -131,16 +146,21 @@ define amdgpu_gfx float @divergent_input0_phi(<4 x i32> %desc0, <4 x i32> inreg ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -167,16 +187,21 @@ define amdgpu_gfx float @divergent_input1_phi(<4 x i32> inreg %desc0, <4 x i32> ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 
x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b @@ -203,16 +228,21 @@ define amdgpu_gfx float @divergent_sync_phi(<4 x i32> inreg %desc0, <4 x i32> in ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b diff --git a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc index 9a1bdcdb55..9bdd634dfd 100644 --- a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc +++ b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc @@ -16,16 +16,21 @@ define amdgpu_gfx float @uniform_phi(<4 x i32> inreg %desc0, <4 x i32> inreg %de ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 
[[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4 -; CHECK-NEXT: ret float [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]] +; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4 +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: ret float [[NEWVALUE]] ; br i1 %sel, label %a, label %b diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index e71c97e687..f808c4d294 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -328,6 +328,7 @@ Options PipelineContext::computePipelineOptions() const { } options.allowNullDescriptor = getPipelineOptions()->extendedRobustness.nullDescriptor; + options.enableExtendedRobustBufferAccess = getPipelineOptions()->extendedRobustness.robustBufferAccess; options.disableImageResourceCheck = getPipelineOptions()->disableImageResourceCheck; options.optimizeTessFactor = getPipelineOptions()->optimizeTessFactor; options.enableInterpModePatch = getPipelineOptions()->enableInterpModePatch;
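
For reviewers, here is a minimal standalone C++ sketch of the behavior that createGlobalPointerAccess
gives a divergent-descriptor access once allowNullDescriptor and/or enableExtendedRobustBufferAccess
are set. It mirrors the branch-plus-phi structure emitted by the patch; the RobustOptions struct and the
loadGlobalDword/baseAddressFromDesc helpers are illustrative stand-ins, not code from this change.

#include <cstdint>

// Illustrative stand-ins only; these are not types or helpers from the patch.
struct RobustOptions {
  bool allowNullDescriptor;
  bool enableExtendedRobustBufferAccess;
};
uint64_t baseAddressFromDesc(const uint32_t desc[4]);             // assumed: base address from dword0/dword1
uint32_t loadGlobalDword(uint64_t baseAddr, uint32_t byteOffset); // assumed: raw global memory load

uint32_t robustBufferLoadDword(const uint32_t desc[4], uint32_t offset, const RobustOptions &opts) {
  const uint32_t numRecords = desc[2]; // dword2: byte bound (NUM_RECORDS), also used for the null check
  bool isValidAccess = true;
  if (opts.allowNullDescriptor || opts.enableExtendedRobustBufferAccess) {
    const bool isNonNullDesc = !opts.allowNullDescriptor || numRecords != 0;
    const bool isInBound = !opts.enableExtendedRobustBufferAccess || offset < numRecords;
    isValidAccess = isNonNullDesc && isInBound;
  }
  if (!isValidAccess)
    return 0; // corresponds to the zero incoming value of the phi in the generated IR

  // The out-of-bounds offset is still clamped to 0, as the pre-existing lowering already did.
  const uint32_t clampedOffset = offset < numRecords ? offset : 0;
  return loadGlobalDword(baseAddressFromDesc(desc), clampedOffset);
}

With the options enabled, an out-of-bounds or null-descriptor access therefore reads back zero (and
stores/atomics are simply skipped) instead of clamping the offset to 0 and touching the first bytes of
the buffer, which is what the pre-existing lowering did.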