diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index 8365ffa169..db665dd386 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -246,6 +246,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -888,7 +889,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
 
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;
diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
index ec99c51e21..f3501d6c44 100644
--- a/lgc/builder/BuilderImpl.cpp
+++ b/lgc/builder/BuilderImpl.cpp
@@ -33,6 +33,7 @@
 #include "lgc/LgcDialect.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
@@ -329,6 +330,117 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 }
 
 #if defined(LLVM_HAVE_BRANCH_AMD_GFX)
+// A simple, memory-efficient container that holds up to 64 instructions in a bit vector. It needs two helper data
+// structures: 1. instrToIndex, which maps an instruction to its index in the bit vector, and 2. indexToInstr, which
+// maps an index back to an instruction.
+class TinyInstructionSet {
+public:
+  using IndexToInstructionVec = SmallVector<Instruction *>;
+  using InstrToIndexMap = DenseMap<Instruction *, unsigned>;
+
+private:
+  BitVector bits;
+
+public:
+  TinyInstructionSet(unsigned size) { bits.resize(size); }
+
+  class iterator {
+    BitVector::const_set_bits_iterator it;
+    const SmallVector<Instruction *> &indexToInstr;
+
+  public:
+    iterator(BitVector::const_set_bits_iterator it, const IndexToInstructionVec &indexToInstr)
+        : it(it), indexToInstr(indexToInstr) {}
+    iterator &operator++() {
+      ++it;
+      return *this;
+    }
+
+    Instruction *operator*() {
+      unsigned index = *it;
+      assert(index < indexToInstr.size() && "Index out of range.");
+      return indexToInstr[index];
+    }
+
+    bool operator!=(const iterator &otherIt) {
+      assert(&otherIt.indexToInstr == &indexToInstr && "Iterators of different objects.");
+      return otherIt.it != it;
+    }
+  };
+
+  iterator begin(const IndexToInstructionVec &indexToInstr) const {
+    return iterator(bits.set_bits_begin(), indexToInstr);
+  }
+
+  iterator end(const IndexToInstructionVec &indexToInstr) const { return iterator(bits.set_bits_end(), indexToInstr); }
+
+  void insert(Instruction *instr, const InstrToIndexMap &instrToIndex) {
+    auto it = instrToIndex.find(instr);
+    assert(it != instrToIndex.end() && "Expected to find instr in instrToIndex.");
+    unsigned index = it->second;
+    bits.set(index);
+  }
+
+  unsigned size() const { return bits.size(); }
+
+  bool empty() const { return bits.empty(); }
+};
+
+class TraceNonUniformIndex {
+  // Maps the instruction to its index in the bit vector.
+  TinyInstructionSet::InstrToIndexMap instrToIndex;
+  // The instructions used as keys in instrToIndex, in program order. It is used to map an index back to an instruction.
+  TinyInstructionSet::IndexToInstructionVec indexToInstr;
+  // Maps an instruction to its dependencies.
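+  // For illustration only (hypothetical IR names, not from this patch): if the descriptor operand is produced by
+  //   %gep  = getelementptr i8, ptr %base, i32 %off
+  //   %desc = load <8 x i32>, ptr %gep
+  // then after tracing %desc back to the non-uniform index %off, instrDeps[%off] holds {%gep, %desc}, i.e. the
+  // instructions that have to be cloned into the waterfall loop.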
+  DenseMap<Value *, TinyInstructionSet> instrDeps;
+  bool scalarizeDescriptorLoads;
+  unsigned upperLimit;
+  void insertNewValueInInstrDeps(Value *, Instruction *);
+
+public:
+  TraceNonUniformIndex(Instruction *nonUniformInst, bool scalarizeDescriptorLoads = false, unsigned upperLimit = 64)
+      : scalarizeDescriptorLoads(scalarizeDescriptorLoads), upperLimit(upperLimit) {
+    // Initialization of instrToIndex and indexToInstr.
+    if (scalarizeDescriptorLoads) {
+      unsigned cnt = 0;
+      for (Instruction *I = nonUniformInst->getPrevNode(); I != nullptr && cnt < upperLimit;
+           I = I->getPrevNode(), ++cnt) {
+        indexToInstr.push_back(I);
+        instrToIndex[I] = cnt;
+      }
+    }
+  }
+
+  Value *run(Value *);
+
+  const DenseMap<Value *, TinyInstructionSet> &getInstrDeps() const { return instrDeps; }
+
+  const TinyInstructionSet::IndexToInstructionVec &getIndexToInstr() const { return indexToInstr; }
+
+  bool foundDependencies() const { return scalarizeDescriptorLoads; }
+};
+
+// Adds newValue to the instrDeps map. The dependencies of newValue are currentVisitedInstr and its dependencies.
+// @param newValue : the new value to be added to the instrDeps map
+// @param currentVisitedInstr : the instruction from which we copy the dependencies for newValue
+void TraceNonUniformIndex::insertNewValueInInstrDeps(Value *newValue, Instruction *currentVisitedInstr) {
+  if (!instrToIndex.contains(currentVisitedInstr)) {
+    // The instruction is either outside the 64-instruction limit or in a different basic block, so we bail out of
+    // scalarization.
+    scalarizeDescriptorLoads = false;
+    return;
+  }
+  assert(instrDeps.contains(currentVisitedInstr) && "The current visited instruction should have been in the map.");
+  auto it1 = instrDeps.try_emplace(newValue, upperLimit).first;
+  auto &setOfInstrs = it1->second;
+  auto it2 = instrDeps.find(currentVisitedInstr);
+  const auto &set = it2->second;
+  for (auto it3 = set.begin(indexToInstr), ite = set.end(indexToInstr); it3 != ite; ++it3) {
+    auto *instr = *it3;
+    setOfInstrs.insert(instr, instrToIndex);
+  }
+  setOfInstrs.insert(currentVisitedInstr, instrToIndex);
+}
+
 // =====================================================================================================================
 // For a non-uniform input, try and trace back through a descriptor load to
 // find the non-uniform index used in it. If that fails, we just use the
@@ -339,11 +451,15 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 // This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
 // the common case where a base pointer is assembled from separate high and low halves.
 //
+// In case of scalarization, it fills the instrDeps map by using insertNewValueInInstrDeps().
+//
 // @param nonUniformVal : Value representing non-uniform descriptor
 // @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform
-static Value *traceNonUniformIndex(Value *nonUniformVal) {
+Value *TraceNonUniformIndex::run(Value *nonUniformVal) {
   auto load = dyn_cast<LoadInst>(nonUniformVal);
-  if (!load) {
+  if (scalarizeDescriptorLoads && load) {
+    instrDeps.try_emplace(load, upperLimit);
+  } else if (!load) {
     // Workarounds that modify image descriptor can be peeped through, i.e.
     // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16
     // %rawElement = extractelement <8 x i32> %baseValue, i64 6
@@ -353,6 +469,9 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     if (!insert)
       return nonUniformVal;
 
+    if (scalarizeDescriptorLoads)
+      instrDeps.try_emplace(insert, upperLimit);
+
     load = dyn_cast<LoadInst>(insert->getOperand(0));
     if (!load)
       return nonUniformVal;
@@ -360,9 +479,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // We found the load, but must verify the chain.
     // Consider updatedElement as a generic instruction or constant.
     if (auto updatedElement = dyn_cast<Instruction>(insert->getOperand(1))) {
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(updatedElement, insert);
       for (Value *operand : updatedElement->operands()) {
         if (auto extract = dyn_cast<ExtractElementInst>(operand)) {
           // Only dynamic value must be ExtractElementInst based on load.
+          if (scalarizeDescriptorLoads) {
+            insertNewValueInInstrDeps(extract, updatedElement);
+            insertNewValueInInstrDeps(load, extract);
+          }
           if (dyn_cast<LoadInst>(extract->getOperand(0)) != load)
             return nonUniformVal;
         } else if (!isa<Constant>(operand)) {
@@ -386,11 +511,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
   SmallVector<Instruction *, 2> nonUniforms;
   nonUniforms.push_back(load);
 
-  auto propagate = [&](Value *value) -> bool {
+  auto propagate = [&](Value *value, Instruction *current) {
     if (auto inst = dyn_cast<Instruction>(value)) {
       if (nonUniforms.size() >= 2)
         return false;
       nonUniforms.push_back(inst);
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(inst, current);
       return true;
     }
     return isa<Constant>(value);
@@ -410,13 +537,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // See if we can propagate the search further.
     if (current->isCast() || current->isUnaryOp()) {
-      if (!propagate(current->getOperand(0)))
+      if (!propagate(current->getOperand(0), current))
         return nonUniformVal;
       continue;
     }
 
     if (current->isBinaryOp()) {
-      if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1)))
+      if (!propagate(current->getOperand(0), current) || !propagate(current->getOperand(1), current))
         return nonUniformVal;
       continue;
     }
@@ -427,14 +554,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
       if (as == ADDR_SPACE_FLAT || as == ADDR_SPACE_PRIVATE)
         return nonUniformVal; // load is a source of divergence, can't propagate
-      if (!propagate(ptr))
+      if (!propagate(ptr, current))
         return nonUniformVal;
       continue;
     }
 
     if (auto gep = dyn_cast<GetElementPtrInst>(current)) {
       if (gep->hasAllConstantIndices()) {
-        if (!propagate(gep->getPointerOperand()))
+
+        if (!propagate(gep->getPointerOperand(), current))
           return nonUniformVal;
         continue;
       }
@@ -443,33 +571,37 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
       if (candidateIndex || gep->getNumIndices() != 1)
         return nonUniformVal;
 
-      if (!propagate(gep->getPointerOperand()))
+      if (!propagate(gep->getPointerOperand(), current))
        return nonUniformVal;
 
       candidateIndex = *gep->idx_begin();
       if (getSize(candidateIndex) > nonUniformValSize)
         return nonUniformVal; // propagating further is worthless
+
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(candidateIndex, current);
+
       continue;
     }
 
     if (auto extract = dyn_cast<ExtractValueInst>(current)) {
-      if (!propagate(extract->getAggregateOperand()))
+      if (!propagate(extract->getAggregateOperand(), current))
        return nonUniformVal;
       continue;
     }
 
     if (auto insert = dyn_cast<InsertValueInst>(current)) {
-      if (!propagate(insert->getAggregateOperand()) || !propagate(insert->getInsertedValueOperand()))
+      if (!propagate(insert->getAggregateOperand(), current) ||
+          !propagate(insert->getInsertedValueOperand(), current))
         return nonUniformVal;
       continue;
     }
 
     if (auto extract = dyn_cast<ExtractElementInst>(current)) {
-      if (!isa<Constant>(extract->getIndexOperand()) || !propagate(extract->getVectorOperand()))
+      if (!isa<Constant>(extract->getIndexOperand()) || !propagate(extract->getVectorOperand(), current))
         return nonUniformVal;
       continue;
     }
 
     if (auto insert = dyn_cast<InsertElementInst>(current)) {
-      if (!isa<Constant>(insert->getOperand(2)) || !propagate(insert->getOperand(0)) ||
-          !propagate(insert->getOperand(1)))
+      if (!isa<Constant>(insert->getOperand(2)) || !propagate(insert->getOperand(0), current) ||
+          !propagate(insert->getOperand(1), current))
         return nonUniformVal;
       continue;
     }
@@ -532,6 +664,12 @@ static bool instructionsEqual(Instruction *lhs, Instruction *rhs) {
 // Create a waterfall loop containing the specified instruction.
 // This does not use the current insert point; new code is inserted before and after nonUniformInst.
 //
+// For scalarization we need to collect all the instructions that need to be moved inside the loop. This is done by
+// traceNonUniformIndex(), which traverses the use-def predecessors of nonUniformInst and at the same time adds these
+// instructions to the instrDeps map. Once traceNonUniformIndex() completes, we use the returned value as a key into
+// the instrDeps map to get the dependencies. These dependencies are the instructions that will be cloned and moved
+// inside the waterfall loop.
+//
 // @param nonUniformInst : The instruction to put in a waterfall loop
 // @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
 // @param scalarizeDescriptorLoads : Attempt to scalarize descriptor loads
@@ -548,24 +686,40 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   assert(operandIdxs.empty() == false);
 
   SmallVector<Value *, 2> nonUniformIndices;
+  // Maps the nonUniformIndex returned by traceNonUniformIndex() to the image call operand it was traced from and
+  // that operand's index.
+  DenseMap<Value *, std::pair<Value *, unsigned>> nonUniformIndexImageCallOperand;
+  TraceNonUniformIndex traceNonUniformIndex(nonUniformInst, scalarizeDescriptorLoads, 64);
+
   for (unsigned operandIdx : operandIdxs) {
-    Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx));
-    if (nonUniformIndex)
+    Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx);
+    Value *nonUniformIndex = traceNonUniformIndex.run(nonUniformImageCallOperand);
+    scalarizeDescriptorLoads = traceNonUniformIndex.foundDependencies();
+    if (nonUniformIndex) {
       nonUniformIndices.push_back(nonUniformIndex);
+
+      if (scalarizeDescriptorLoads)
+        nonUniformIndexImageCallOperand[nonUniformIndex] = std::make_pair(nonUniformImageCallOperand, operandIdx);
+    }
   }
+
   if (nonUniformIndices.empty())
     return nonUniformInst;
 
-  // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the
-  // waterfall loop.
+  // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the waterfall loop.
+  // At this point the entries of nonUniformIndices may be replaced, but we still need the original non-uniform values
+  // for the scalarization of the descriptor loads.
+  DenseMap<Value *, Value *> newOrigNonUniformVal;
   for (Value *&nonUniformVal : nonUniformIndices) {
     if (nonUniformVal->getType()->isIntegerTy(64)) {
       auto sExt = dyn_cast<SExtInst>(nonUniformVal);
+      Value *origNonUniformVal = nonUniformVal;
       // 64-bit index may already be formed from extension of 32-bit value.
       if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) {
         nonUniformVal = sExt->getOperand(0);
+        newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
       } else {
         nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty());
+        newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
       }
     }
   }
@@ -601,36 +755,84 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   Value *waterfallBegin;
   if (scalarizeDescriptorLoads) {
-    // Attempt to scalarize descriptor loads.
-    assert(firstIndexInst);
-    CallInst *firstCallInst = dyn_cast<CallInst>(firstIndexInst);
-    if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) {
-      // Descriptor loads are already inside a waterfall.
-      waterfallBegin = firstCallInst->getArgOperand(0);
-    } else {
-      // Begin waterfall loop just after shared index is computed.
-      // This places all dependent instructions within the waterfall loop, including descriptor loads.
-      auto descTy = firstIndexInst->getType();
-      SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false));
-      waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst},
-                                       nullptr, instName);
-
-      // Scalarize shared index.
-      Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
-                                    {waterfallBegin, firstIndexInst}, nullptr, instName);
+    SetInsertPoint(nonUniformInst);
+    auto descTy = firstIndexInst->getType();
+    // Create waterfall.begin and waterfall.readfirstlane intrinsics.
+    waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
+    waterfallBegin =
+        CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+    // Scalarize shared index.
+    Value *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
+                                           {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+    for (auto *nonUniformVal : nonUniformIndices) {
+      // Get the first non-uniform instruction of the chain.
+      auto it1 = newOrigNonUniformVal.find(nonUniformVal);
+      Value *origNonUniformVal = nonUniformVal;
+      if (it1 != newOrigNonUniformVal.end())
+        origNonUniformVal = it1->second;
+
+      auto [nonUniformImageCallOperand, operandIdx] = nonUniformIndexImageCallOperand[origNonUniformVal];
+
+      if (origNonUniformVal == nonUniformImageCallOperand)
+        continue;
+
+      // Get the instruction chain of the first non-uniform instruction.
+      const DenseMap<Value *, TinyInstructionSet> &instrDeps = traceNonUniformIndex.getInstrDeps();
+      auto it2 = instrDeps.find(origNonUniformVal);
+      assert(it2 != instrDeps.end() && "The non-uniform index should be in instrDep map.");
+      auto &instrsToClone = it2->second;
+      assert(!instrsToClone.empty() && "There are not any instructions to clone.");
+
+      // Clone and emit the instructions that we want to push inside the waterfall loop.
+      std::map<Instruction *, Instruction *> origClonedValuesMap;
+      Instruction *prevInst = nonUniformInst;
+      const TinyInstructionSet::IndexToInstructionVec &indexToInstr = traceNonUniformIndex.getIndexToInstr();
+      for (auto it3 = instrsToClone.begin(indexToInstr), ite = instrsToClone.end(indexToInstr); it3 != ite; ++it3) {
+        auto *origInst = *it3;
+        auto *newInst = origInst->clone();
+        newInst->insertBefore(prevInst);
+        origClonedValuesMap[origInst] = newInst;
+        prevInst = newInst;
+        // Update the non-uniform operand of the image call with the new non-uniform operand.
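+        // When nonUniformInst has a void result (e.g. an image store), there is no return value to route through
+        // waterfall.end, so the cloned operand is wrapped in waterfall.last.use instead to tie it to this loop.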
+        if (nonUniformImageCallOperand == origInst) {
+          if (nonUniformInst->getType()->isVoidTy())
+            newInst = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, newInst->getType(),
+                                      {waterfallBegin, newInst}, nullptr, instName);
+          nonUniformInst->setOperand(operandIdx, newInst);
+        }
+      }
+      // Finally, clone the first non-uniform instruction.
+      auto *origInst = cast<Instruction>(origNonUniformVal);
+      auto *newInst = origInst->clone();
+      newInst->insertBefore(prevInst);
+      origClonedValuesMap[origInst] = newInst;
+
+      // Update the operands of the cloned instructions.
+      for (auto [origInst, newInst] : origClonedValuesMap) {
+        for (Use &use : newInst->operands()) {
+          Value *op = use.get();
+          if (auto *opI = dyn_cast<Instruction>(op)) {
+            auto it = origClonedValuesMap.find(opI);
+            if (it == origClonedValuesMap.end())
+              continue;
+            Instruction *clonedI = it->second;
+            use.set(clonedI);
+          }
+        }
+      }
 
       // Replace all references to shared index within the waterfall loop with scalarized index.
       // (Note: this includes the non-uniform instruction itself.)
       // Loads using scalarized index will become scalar loads.
-      for (Value *otherNonUniformVal : nonUniformIndices) {
-        otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) {
-          Instruction *userInst = cast<Instruction>(U.getUser());
-          return U.getUser() != waterfallBegin && U.getUser() != desc &&
-                 userInst->getParent() == nonUniformInst->getParent() &&
-                 (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst));
-        });
-      }
+      nonUniformVal->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) {
+        Instruction *userInst = cast<Instruction>(U.getUser());
+        return userInst != waterfallBegin && userInst != readFirstLane &&
+               userInst->getParent() == nonUniformInst->getParent() &&
+               (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) &&
+               !userInst->comesBefore(cast<Instruction>(waterfallBegin));
+      });
     }
   } else {
     // Insert new code just before nonUniformInst.
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index e71c97e687..c9fb480c0d 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -618,10 +618,9 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh
   if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) {
     shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads;
   } else {
-    shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads;
-    // Enable waterfall load scalarization when vgpr limit is set.
- if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..74975a3767 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,24 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> 
%[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..ff090feb37 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,56 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) 
+// SHADERTEST-GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag 
b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..8e1893653d 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,139 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 
268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[readfirstlane3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and3:[0-9]+]] = and i32 %[[extract3]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp3:[0-9]+]] = icmp slt i32 %[[extract3]], 0 +// SHADERTEST-GFX-NEXT: %[[select3:[0-9]+]] = select i1 %[[cmp3]], i32 %[[extract3]], i32 %[[and3]] +// SHADERTEST-GFX-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[readfirstlane3]], i32 %[[select3]], i64 3 +// 
SHADERTEST-GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[readfirstlane3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX_10_3_0-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x 
float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// SHADERTEST-GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// SHADERTEST-GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..132f84103f 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single 
waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,96 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX: 
%[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr 
addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: 
%[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 3691e649f7..143280fc83 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if (shaderInfo->options.scalarizeWaterfallLoads.has_value()) + dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";
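Illustrative note (not part of the diff): scalarizeWaterfallLoads is now tri-state on the Vkgc interface. When the
-scalarize-waterfall-descriptor-loads command-line option is not given, the resolution in llpcPipelineContext.cpp
above behaves roughly like the sketch below; the helper name resolveScalarizeWaterfallLoads is hypothetical.

  bool resolveScalarizeWaterfallLoads(const Vkgc::PipelineShaderOptions &opts) {
    bool enable = true;                       // new default: scalarize waterfall descriptor loads
    if (opts.scalarizeWaterfallLoads.has_value())
      enable = *opts.scalarizeWaterfallLoads; // an explicit client setting overrides the default
    return enable;
  }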