diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index 8365ffa169..db665dd386 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -246,6 +246,7 @@ struct optional_bool : private std::optional { using std::optional::has_value; using std::optional::value; using std::optional::value_or; + using std::optional::operator*; }; /// Enumerates result codes of LLPC operations. @@ -888,7 +889,7 @@ struct PipelineShaderOptions { unsigned ldsSpillLimitDwords; /// Attempt to scalarize waterfall descriptor loads. - bool scalarizeWaterfallLoads; + optional_bool scalarizeWaterfallLoads; /// Force rearranges threadId within group into blocks of 8*8 or 8*4 bool overrideForceThreadIdSwizzling; diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index ec99c51e21..f3501d6c44 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -33,6 +33,7 @@ #include "lgc/LgcDialect.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -329,6 +330,117 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine & } #if defined(LLVM_HAVE_BRANCH_AMD_GFX) +// A simple memory efficient container that holds up to 64 instructions in a bit vector. It needs two helper data +// structures: 1. instrToIndex that maps an instruction to its index in the bit vector and 2. indexToInstr that maps an +// index back to an instruction. 
+class TinyInstructionSet { +public: + using IndexToInstructionVec = SmallVector; + using InstrToIndexMap = DenseMap; + +private: + BitVector bits; + +public: + TinyInstructionSet(unsigned size) { bits.resize(size); } + + class iterator { + BitVector::const_set_bits_iterator it; + const SmallVector &indexToInstr; + + public: + iterator(BitVector::const_set_bits_iterator it, const IndexToInstructionVec &indexToInstr) + : it(it), indexToInstr(indexToInstr) {} + iterator &operator++() { + ++it; + return *this; + } + + Instruction *operator*() { + unsigned index = *it; + assert(index < indexToInstr.size() && "Index out of range."); + return indexToInstr[index]; + } + + bool operator!=(const iterator &otherIt) { + assert(&otherIt.indexToInstr == &indexToInstr && "Iterators of different objects."); + return otherIt.it != it; + } + }; + + iterator begin(const IndexToInstructionVec &indexToInstr) const { + return iterator(bits.set_bits_begin(), indexToInstr); + } + + iterator end(const IndexToInstructionVec &indexToInstr) const { return iterator(bits.set_bits_end(), indexToInstr); } + + void insert(Instruction *instr, const InstrToIndexMap &instrToIndex) { + auto it = instrToIndex.find(instr); + assert(it != instrToIndex.end() && "Expected to find instr in instrToIndex."); + unsigned index = it->second; + bits.set(index); + } + + unsigned size() const { return bits.size(); } + + bool empty() const { return bits.empty(); } +}; + +class TraceNonUniformIndex { + // Maps the instruction to its index in the bit vector. + TinyInstructionSet::InstrToIndexMap instrToIndex; + // The instructions used as keys in instrToIndex in program order. It is used to map an index to an instruction. + TinyInstructionSet::IndexToInstructionVec indexToInstr; + // Maps an instruction to its dependencies. 
+ DenseMap instrDeps; + bool scalarizeDescriptorLoads; + unsigned upperLimit; + void insertNewValueInInstrDeps(Value *, Instruction *); + +public: + TraceNonUniformIndex(Instruction *nonUniformInst, bool scalarizeDescriptorLoads = false, unsigned upperLimit = 64) + : scalarizeDescriptorLoads(scalarizeDescriptorLoads), upperLimit(upperLimit) { + // Initialization of instrToIndex and indexToInstr. + if (scalarizeDescriptorLoads) { + unsigned cnt = 0; + for (Instruction *I = nonUniformInst->getPrevNode(); I != nullptr && cnt < upperLimit; + I = I->getPrevNode(), ++cnt) { + indexToInstr.push_back(I); + instrToIndex[I] = cnt; + } + } + } + + Value *run(Value *); + + const DenseMap &getInstrDeps() const { return instrDeps; } + + const TinyInstructionSet::IndexToInstructionVec &getIndexToInstr() const { return indexToInstr; } + + bool foundDependencies() const { return scalarizeDescriptorLoads; } +}; + +// Adds newValue in instrDeps map. The dependencies of the newValue are the currentVisitedInstr and its dependencies. +// @param newValue : the new value to be added in instrDeps map +// @param currentVisitedInstr : the value from where we copy the dependencies for newValue +void TraceNonUniformIndex::insertNewValueInInstrDeps(Value *newValue, Instruction *currentVisitedInstr) { + if (!instrToIndex.contains(currentVisitedInstr)) { + // The instruction is either outside of 64 limit or in a different basic block. So, we bail-out scalarization. 
+ scalarizeDescriptorLoads = false; + return; + } + assert(instrDeps.contains(currentVisitedInstr) && "The current visited instruction should have been in the map."); + auto it1 = instrDeps.try_emplace(newValue, upperLimit).first; + auto &setOfInstrs = it1->second; + auto it2 = instrDeps.find(currentVisitedInstr); + const auto &set = it2->second; + for (auto it3 = set.begin(indexToInstr), ite = set.end(indexToInstr); it3 != ite; ++it3) { + auto *instr = *it3; + setOfInstrs.insert(instr, instrToIndex); + } + setOfInstrs.insert(currentVisitedInstr, instrToIndex); +} + // ===================================================================================================================== // For a non-uniform input, try and trace back through a descriptor load to // find the non-uniform index used in it. If that fails, we just use the @@ -339,11 +451,15 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine & // This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle // the common case where a base pointer is assembled from separate high and low halves. // +// In case of scalarization, it fills the instrDeps map by using insertNewValueInInstrDeps(). +// // @param nonUniformVal : Value representing non-uniform descriptor // @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform -static Value *traceNonUniformIndex(Value *nonUniformVal) { +Value *TraceNonUniformIndex::run(Value *nonUniformVal) { auto load = dyn_cast(nonUniformVal); - if (!load) { + if (scalarizeDescriptorLoads && load) { + instrDeps.try_emplace(load, upperLimit); + } else if (!load) { // Workarounds that modify image descriptor can be peeped through, i.e. 
// %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16 // %rawElement = extractelement <8 x i32> %baseValue, i64 6 @@ -353,6 +469,9 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (!insert) return nonUniformVal; + if (scalarizeDescriptorLoads) + instrDeps.try_emplace(insert, upperLimit); + load = dyn_cast(insert->getOperand(0)); if (!load) return nonUniformVal; @@ -360,9 +479,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { // We found the load, but must verify the chain. // Consider updatedElement as a generic instruction or constant. if (auto updatedElement = dyn_cast(insert->getOperand(1))) { + if (scalarizeDescriptorLoads) + insertNewValueInInstrDeps(updatedElement, insert); for (Value *operand : updatedElement->operands()) { if (auto extract = dyn_cast(operand)) { // Only dynamic value must be ExtractElementInst based on load. + if (scalarizeDescriptorLoads) { + insertNewValueInInstrDeps(extract, updatedElement); + insertNewValueInInstrDeps(load, extract); + } if (dyn_cast(extract->getOperand(0)) != load) return nonUniformVal; } else if (!isa(operand)) { @@ -386,11 +511,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { SmallVector nonUniforms; nonUniforms.push_back(load); - auto propagate = [&](Value *value) -> bool { + auto propagate = [&](Value *value, Instruction *current) { if (auto inst = dyn_cast(value)) { if (nonUniforms.size() >= 2) return false; nonUniforms.push_back(inst); + if (scalarizeDescriptorLoads) + insertNewValueInInstrDeps(inst, current); return true; } return isa(value); @@ -410,13 +537,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { // See if we can propagate the search further. 
if (current->isCast() || current->isUnaryOp()) { - if (!propagate(current->getOperand(0))) + if (!propagate(current->getOperand(0), current)) return nonUniformVal; continue; } if (current->isBinaryOp()) { - if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1))) + if (!propagate(current->getOperand(0), current) || !propagate(current->getOperand(1), current)) return nonUniformVal; continue; } @@ -427,14 +554,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (as == ADDR_SPACE_FLAT || as == ADDR_SPACE_PRIVATE) return nonUniformVal; // load is a source of divergence, can't propagate - if (!propagate(ptr)) + if (!propagate(ptr, current)) return nonUniformVal; continue; } if (auto gep = dyn_cast(current)) { if (gep->hasAllConstantIndices()) { - if (!propagate(gep->getPointerOperand())) + + if (!propagate(gep->getPointerOperand(), current)) return nonUniformVal; continue; } @@ -443,33 +571,37 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (candidateIndex || gep->getNumIndices() != 1) return nonUniformVal; - if (!propagate(gep->getPointerOperand())) + if (!propagate(gep->getPointerOperand(), current)) return nonUniformVal; candidateIndex = *gep->idx_begin(); if (getSize(candidateIndex) > nonUniformValSize) return nonUniformVal; // propagating further is worthless + + if (scalarizeDescriptorLoads) + insertNewValueInInstrDeps(candidateIndex, current); + continue; } if (auto extract = dyn_cast(current)) { - if (!propagate(extract->getAggregateOperand())) + if (!propagate(extract->getAggregateOperand(), current)) return nonUniformVal; continue; } if (auto insert = dyn_cast(current)) { - if (!propagate(insert->getAggregateOperand()) || !propagate(insert->getInsertedValueOperand())) + if (!propagate(insert->getAggregateOperand(), current) || !propagate(insert->getInsertedValueOperand(), current)) return nonUniformVal; continue; } if (auto extract = dyn_cast(current)) { - if (!isa(extract->getIndexOperand()) || 
!propagate(extract->getVectorOperand())) + if (!isa(extract->getIndexOperand()) || !propagate(extract->getVectorOperand(), current)) return nonUniformVal; continue; } if (auto insert = dyn_cast(current)) { - if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0)) || - !propagate(insert->getOperand(1))) + if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0), current) || + !propagate(insert->getOperand(1), current)) return nonUniformVal; continue; } @@ -532,6 +664,12 @@ static bool instructionsEqual(Instruction *lhs, Instruction *rhs) { // Create a waterfall loop containing the specified instruction. // This does not use the current insert point; new code is inserted before and after nonUniformInst. // +// For scalarization we need to collect all the instructions that need to be moved inside the loop. This is done by +// traceNonUniformIndex() which traverses all use-def predecessors of nonUniformInst. At the same time it adds these +// instructions to instrDeps map. Once traceNonUniformIndex() completes, we use the returned value as a key to the +// instrDeps map to get the dependencies. These dependencies are the instructions that will be cloned and moved inside +// the waterfall loop. +// // @param nonUniformInst : The instruction to put in a waterfall loop // @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform // @param scalarizeDescriptorLoads : Attempt to scalarize descriptor loads @@ -548,24 +686,40 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array assert(operandIdxs.empty() == false); SmallVector nonUniformIndices; + // Maps the nonUniformIndex that is returned by traceNonUniformIndex() to the nonUniformInst. 
+ DenseMap> nonUniformIndexImageCallOperand; + TraceNonUniformIndex traceNonUniformIndex(nonUniformInst, scalarizeDescriptorLoads, 64); + for (unsigned operandIdx : operandIdxs) { - Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx)); - if (nonUniformIndex) + Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx); + Value *nonUniformIndex = traceNonUniformIndex.run(nonUniformImageCallOperand); + scalarizeDescriptorLoads = traceNonUniformIndex.foundDependencies(); + if (nonUniformIndex) { nonUniformIndices.push_back(nonUniformIndex); + + if (scalarizeDescriptorLoads) + nonUniformIndexImageCallOperand[nonUniformIndex] = std::make_pair(nonUniformImageCallOperand, operandIdx); + } } + if (nonUniformIndices.empty()) return nonUniformInst; - // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the - // waterfall loop. + // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the waterfall loop. + // At this point the nonUniformVal of nonUniformIndices might change. We also need the original non uniform values for + // the scalarization of the descriptor loads. + DenseMap newOrigNonUniformVal; for (Value *&nonUniformVal : nonUniformIndices) { if (nonUniformVal->getType()->isIntegerTy(64)) { auto sExt = dyn_cast(nonUniformVal); + Value *origNonUniformVal = nonUniformVal; // 64-bit index may already be formed from extension of 32-bit value. if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) { nonUniformVal = sExt->getOperand(0); + newOrigNonUniformVal[nonUniformVal] = origNonUniformVal; } else { nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty()); + newOrigNonUniformVal[nonUniformVal] = origNonUniformVal; } } } @@ -601,36 +755,84 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array Value *waterfallBegin; if (scalarizeDescriptorLoads) { - // Attempt to scalarize descriptor loads. 
- assert(firstIndexInst); - CallInst *firstCallInst = dyn_cast(firstIndexInst); - if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) { - // Descriptor loads are already inside a waterfall. - waterfallBegin = firstCallInst->getArgOperand(0); - } else { - // Begin waterfall loop just after shared index is computed. - // This places all dependent instructions within the waterfall loop, including descriptor loads. - auto descTy = firstIndexInst->getType(); - SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false)); - waterfallBegin = ConstantInt::get(getInt32Ty(), 0); - waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, - nullptr, instName); - - // Scalarize shared index. - Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, - {waterfallBegin, firstIndexInst}, nullptr, instName); + SetInsertPoint(nonUniformInst); + auto descTy = firstIndexInst->getType(); + // Create waterfall.begin and waterfall.readfirstlane intrinsics. + waterfallBegin = ConstantInt::get(getInt32Ty(), 0); + waterfallBegin = + CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, nullptr, instName); + + // Scalarize shared index. + Value *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, + {waterfallBegin, firstIndexInst}, nullptr, instName); + + for (auto *nonUniformVal : nonUniformIndices) { + // Get the first non uniform instruction of the chain. + auto it1 = newOrigNonUniformVal.find(nonUniformVal); + Value *origNonUniformVal = nonUniformVal; + if (it1 != newOrigNonUniformVal.end()) + origNonUniformVal = it1->second; + + auto [nonUniformImageCallOperand, operandIdx] = nonUniformIndexImageCallOperand[origNonUniformVal]; + + if (origNonUniformVal == nonUniformImageCallOperand) + continue; + + // Get the instruction chain of the first non uniform instruction. 
+ const DenseMap &instrDeps = traceNonUniformIndex.getInstrDeps(); + auto it2 = instrDeps.find(origNonUniformVal); + assert(it2 != instrDeps.end() && "The non-uniform index should be in instrDep map."); + auto &instrsToClone = it2->second; + assert(!instrsToClone.empty() && "There are not any instructions to clone."); + + // Clone and emit the instructions that we want to push inside the waterfall loop. + std::map origClonedValuesMap; + Instruction *prevInst = nonUniformInst; + const TinyInstructionSet::IndexToInstructionVec &indexToInstr = traceNonUniformIndex.getIndexToInstr(); + for (auto it3 = instrsToClone.begin(indexToInstr), ite = instrsToClone.end(indexToInstr); it3 != ite; ++it3) { + auto *origInst = *it3; + auto *newInst = origInst->clone(); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + prevInst = newInst; + // Update the non-uniform operand of the image call with the new non-uniform operand. + if (nonUniformImageCallOperand == origInst) { + if (nonUniformInst->getType()->isVoidTy()) + newInst = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, newInst->getType(), + {waterfallBegin, newInst}, nullptr, instName); + nonUniformInst->setOperand(operandIdx, newInst); + } + } + // Finally, clone the first non uniform instruction. + auto *origInst = cast(origNonUniformVal); + auto *newInst = origInst->clone(); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + + // Update the operands of the cloned instructions. + for (auto [origInst, newInst] : origClonedValuesMap) { + for (Use &use : newInst->operands()) { + Value *op = use.get(); + if (auto *opI = dyn_cast(op)) { + auto it = origClonedValuesMap.find(opI); + if (it == origClonedValuesMap.end()) + continue; + Instruction *clonedI = it->second; + use.set(clonedI); + } + } + } // Replace all references to shared index within the waterfall loop with scalarized index. // (Note: this includes the non-uniform instruction itself.) 
// Loads using scalarized index will become scalar loads. - for (Value *otherNonUniformVal : nonUniformIndices) { - otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) { - Instruction *userInst = cast(U.getUser()); - return U.getUser() != waterfallBegin && U.getUser() != desc && - userInst->getParent() == nonUniformInst->getParent() && - (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)); - }); - } + nonUniformVal->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) { + Instruction *userInst = cast(U.getUser()); + return userInst != waterfallBegin && userInst != readFirstLane && + userInst->getParent() == nonUniformInst->getParent() && + (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) && + !userInst->comesBefore(cast(waterfallBegin)); + }); } } else { // Insert new code just before nonUniformInst. diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.ll b/lgc/test/scalarizationOfDescriptorLoadsTest1.ll new file mode 100644 index 0000000000..0e14077f57 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest1.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +.entry: + %0 = call <4 x i32> (...) 
@lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison) + %.fr = freeze <4 x i32> %0 + %__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0 + %__llpc_input_proxy_4.4.vec.extract = extractelement <4 x i32> %.fr, i64 1 + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0) + %3 = mul i32 %__llpc_input_proxy_4.4.vec.extract, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) + call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = 
!{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} +; CHECK-LABEL: @lgc.shader.VS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]] +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0 +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP12]]) +; CHECK-NEXT: [[TMP17:%.*]] = 
call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP12]]) +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP20]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP21]]) +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]]) +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP23]], i32 [[TMP8]]) +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP26]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP28:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP23]], <4 x i32> [[TMP27]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP22]], <4 x i32> [[TMP28]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.ll b/lgc/test/scalarizationOfDescriptorLoadsTest2.ll new file mode 100644 index 0000000000..3e5e8edf0f --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +.entry: + %0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison) + %.fr = freeze <4 x i32> %0 + %__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0 + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0) + %3 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %8 = load <4 x i32>, ptr addrspace(4) %7, align 16, !invariant.load !16 + call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, <4 x i32> %6, i32 1) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } +attributes #4 = { nounwind } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = 
!{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} +; CHECK-LABEL: @lgc.shader.VS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]] +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float> +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP15]], i32 [[TMP8]]) +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = 
getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP15]], <4 x i32> [[TMP19]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP14]], <4 x i32> [[TMP20]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.ll b/lgc/test/scalarizationOfDescriptorLoadsTest3.ll new file mode 100644 index 0000000000..40ef6af7bc --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +declare <4 x i32> @foo1(<4 x i32> %V) + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !22 !lgc.shaderstage !23 { +.entry: + %0 = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() + %1 = load i32, ptr addrspace(4) %0, align 4 + %2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 1, i32 12) + %3 = load <4 x i32>, ptr addrspace(4) %2, align 16, !invariant.load !24 + %4 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, <4 x i32> %3, i32 %1) + %5 = extractelement <4 x i32> %4, i64 0 + %6 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 6) + %7 = call i32 (...) 
@lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 6) + %8 = mul i32 %5, %7 + %9 = sext i32 %8 to i64 + %10 = getelementptr i8, ptr addrspace(4) %6, i64 %9 + %11 = load <8 x i32>, ptr addrspace(4) %10, align 32, !invariant.load !24 + %12 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %11, <4 x i32> , i32 1, <2 x float> zeroinitializer) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare ptr addrspace(4) @lgc.create.load.push.constants.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.image.load.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!lgc.vertex.inputs = !{!16, !17, !18} +!lgc.color.export.formats = !{!19} +!lgc.rasterizer.state = !{!20} +!amdgpu.pal.metadata.msgpack = !{!21} + +!0 = !{!"Vulkan"} +!1 = !{i32 1397006593, i32 1762399868, i32 679484448, i32 1745956893, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 272, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1156202838, i32 -1602642692, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1603553139, i32 446675175, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 0, i32 1, i32 4} +!5 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 11, i32 1, i32 3} +!6 = !{!"DescriptorBufferCompact", i32 10, i32 66, i32 0, i32 2, i64 93, i32 17, i32 2} +!7 = !{!"DescriptorBuffer", i32 6, i32 66, i32 2, i32 4, i64 93, i32 0, i32 4} +!8 = !{!"DescriptorBuffer", i32 6, i32 66, i32 6, i32 4, i64 93, i32 1, i32 4} +!9 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 3, i32 1, i32 0} +!10 = !{!"PushConst", i32 9, i32 66, i32 7, i32 2, i64 4294967295, i32 0, i32 4} +!11 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 9, i32 1, i32 2} +!12 = !{!"DescriptorSampler", i32 2, i32 66, i32 0, i32 4, i64 0, i32 5, i32 4} +!13 = !{!"DescriptorResource", i32 1, i32 66, i32 4, i32 32768, i64 0, i32 
6, i32 8} +!14 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 10, i32 1, i32 1} +!15 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 4, i64 1, i32 12, i32 4} +!16 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!17 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!18 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!19 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!20 = !{i32 0, i32 0, i32 0, i32 1} +!21 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9BN7\81A[\8A\DB\CF\9Daz\E2A\8F\88\AD\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!22 = !{i32 4} +!23 = !{i32 6} +!24 = !{} +; CHECK-LABEL: @lgc.shader.FS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 28) +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @lgc.load.user.data.i32(i32 40) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !24 +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> [[TMP11]], i32 [[TMP5]], i32 0, i32 0, i32 0), !invariant.load !24 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @lgc.load.user.data.i32(i32 36) +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP1]], 
i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 16 +; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP13]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP21]], align 32, !invariant.load !24 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP19]]) +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP23]], <4 x i32> ) +; CHECK-NEXT: [[TMP25:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP24]], <8 x i32> [[TMP22]]) +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP24]], <4 x i32> ) +; CHECK-NEXT: [[TMP27:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP25]], <4 x i32> [[TMP26]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP28:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP24]], <4 x float> [[TMP27]]) +; CHECK-NEXT: ret void +; diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.ll b/lgc/test/scalarizationOfDescriptorLoadsTest4.ll new file mode 100644 index 0000000000..dc03dd036d --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +declare <4 x i32> @foo1(<4 x i32> %V) + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !22 !lgc.shaderstage !23 { +.entry: + %0 = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() + %1 = load i32, ptr addrspace(4) %0, align 4 + %2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 1, i32 12) + %3 = load <4 x i32>, ptr addrspace(4) %2, align 16, !invariant.load !24 + %4 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, <4 x i32> %3, i32 %1) + %5 = extractelement <4 x i32> %4, i64 0 + %6 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 6) + %7 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 6) + %8 = mul i32 %5, %7 + %9 = sext i32 %8 to i64 + %10 = getelementptr i8, ptr addrspace(4) %6, i64 %9 + %11 = load <8 x i32>, ptr addrspace(4) %10, align 32, !invariant.load !24 + %12 = call <4 x i32> @foo1(<4 x i32> %4) + %13 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %11, <4 x i32> %12, i32 1, <2 x float> zeroinitializer) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare ptr addrspace(4) @lgc.create.load.push.constants.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.image.load.v4i32(...) 
local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!lgc.vertex.inputs = !{!16, !17, !18} +!lgc.color.export.formats = !{!19} +!lgc.rasterizer.state = !{!20} +!amdgpu.pal.metadata.msgpack = !{!21} + +!0 = !{!"Vulkan"} +!1 = !{i32 1397006593, i32 1762399868, i32 679484448, i32 1745956893, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 272, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1156202838, i32 -1602642692, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1603553139, i32 446675175, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 0, i32 1, i32 4} +!5 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 11, i32 1, i32 3} +!6 = !{!"DescriptorBufferCompact", i32 10, i32 66, i32 0, i32 2, i64 93, i32 17, i32 2} +!7 = !{!"DescriptorBuffer", i32 6, i32 66, i32 2, i32 4, i64 93, i32 0, i32 4} +!8 = !{!"DescriptorBuffer", i32 6, i32 66, i32 6, i32 4, i64 93, i32 1, i32 4} +!9 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 3, i32 1, i32 0} +!10 = !{!"PushConst", i32 9, i32 66, i32 7, i32 2, i64 4294967295, i32 0, i32 4} +!11 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 9, i32 1, i32 2} +!12 = 
!{!"DescriptorSampler", i32 2, i32 66, i32 0, i32 4, i64 0, i32 5, i32 4} +!13 = !{!"DescriptorResource", i32 1, i32 66, i32 4, i32 32768, i64 0, i32 6, i32 8} +!14 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 10, i32 1, i32 1} +!15 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 4, i64 1, i32 12, i32 4} +!16 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!17 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!18 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!19 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!20 = !{i32 0, i32 0, i32 0, i32 1} +!21 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9BN7\81A[\8A\DB\CF\9Daz\E2A\8F\88\AD\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!22 = !{i32 4} +!23 = !{i32 6} +!24 = !{} +; CHECK-LABEL: @lgc.shader.FS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 28) +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @lgc.load.user.data.i32(i32 40) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !24 +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> [[TMP11]], i32 [[TMP5]], i32 0, i32 0, i32 0), !invariant.load !24 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> 
[[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @lgc.load.user.data.i32(i32 36) +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 16 +; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP13]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP21]], align 32, !invariant.load !24 +; CHECK-NEXT: [[TMP23:%.*]] = call <4 x i32> @foo1(<4 x i32> [[TMP12]]) +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP19]]) +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP24]], <4 x i32> [[TMP23]]) +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP25]], <8 x i32> [[TMP22]]) +; CHECK-NEXT: [[TMP27:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP25]], <4 x i32> [[TMP23]]) +; CHECK-NEXT: [[TMP28:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP26]], <4 x i32> [[TMP27]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP29:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP25]], <4 x float> [[TMP28]]) +; CHECK-NEXT: ret void +; diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.ll b/lgc/test/scalarizationOfDescriptorLoadsTest5.ll new file mode 100644 index 0000000000..15a37bd389 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: 
--tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +entry: + %0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison) + %.fr = freeze <4 x i32> %0 + %__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0 + %__llpc_input_proxy_4.4.vec.extract = extractelement <4 x i32> %.fr, i64 1 + %.not = icmp eq i32 %__llpc_input_proxy_4.4.vec.extract, 0 + br i1 %.not, label %ret, label %bb + +bb: ; preds = %entry + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0) + %3 = mul i32 %__llpc_input_proxy_4.4.vec.extract, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) + call void (...) 
@lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + br label %ret + +ret: ; preds = %bb, %entry + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = 
!{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} +; CHECK-LABEL: @lgc.shader.VS.main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]] +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0 +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 1 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label [[RET:%.*]], label [[BB:%.*]] +; CHECK: bb: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT]], 16 +; 
CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load !16 +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP12]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP16]], <4 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP17]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP18]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]]) +; CHECK-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP20]], <4 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP22:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP20]], <4 x i32> [[TMP21]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP19]], <4 x i32> [[TMP22]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: br label [[RET]] +; CHECK: ret: +; CHECK-NEXT: ret void +; diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest6.ll b/lgc/test/scalarizationOfDescriptorLoadsTest6.ll new file mode 100644 index 0000000000..5bc5b3141e --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest6.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by 
tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !22 !lgc.shaderstage !23 { +.entry: + %0 = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() + %1 = call i32 (...) @lgc.create.read.generic.input.i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %2 = load i32, ptr addrspace(4) %0, align 4 + br label %3 + +3: ; preds = %7, %.entry + %.010 = phi <4 x float> [ zeroinitializer, %.entry ], [ %30, %7 ] + %.09 = phi <4 x float> [ zeroinitializer, %.entry ], [ %28, %7 ] + %.0 = phi i32 [ %2, %.entry ], [ %31, %7 ] + %4 = getelementptr inbounds <{ i32, i32 }>, ptr addrspace(4) %0, i64 0, i32 1 + %5 = load i32, ptr addrspace(4) %4, align 4 + %6 = icmp slt i32 %.0, %5 + %cond.freeze = freeze i1 %6 + br i1 %cond.freeze, label %7, label %32 + +7: ; preds = %3 + %8 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 1, i32 12) + %9 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 1, i32 12) + %10 = load <4 x i32>, ptr addrspace(4) %8, align 16, !invariant.load !24 + %11 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, <4 x i32> %10, i32 %.0) + %12 = extractelement <4 x i32> %11, i64 0 + %13 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 6) + %14 = call i32 (...) 
@lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 6) + %15 = mul i32 %12, %14 + %16 = sext i32 %15 to i64 + %17 = getelementptr i8, ptr addrspace(4) %13, i64 %16 + %18 = load <8 x i32>, ptr addrspace(4) %17, align 32, !invariant.load !24 + %19 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 5) + %20 = load <4 x i32>, ptr addrspace(4) %19, align 16, !invariant.load !24 + %21 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %22 = mul i32 %1, %14 + %23 = sext i32 %22 to i64 + %24 = getelementptr i8, ptr addrspace(4) %13, i64 %23 + %25 = load <8 x i32>, ptr addrspace(4) %24, align 32, !invariant.load !24 + %26 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %25, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %27 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %28 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.09, %27 + %29 = fadd reassoc nnan nsz arcp contract afn <4 x float> %21, %26 + %30 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.010, %29 + %31 = add i32 %.0, 1 + br label %3, !llvm.loop !25 + +32: ; preds = %3 + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare ptr addrspace(4) @lgc.create.load.push.constants.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input.i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride.i32(...) 
local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.image.load.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!lgc.vertex.inputs = !{!16, !17, !18} +!lgc.color.export.formats = !{!19} +!lgc.rasterizer.state = !{!20} +!amdgpu.pal.metadata.msgpack = !{!21} + +!0 = !{!"Vulkan"} +!1 = !{i32 1397006593, i32 1762399868, i32 679484448, i32 1745956893, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 272, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1156202838, i32 -1602642692, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1603553139, i32 446675175, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 0, i32 1, i32 4} +!5 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 11, i32 1, i32 3} +!6 = !{!"DescriptorBufferCompact", i32 10, i32 66, i32 0, i32 2, i64 93, i32 17, i32 2} +!7 = !{!"DescriptorBuffer", i32 6, i32 66, i32 2, i32 4, i64 93, i32 0, i32 4} +!8 = !{!"DescriptorBuffer", i32 6, i32 66, i32 6, i32 4, i64 93, i32 1, i32 4} +!9 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 3, i32 1, i32 0} +!10 = !{!"PushConst", i32 9, i32 66, i32 7, i32 
2, i64 4294967295, i32 0, i32 4} +!11 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 9, i32 1, i32 2} +!12 = !{!"DescriptorSampler", i32 2, i32 66, i32 0, i32 4, i64 0, i32 5, i32 4} +!13 = !{!"DescriptorResource", i32 1, i32 66, i32 4, i32 32768, i64 0, i32 6, i32 8} +!14 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 10, i32 1, i32 1} +!15 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 4, i64 1, i32 12, i32 4} +!16 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!17 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!18 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!19 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!20 = !{i32 0, i32 0, i32 0, i32 1} +!21 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9BN7\81A[\8A\DB\CF\9Daz\E2A\8F\88\AD\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!22 = !{i32 4} +!23 = !{i32 6} +!24 = !{} +!25 = distinct !{!25} +; CHECK-LABEL: @lgc.shader.FS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 28) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 (...) 
@lgc.input.import.interpolated.i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP6]], align 4 +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[DOT010:%.*]] = phi <4 x float> [ zeroinitializer, [[DOTENTRY:%.*]] ], [ [[TMP58:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[DOT09:%.*]] = phi <4 x float> [ zeroinitializer, [[DOTENTRY]] ], [ [[TMP56:%.*]], [[TMP13]] ] +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP8]], [[DOTENTRY]] ], [ [[TMP59:%.*]], [[TMP13]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <{ i32, i32 }>, ptr addrspace(4) [[TMP6]], i64 0, i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[DOT0]], [[TMP11]] +; CHECK-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: br i1 [[COND_FREEZE]], label [[TMP13]], label [[TMP60:%.*]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @lgc.load.user.data.i32(i32 40) +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load !24 +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> [[TMP19]], i32 [[DOT0]], i32 0, i32 0, i32 0), !invariant.load !24 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @lgc.load.user.data.i32(i32 36) +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i32> [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP26:%.*]] 
= getelementptr i8, ptr addrspace(4) [[TMP25]], i32 16 +; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP21]], 32 +; CHECK-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 32, !invariant.load !24 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @lgc.load.user.data.i32(i32 36) +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP31]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i32> [[TMP32]] to i64 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16, !invariant.load !24 +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) +; CHECK-NEXT: [[TMP38:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP37]], <8 x i32> [[TMP30]]) +; CHECK-NEXT: [[TMP39:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP37]], <4 x i32> [[TMP36]]) +; CHECK-NEXT: [[TMP40:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP38]], <4 x i32> [[TMP39]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP41:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP37]], <4 x float> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP7]], 32 +; CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP42]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP44]], align 32, !invariant.load !24 +; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP42]]) +; 
CHECK-NEXT: [[TMP47:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP46]], <8 x i32> [[TMP45]]) +; CHECK-NEXT: [[TMP48:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP46]], <4 x i32> [[TMP36]]) +; CHECK-NEXT: [[TMP49:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP47]], <4 x i32> [[TMP48]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP50:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP46]], <4 x float> [[TMP49]]) +; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) +; CHECK-NEXT: [[TMP52:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP51]], <8 x i32> [[TMP30]]) +; CHECK-NEXT: [[TMP53:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP51]], <4 x i32> [[TMP36]]) +; CHECK-NEXT: [[TMP54:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP52]], <4 x i32> [[TMP53]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP55:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP51]], <4 x float> [[TMP54]]) +; CHECK-NEXT: [[TMP56]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT09]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[TMP41]], [[TMP50]] +; CHECK-NEXT: [[TMP58]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT010]], [[TMP57]] +; CHECK-NEXT: [[TMP59]] = add i32 [[DOT0]], 1 +; CHECK-NEXT: br label [[TMP9]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: 60: +; CHECK-NEXT: ret void +; diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index e71c97e687..c9fb480c0d 100644 --- 
a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -618,10 +618,9 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) { shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads; } else { - shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads; - // Enable waterfall load scalarization when vgpr limit is set. - if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..74975a3767 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,24 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST 
+// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..ff090feb37 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,56 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline 
patching results +// SHADERTEST-GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// 
SHADERTEST-GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: 
%[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..8e1893653d 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,139 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip 
%s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <8 x 
i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[readfirstlane3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and3:[0-9]+]] = and i32 
%[[extract3]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp3:[0-9]+]] = icmp slt i32 %[[extract3]], 0 +// SHADERTEST-GFX-NEXT: %[[select3:[0-9]+]] = select i1 %[[cmp3]], i32 %[[extract3]], i32 %[[and3]] +// SHADERTEST-GFX-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[readfirstlane3]], i32 %[[select3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[readfirstlane3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 
%[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 
0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 
x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// SHADERTEST-GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// SHADERTEST-GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> 
%[[load1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..132f84103f 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,96 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr 
addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// 
SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 
+// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) 
%[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 3691e649f7..143280fc83 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if (shaderInfo->options.scalarizeWaterfallLoads.has_value()) + dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";