diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index 8365ffa169..db665dd386 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -246,6 +246,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -888,7 +889,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
 
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;
diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
index ec99c51e21..f3501d6c44 100644
--- a/lgc/builder/BuilderImpl.cpp
+++ b/lgc/builder/BuilderImpl.cpp
@@ -33,6 +33,7 @@
 #include "lgc/LgcDialect.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
@@ -329,6 +330,117 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 }
 
 #if defined(LLVM_HAVE_BRANCH_AMD_GFX)
+// A simple, memory-efficient container that holds up to 64 instructions in a bit vector. It needs two helper data
+// structures: 1. instrToIndex, which maps an instruction to its index in the bit vector, and 2. indexToInstr, which
+// maps an index back to an instruction.
+class TinyInstructionSet {
+public:
+  using IndexToInstructionVec = SmallVector<Instruction *>;
+  using InstrToIndexMap = DenseMap<Instruction *, unsigned>;
+
+private:
+  BitVector bits;
+
+public:
+  TinyInstructionSet(unsigned size) { bits.resize(size); }
+
+  class iterator {
+    BitVector::const_set_bits_iterator it;
+    const SmallVector<Instruction *> &indexToInstr;
+
+  public:
+    iterator(BitVector::const_set_bits_iterator it, const IndexToInstructionVec &indexToInstr)
+        : it(it), indexToInstr(indexToInstr) {}
+    iterator &operator++() {
+      ++it;
+      return *this;
+    }
+
+    Instruction *operator*() {
+      unsigned index = *it;
+      assert(index < indexToInstr.size() && "Index out of range.");
+      return indexToInstr[index];
+    }
+
+    bool operator!=(const iterator &otherIt) {
+      assert(&otherIt.indexToInstr == &indexToInstr && "Iterators of different objects.");
+      return otherIt.it != it;
+    }
+  };
+
+  iterator begin(const IndexToInstructionVec &indexToInstr) const {
+    return iterator(bits.set_bits_begin(), indexToInstr);
+  }
+
+  iterator end(const IndexToInstructionVec &indexToInstr) const { return iterator(bits.set_bits_end(), indexToInstr); }
+
+  void insert(Instruction *instr, const InstrToIndexMap &instrToIndex) {
+    auto it = instrToIndex.find(instr);
+    assert(it != instrToIndex.end() && "Expected to find instr in instrToIndex.");
+    unsigned index = it->second;
+    bits.set(index);
+  }
+
+  unsigned size() const { return bits.size(); }
+
+  bool empty() const { return bits.empty(); }
+};
+
+class TraceNonUniformIndex {
+  // Maps the instruction to its index in the bit vector.
+  TinyInstructionSet::InstrToIndexMap instrToIndex;
+  // The instructions used as keys in instrToIndex, in program order. It is used to map an index back to an instruction.
+  TinyInstructionSet::IndexToInstructionVec indexToInstr;
+  // Maps an instruction to its dependencies.
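+  // For illustration only (hypothetical IR names, not from this patch): if the descriptor operand is produced by
+  //   %gep  = getelementptr i8, ptr %base, i32 %off
+  //   %desc = load <8 x i32>, ptr %gep
+  // then after tracing %desc back to the non-uniform index %off, instrDeps[%off] holds {%gep, %desc}, i.e. the
+  // instructions that have to be cloned into the waterfall loop.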
+  DenseMap<Value *, TinyInstructionSet> instrDeps;
+  bool scalarizeDescriptorLoads;
+  unsigned upperLimit;
+  void insertNewValueInInstrDeps(Value *, Instruction *);
+
+public:
+  TraceNonUniformIndex(Instruction *nonUniformInst, bool scalarizeDescriptorLoads = false, unsigned upperLimit = 64)
+      : scalarizeDescriptorLoads(scalarizeDescriptorLoads), upperLimit(upperLimit) {
+    // Initialization of instrToIndex and indexToInstr.
+    if (scalarizeDescriptorLoads) {
+      unsigned cnt = 0;
+      for (Instruction *I = nonUniformInst->getPrevNode(); I != nullptr && cnt < upperLimit;
+           I = I->getPrevNode(), ++cnt) {
+        indexToInstr.push_back(I);
+        instrToIndex[I] = cnt;
+      }
+    }
+  }
+
+  Value *run(Value *);
+
+  const DenseMap<Value *, TinyInstructionSet> &getInstrDeps() const { return instrDeps; }
+
+  const TinyInstructionSet::IndexToInstructionVec &getIndexToInstr() const { return indexToInstr; }
+
+  bool foundDependencies() const { return scalarizeDescriptorLoads; }
+};
+
+// Adds newValue to the instrDeps map. The dependencies of newValue are currentVisitedInstr and its dependencies.
+// @param newValue : the new value to be added to the instrDeps map
+// @param currentVisitedInstr : the instruction from which we copy the dependencies for newValue
+void TraceNonUniformIndex::insertNewValueInInstrDeps(Value *newValue, Instruction *currentVisitedInstr) {
+  if (!instrToIndex.contains(currentVisitedInstr)) {
+    // The instruction is either outside the 64-instruction limit or in a different basic block, so we bail out of
+    // scalarization.
+    scalarizeDescriptorLoads = false;
+    return;
+  }
+  assert(instrDeps.contains(currentVisitedInstr) && "The current visited instruction should have been in the map.");
+  auto it1 = instrDeps.try_emplace(newValue, upperLimit).first;
+  auto &setOfInstrs = it1->second;
+  auto it2 = instrDeps.find(currentVisitedInstr);
+  const auto &set = it2->second;
+  for (auto it3 = set.begin(indexToInstr), ite = set.end(indexToInstr); it3 != ite; ++it3) {
+    auto *instr = *it3;
+    setOfInstrs.insert(instr, instrToIndex);
+  }
+  setOfInstrs.insert(currentVisitedInstr, instrToIndex);
+}
+
 // =====================================================================================================================
 // For a non-uniform input, try and trace back through a descriptor load to
 // find the non-uniform index used in it. If that fails, we just use the
@@ -339,11 +451,15 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 // This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
 // the common case where a base pointer is assembled from separate high and low halves.
 //
+// In case of scalarization, it fills the instrDeps map by using insertNewValueInInstrDeps().
+//
 // @param nonUniformVal : Value representing non-uniform descriptor
 // @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform
-static Value *traceNonUniformIndex(Value *nonUniformVal) {
+Value *TraceNonUniformIndex::run(Value *nonUniformVal) {
   auto load = dyn_cast<LoadInst>(nonUniformVal);
-  if (!load) {
+  if (scalarizeDescriptorLoads && load) {
+    instrDeps.try_emplace(load, upperLimit);
+  } else if (!load) {
     // Workarounds that modify image descriptor can be peeped through, i.e.
     // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16
     // %rawElement = extractelement <8 x i32> %baseValue, i64 6
@@ -353,6 +469,9 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     if (!insert)
       return nonUniformVal;
 
+    if (scalarizeDescriptorLoads)
+      instrDeps.try_emplace(insert, upperLimit);
+
     load = dyn_cast<LoadInst>(insert->getOperand(0));
     if (!load)
       return nonUniformVal;
@@ -360,9 +479,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // We found the load, but must verify the chain.
     // Consider updatedElement as a generic instruction or constant.
     if (auto updatedElement = dyn_cast<Instruction>(insert->getOperand(1))) {
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(updatedElement, insert);
       for (Value *operand : updatedElement->operands()) {
         if (auto extract = dyn_cast<ExtractElementInst>(operand)) {
           // Only dynamic value must be ExtractElementInst based on load.
+          if (scalarizeDescriptorLoads) {
+            insertNewValueInInstrDeps(extract, updatedElement);
+            insertNewValueInInstrDeps(load, extract);
+          }
           if (dyn_cast<LoadInst>(extract->getOperand(0)) != load)
             return nonUniformVal;
         } else if (!isa<Constant>(operand)) {
@@ -386,11 +511,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
   SmallVector<Instruction *, 2> nonUniforms;
   nonUniforms.push_back(load);
 
-  auto propagate = [&](Value *value) -> bool {
+  auto propagate = [&](Value *value, Instruction *current) {
     if (auto inst = dyn_cast<Instruction>(value)) {
       if (nonUniforms.size() >= 2)
         return false;
       nonUniforms.push_back(inst);
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(inst, current);
       return true;
     }
     return isa<Constant>(value);
@@ -410,13 +537,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
     // See if we can propagate the search further.
     if (current->isCast() || current->isUnaryOp()) {
-      if (!propagate(current->getOperand(0)))
+      if (!propagate(current->getOperand(0), current))
         return nonUniformVal;
       continue;
     }
 
     if (current->isBinaryOp()) {
-      if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1)))
+      if (!propagate(current->getOperand(0), current) || !propagate(current->getOperand(1), current))
         return nonUniformVal;
       continue;
     }
@@ -427,14 +554,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
       if (as == ADDR_SPACE_FLAT || as == ADDR_SPACE_PRIVATE)
         return nonUniformVal; // load is a source of divergence, can't propagate
-      if (!propagate(ptr))
+      if (!propagate(ptr, current))
         return nonUniformVal;
       continue;
     }
 
     if (auto gep = dyn_cast<GetElementPtrInst>(current)) {
       if (gep->hasAllConstantIndices()) {
-        if (!propagate(gep->getPointerOperand()))
+
+        if (!propagate(gep->getPointerOperand(), current))
           return nonUniformVal;
         continue;
       }
@@ -443,33 +571,37 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) {
       if (candidateIndex || gep->getNumIndices() != 1)
         return nonUniformVal;
 
-      if (!propagate(gep->getPointerOperand()))
+      if (!propagate(gep->getPointerOperand(), current))
        return nonUniformVal;
 
       candidateIndex = *gep->idx_begin();
       if (getSize(candidateIndex) > nonUniformValSize)
         return nonUniformVal; // propagating further is worthless
+
+      if (scalarizeDescriptorLoads)
+        insertNewValueInInstrDeps(candidateIndex, current);
+
       continue;
     }
 
     if (auto extract = dyn_cast<ExtractValueInst>(current)) {
-      if (!propagate(extract->getAggregateOperand()))
+      if (!propagate(extract->getAggregateOperand(), current))
        return nonUniformVal;
       continue;
     }
 
     if (auto insert = dyn_cast<InsertValueInst>(current)) {
-      if (!propagate(insert->getAggregateOperand()) || !propagate(insert->getInsertedValueOperand()))
+      if (!propagate(insert->getAggregateOperand(), current) ||
+          !propagate(insert->getInsertedValueOperand(), current))
         return nonUniformVal;
       continue;
     }
 
     if (auto extract = dyn_cast<ExtractElementInst>(current)) {
-      if (!isa<Constant>(extract->getIndexOperand()) || !propagate(extract->getVectorOperand()))
+      if (!isa<Constant>(extract->getIndexOperand()) || !propagate(extract->getVectorOperand(), current))
         return nonUniformVal;
       continue;
     }
 
     if (auto insert = dyn_cast<InsertElementInst>(current)) {
-      if (!isa<Constant>(insert->getOperand(2)) || !propagate(insert->getOperand(0)) ||
-          !propagate(insert->getOperand(1)))
+      if (!isa<Constant>(insert->getOperand(2)) || !propagate(insert->getOperand(0), current) ||
+          !propagate(insert->getOperand(1), current))
         return nonUniformVal;
       continue;
     }
@@ -532,6 +664,12 @@ static bool instructionsEqual(Instruction *lhs, Instruction *rhs) {
 // Create a waterfall loop containing the specified instruction.
 // This does not use the current insert point; new code is inserted before and after nonUniformInst.
 //
+// For scalarization we need to collect all the instructions that need to be moved inside the loop. This is done by
+// traceNonUniformIndex(), which traverses the use-def predecessors of nonUniformInst and at the same time adds these
+// instructions to the instrDeps map. Once traceNonUniformIndex() completes, we use the returned value as a key into
+// the instrDeps map to get the dependencies. These dependencies are the instructions that will be cloned and moved
+// inside the waterfall loop.
+//
 // @param nonUniformInst : The instruction to put in a waterfall loop
 // @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform
 // @param scalarizeDescriptorLoads : Attempt to scalarize descriptor loads
@@ -548,24 +686,40 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   assert(operandIdxs.empty() == false);
 
   SmallVector<Value *, 2> nonUniformIndices;
+  // Maps the nonUniformIndex returned by traceNonUniformIndex() to the image call operand it was traced from and
+  // that operand's index.
+  DenseMap<Value *, std::pair<Value *, unsigned>> nonUniformIndexImageCallOperand;
+  TraceNonUniformIndex traceNonUniformIndex(nonUniformInst, scalarizeDescriptorLoads, 64);
+
   for (unsigned operandIdx : operandIdxs) {
-    Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx));
-    if (nonUniformIndex)
+    Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx);
+    Value *nonUniformIndex = traceNonUniformIndex.run(nonUniformImageCallOperand);
+    scalarizeDescriptorLoads = traceNonUniformIndex.foundDependencies();
+    if (nonUniformIndex) {
       nonUniformIndices.push_back(nonUniformIndex);
+
+      if (scalarizeDescriptorLoads)
+        nonUniformIndexImageCallOperand[nonUniformIndex] = std::make_pair(nonUniformImageCallOperand, operandIdx);
+    }
   }
+
   if (nonUniformIndices.empty())
     return nonUniformInst;
 
-  // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the
-  // waterfall loop.
+  // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the waterfall loop.
+  // At this point the entries of nonUniformIndices may be replaced, but we still need the original non-uniform values
+  // for the scalarization of the descriptor loads.
+  DenseMap<Value *, Value *> newOrigNonUniformVal;
   for (Value *&nonUniformVal : nonUniformIndices) {
     if (nonUniformVal->getType()->isIntegerTy(64)) {
       auto sExt = dyn_cast<SExtInst>(nonUniformVal);
+      Value *origNonUniformVal = nonUniformVal;
       // 64-bit index may already be formed from extension of 32-bit value.
       if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) {
         nonUniformVal = sExt->getOperand(0);
+        newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
       } else {
         nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty());
+        newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
       }
     }
   }
@@ -601,36 +755,84 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
   Value *waterfallBegin;
   if (scalarizeDescriptorLoads) {
-    // Attempt to scalarize descriptor loads.
-    assert(firstIndexInst);
-    CallInst *firstCallInst = dyn_cast<CallInst>(firstIndexInst);
-    if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) {
-      // Descriptor loads are already inside a waterfall.
-      waterfallBegin = firstCallInst->getArgOperand(0);
-    } else {
-      // Begin waterfall loop just after shared index is computed.
-      // This places all dependent instructions within the waterfall loop, including descriptor loads.
-      auto descTy = firstIndexInst->getType();
-      SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false));
-      waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
-      waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst},
-                                       nullptr, instName);
-
-      // Scalarize shared index.
-      Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
-                                    {waterfallBegin, firstIndexInst}, nullptr, instName);
+    SetInsertPoint(nonUniformInst);
+    auto descTy = firstIndexInst->getType();
+    // Create waterfall.begin and waterfall.readfirstlane intrinsics.
+    waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
+    waterfallBegin =
+        CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+    // Scalarize shared index.
+    Value *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
+                                           {waterfallBegin, firstIndexInst}, nullptr, instName);
+
+    for (auto *nonUniformVal : nonUniformIndices) {
+      // Get the first non-uniform instruction of the chain.
+      auto it1 = newOrigNonUniformVal.find(nonUniformVal);
+      Value *origNonUniformVal = nonUniformVal;
+      if (it1 != newOrigNonUniformVal.end())
+        origNonUniformVal = it1->second;
+
+      auto [nonUniformImageCallOperand, operandIdx] = nonUniformIndexImageCallOperand[origNonUniformVal];
+
+      if (origNonUniformVal == nonUniformImageCallOperand)
+        continue;
+
+      // Get the instruction chain of the first non-uniform instruction.
+      const DenseMap<Value *, TinyInstructionSet> &instrDeps = traceNonUniformIndex.getInstrDeps();
+      auto it2 = instrDeps.find(origNonUniformVal);
+      assert(it2 != instrDeps.end() && "The non-uniform index should be in instrDep map.");
+      auto &instrsToClone = it2->second;
+      assert(!instrsToClone.empty() && "There are not any instructions to clone.");
+
+      // Clone and emit the instructions that we want to push inside the waterfall loop.
+      std::map<Instruction *, Instruction *> origClonedValuesMap;
+      Instruction *prevInst = nonUniformInst;
+      const TinyInstructionSet::IndexToInstructionVec &indexToInstr = traceNonUniformIndex.getIndexToInstr();
+      for (auto it3 = instrsToClone.begin(indexToInstr), ite = instrsToClone.end(indexToInstr); it3 != ite; ++it3) {
+        auto *origInst = *it3;
+        auto *newInst = origInst->clone();
+        newInst->insertBefore(prevInst);
+        origClonedValuesMap[origInst] = newInst;
+        prevInst = newInst;
+        // Update the non-uniform operand of the image call with the new non-uniform operand.
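+        // When nonUniformInst has a void result (e.g. an image store), there is no return value to route through
+        // waterfall.end, so the cloned operand is wrapped in waterfall.last.use instead to tie it to this loop.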
+        if (nonUniformImageCallOperand == origInst) {
+          if (nonUniformInst->getType()->isVoidTy())
+            newInst = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, newInst->getType(),
+                                      {waterfallBegin, newInst}, nullptr, instName);
+          nonUniformInst->setOperand(operandIdx, newInst);
+        }
+      }
+      // Finally, clone the first non-uniform instruction.
+      auto *origInst = cast<Instruction>(origNonUniformVal);
+      auto *newInst = origInst->clone();
+      newInst->insertBefore(prevInst);
+      origClonedValuesMap[origInst] = newInst;
+
+      // Update the operands of the cloned instructions.
+      for (auto [origInst, newInst] : origClonedValuesMap) {
+        for (Use &use : newInst->operands()) {
+          Value *op = use.get();
+          if (auto *opI = dyn_cast<Instruction>(op)) {
+            auto it = origClonedValuesMap.find(opI);
+            if (it == origClonedValuesMap.end())
+              continue;
+            Instruction *clonedI = it->second;
+            use.set(clonedI);
+          }
+        }
+      }
 
       // Replace all references to shared index within the waterfall loop with scalarized index.
       // (Note: this includes the non-uniform instruction itself.)
       // Loads using scalarized index will become scalar loads.
-      for (Value *otherNonUniformVal : nonUniformIndices) {
-        otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) {
-          Instruction *userInst = cast<Instruction>(U.getUser());
-          return U.getUser() != waterfallBegin && U.getUser() != desc &&
-                 userInst->getParent() == nonUniformInst->getParent() &&
-                 (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst));
-        });
-      }
+      nonUniformVal->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) {
+        Instruction *userInst = cast<Instruction>(U.getUser());
+        return userInst != waterfallBegin && userInst != readFirstLane &&
+               userInst->getParent() == nonUniformInst->getParent() &&
+               (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) &&
+               !userInst->comesBefore(cast<Instruction>(waterfallBegin));
+      });
     }
   } else {
     // Insert new code just before nonUniformInst.
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index e71c97e687..c9fb480c0d 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -618,10 +618,9 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh
   if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) {
     shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads;
   } else {
-    shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads;
-    // Enable waterfall load scalarization when vgpr limit is set.
- if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..74975a3767 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,24 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> 
%[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..ff090feb37 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,56 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) 
+// SHADERTEST-GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag 
b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..8e1893653d 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,139 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 
268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[readfirstlane3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and3:[0-9]+]] = and i32 %[[extract3]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp3:[0-9]+]] = icmp slt i32 %[[extract3]], 0 +// SHADERTEST-GFX-NEXT: %[[select3:[0-9]+]] = select i1 %[[cmp3]], i32 %[[extract3]], i32 %[[and3]] +// SHADERTEST-GFX-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[readfirstlane3]], i32 %[[select3]], i64 3 +// 
SHADERTEST-GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[readfirstlane3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX_10_3_0-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x 
float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// SHADERTEST-GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// SHADERTEST-GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// SHADERTEST-GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..132f84103f 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single 
waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,96 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX: 
%[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr 
addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: 
%[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 3691e649f7..143280fc83 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if (shaderInfo->options.scalarizeWaterfallLoads.has_value()) + dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n"; dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";
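Illustrative note (not part of the diff): scalarizeWaterfallLoads is now tri-state on the Vkgc interface. When the
-scalarize-waterfall-descriptor-loads command-line option is not given, the resolution in llpcPipelineContext.cpp
above behaves roughly like the sketch below; the helper name resolveScalarizeWaterfallLoads is hypothetical.

  bool resolveScalarizeWaterfallLoads(const Vkgc::PipelineShaderOptions &opts) {
    bool enable = true;                       // new default: scalarize waterfall descriptor loads
    if (opts.scalarizeWaterfallLoads.has_value())
      enable = *opts.scalarizeWaterfallLoads; // an explicit client setting overrides the default
    return enable;
  }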